diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh index 3b77060bd..cd67ee83a 100755 --- a/data_lang/htm8-test.sh +++ b/data_lang/htm8-test.sh @@ -133,7 +133,7 @@ test-site() { popd } -readonly SOIL_ID=8917 +readonly SOIL_ID=8924 readonly WWZ_DIR=_tmp/$SOIL_ID sync-wwz() { diff --git a/lazylex/html.py b/lazylex/html.py index 631a73926..6a5341eea 100755 --- a/lazylex/html.py +++ b/lazylex/html.py @@ -190,7 +190,15 @@ def MakeLexer(rules): _NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*' # must start with letter -LEXER = [ +CHAR_LEX = [ + # Characters + # https://www.w3.org/TR/xml/#sec-references + (r'&\# [0-9]+ ;', Tok.DecChar), + (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar), + (r'& %s ;' % _NAME, Tok.CharEntity), +] + +LEXER = CHAR_LEX + [ (r'