From adf80da6a0942d0e5f54ab1c3bbc12d3bd366dc5 Mon Sep 17 00:00:00 2001 From: Andy C Date: Fri, 10 Jan 2025 12:07:46 -0500 Subject: [PATCH] [lazylex/html] Able to parse all my XML test cases! This wasn't hard. There are some unicode differences, but we're simplifying things. --- lazylex/html.py | 58 +++++++++++++++++++++++++------------------- lazylex/html_test.py | 17 +++++++++++++ 2 files changed, 50 insertions(+), 25 deletions(-) diff --git a/lazylex/html.py b/lazylex/html.py index 7dadcfb21..e6175c204 100755 --- a/lazylex/html.py +++ b/lazylex/html.py @@ -140,40 +140,30 @@ def MakeLexer(rules): # Tag name, or attribute name # colon is used in XML -_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*' # must start with letter - -LEXER = [ - # Note non-greedy matches are regular and can be matched in linear time - # with RE2. - # - # https://news.ycombinator.com/item?id=27099798 - # - # Maybe try combining all of these for speed. - # . is any char except newline - # https://re2c.org/manual/manual_c.html +# https://www.w3.org/TR/xml/#NT-Name +# Hm there is a lot of unicode stuff. We are simplifying parsing - # Discarded options - #(r'', Tok.Comment), +_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*' # must start with letter - # Hack from Claude: \s\S instead of re.DOTALL. I don't like this - #(r'', Tok.Comment), - #(r'', Tok.Comment), +LEXER = [ (r'', Tok.Comment), + +# Hack from Claude: \s\S instead of re.DOTALL. I don't like this +#(r'', Tok.Comment), +#(r'', Tok.Comment), LEXER = MakeLexer(LEXER) diff --git a/lazylex/html_test.py b/lazylex/html_test.py index de4a1c635..ffeac1878 100755 --- a/lazylex/html_test.py +++ b/lazylex/html_test.py @@ -190,6 +190,23 @@ def testCData(self): (Tok.EndOfStream, 71), ], tokens) + def testEntity(self): + Tok = html.Tok + + # from + # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml + h = '&ent1;, &ent2;!' + + tokens = Lex(h) + + self.assertEqual([ + (Tok.CharEntity, 6), + (Tok.RawData, 8), + (Tok.CharEntity, 14), + (Tok.RawData, 15), + (Tok.EndOfStream, 15), + ], tokens) + def testStartTag(self): Tok = html.Tok