From 39ea4325fb0a294a2f2d96facb431f792c33168a Mon Sep 17 00:00:00 2001 From: Andy C Date: Tue, 7 Jan 2025 00:57:41 -0500 Subject: [PATCH] [lazylex] Using faster mystr.find() instead of regex - No re.DOTALL - No non-greedy match --- data_lang/htm8-test.sh | 3 ++- lazylex/html.py | 34 +++++++++++++++++++++++----------- lazylex/html_test.py | 20 ++++++++++++++++++++ 3 files changed, 45 insertions(+), 12 deletions(-) diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh index 548ac3908..9a3f860d5 100755 --- a/data_lang/htm8-test.sh +++ b/data_lang/htm8-test.sh @@ -33,6 +33,7 @@ ht8-tool() { test-well-formed() { cat >_tmp/bad.html <') and - # .find('?>') - - # Actually non-greedy matches are regular and can be matched in linear time + # Note non-greedy matches are regular and can be matched in linear time # with RE2. # # https://news.ycombinator.com/item?id=27099798 @@ -153,11 +149,13 @@ def MakeLexer(rules): # . is any char except newline # https://re2c.org/manual/manual_c.html + # Discarded options + #(r'', Tok.Comment), + # Hack from Claude: \s\S instead of re.DOTALL. I don't like this #(r'', Tok.Comment), - (r'', Tok.Comment), - - #(r'', Tok.Comment), + #(r'', Tok.Comment), + (r'', self.pos) + if pos == -1: + # unterminated + + if tok_id == Tok.ProcessingBegin: + pos = self.s.find('?>', self.pos) + if pos == -1: + # unterminated + return tok_id, m.end() else: raise AssertionError('Tok.Invalid rule should have matched') diff --git a/lazylex/html_test.py b/lazylex/html_test.py index 14664f160..ed67a1c10 100755 --- a/lazylex/html_test.py +++ b/lazylex/html_test.py @@ -218,6 +218,26 @@ def testInvalid(self): else: self.fail('Expected LexError') + # Comment + lex = html.ValidTokens('