[lazylex] Using faster mystr.find() instead of regex
- No re.DOTALL
- No non-greedy match
Andy C committed Jan 7, 2025
1 parent 445b0d9 commit 39ea432
Showing 3 changed files with 45 additions and 12 deletions.
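The gist of the change: instead of matching an entire comment or processing instruction with a single non-greedy regex (which needs a DOTALL-style workaround so that . can cross newlines), the lexer now matches only the fixed opener and locates the closer with str.find(). A minimal sketch of the two approaches, for illustration only, not the committed code:

import re

# Old style: one non-greedy regex over the whole comment.  '.' does not
# match newline, hence the (?:.|[\n]) workaround (or re.DOTALL).
COMMENT_RE = re.compile(r'<!--(?:.|[\n])*?-->')

def comment_end_regex(s, pos):
    m = COMMENT_RE.match(s, pos)
    return m.end() if m else -1

# New style: match only the opener, then scan for '-->' with str.find(),
# a plain linear substring search.
def comment_end_find(s, pos):
    if not s.startswith('<!--', pos):
        return -1
    end = s.find('-->', pos)
    if end == -1:
        return -1  # unterminated; the committed code raises LexError here
    return end + 3  # token includes the closing '-->'

s = 'a <!-- multi\nline --> b'
assert comment_end_regex(s, 2) == comment_end_find(s, 2) == 21

Both return the end position of the token; the difference is that the find() version never runs the regex engine over the comment body.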
data_lang/htm8-test.sh (3 changes: 2 additions & 1 deletion)
@@ -33,6 +33,7 @@ ht8-tool() {

test-well-formed() {
cat >_tmp/bad.html <<EOF
unfinished <!--
hi && bye
EOF
echo '_tmp/bad.html' | ht8-tool well-formed
@@ -94,7 +95,7 @@ tree-wwz() {
tree $WWZ_DIR
}

check-wwz() {
test-wwz() {
pushd $WWZ_DIR

find . -name '*.html' | $REPO_ROOT/$0 ht8-tool well-formed
lazylex/html.py (34 changes: 23 additions & 11 deletions)
@@ -80,7 +80,8 @@ def Print(self, s):


# HTML Tokens
TOKENS = 'Decl Comment Processing StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData Invalid EndOfStream'.split(
# CommentBegin and ProcessingBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData Invalid EndOfStream'.split(
)


@@ -91,8 +92,6 @@ class Tok(object):
pass


assert len(TOKENS) == 12, TOKENS

TOKEN_NAMES = [None] * len(TOKENS) # type: List[str]

this_module = sys.modules[__name__]
@@ -140,10 +139,7 @@ def MakeLexer(rules):
# EntityRef = / '&' dot{* N} ';' /

LEXER = [
# TODO: instead of nongreedy matches, the loop can just do .find('-->') and
# .find('?>')

# Actually non-greedy matches are regular and can be matched in linear time
# Note non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
@@ -153,11 +149,13 @@ def MakeLexer(rules):
# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

#(r'<!-- .*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),
(r'<!--', Tok.CommentBegin),

# Processing instruction are XML only, but they are treated like a comment
# in HTML:
@@ -166,7 +164,7 @@ def MakeLexer(rules):
#
# We don't want to confuse them with start tags, so we recognize them at
# the top level.
(r'<\? (?:.|\n)*? \?>', Tok.Processing),
(r'<\?', Tok.ProcessingBegin),

# NOTE: < is allowed in these.
(r'<! [^>]+ >', Tok.Decl), # <!DOCTYPE html>
@@ -213,6 +211,20 @@ def _Peek(self):
for pat, tok_id in LEXER:
m = pat.match(self.s, self.pos)
if m:
if tok_id == Tok.CommentBegin:
pos = self.s.find('-->', self.pos)
if pos == -1:
# unterminated <!--
raise LexError(self.s, self.pos)
return Tok.Comment, pos + 3 # -->

if tok_id == Tok.ProcessingBegin:
pos = self.s.find('?>', self.pos)
if pos == -1:
# unterminated <?
raise LexError(self.s, self.pos)
return Tok.Processing, pos + 2 # ?>

return tok_id, m.end()
else:
raise AssertionError('Tok.Invalid rule should have matched')
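The claim in the commit title is easy to sanity-check with a throwaway micro-benchmark. A sketch with a synthetic document (timings vary by machine, and these are not numbers from the commit):

import re
import timeit

doc = '<!-- ' + 'x' * 100000 + ' -->'
pat = re.compile(r'<!--(?:.|[\n])*?-->')

def with_regex():
    return pat.match(doc, 0).end()

def with_find():
    return doc.find('-->', 0) + 3

assert with_regex() == with_find()
print('regex:', timeit.timeit(with_regex, number=200))
print('find :', timeit.timeit(with_find, number=200))

The non-greedy loop advances one character at a time through the alternation (?:.|[\n]), while str.find() delegates the whole scan to a tuned substring search.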
lazylex/html_test.py (20 changes: 20 additions & 0 deletions)
@@ -218,6 +218,26 @@ def testInvalid(self):
else:
self.fail('Expected LexError')

# Comment
lex = html.ValidTokens('<!-- unfinished comment')

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')

# Processing
lex = html.ValidTokens('<? unfinished processing')

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')


if __name__ == '__main__':
unittest.main()
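
The new error paths can also be exercised directly. This assumes only the lazylex.html API visible in the diff above: ValidTokens takes a string and yields (tok_id, end_pos) pairs, raising LexError on malformed input.

from lazylex import html

# A well-formed document lexes to completion.
for tok_id, end_pos in html.ValidTokens('<p>hi</p> <!-- ok -->'):
    print(tok_id, end_pos)

# An unterminated comment is now rejected in the Tok.CommentBegin branch,
# when s.find('-->') returns -1.
try:
    list(html.ValidTokens('unfinished <!--'))
except html.LexError as e:
    print('rejected:', e)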
