[lazylex/html] Special lexing rules for <script> and <style>
This works well on our corpus!

Also started testing XML.

Add a couple TODOs:

- tighten up end tags
- disallow unescaped <
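
The idea in miniature, as a standalone sketch (not the real lazylex/html.py
API): after emitting a StartTag for <script> or <style>, the lexer switches to
a "search" state and scans with str.find() for the matching end tag, yielding
everything in between as one CData token.

    import re

    # Simplified tag pattern; the real lexer validates tag names separately
    # and disallows leading whitespace.
    TAG_RE = re.compile(r'<\s*(/)?\s*([a-zA-Z][a-zA-Z0-9]*)[^>]*>')

    def tokens(s):
        pos = 0
        search = None  # '</script>' or '</style>' while inside those elements
        while pos < len(s):
            if search is not None:
                end = s.find(search, pos)
                if end == -1:
                    raise ValueError('unterminated %r' % search)
                yield 'CData', s[pos:end]
                pos = end  # leave the end tag for the normal rule below
                search = None
                continue
            m = TAG_RE.match(s, pos)
            if m:
                kind = 'EndTag' if m.group(1) else 'StartTag'
                if kind == 'StartTag' and m.group(2) in ('script', 'style'):
                    search = '</%s>' % m.group(2)
                yield kind, m.group(0)
                pos = m.end()
            else:
                # Everything up to the next < is raw data (entities omitted
                # here; the real lexer also handles &...; and errors).
                nxt = s.find('<', pos + 1)
                end = len(s) if nxt == -1 else nxt
                yield 'RawData', s[pos:end]
                pos = end

    # >>> list(tokens('<script>if (a < b) { go(); }</script>'))
    # [('StartTag', '<script>'), ('CData', 'if (a < b) { go(); }'),
    #  ('EndTag', '</script>')]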
Andy C committed Jan 10, 2025
1 parent bafbe3b commit 5ec0ce7
Showing 3 changed files with 48 additions and 29 deletions.
16 changes: 16 additions & 0 deletions data_lang/htm8-test.sh
@@ -103,6 +103,22 @@ test-wwz() {
  popd
}

find-xml() {
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}

test-other-xml() {
  # problem with &ent1;
  # CDATA support! haha OK
  time cat _tmp/xml-files.txt | $REPO_ROOT/$0 ht8-tool well-formed
}

test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | $REPO_ROOT/$0 ht8-tool well-formed
}
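
# Usage sketch (assumed workflow, on the usual run-a-function-by-name
# pattern): build the file list once, then validate it:
#
#   $ data_lang/htm8-test.sh find-xml        # writes _tmp/xml-files.txt
#   $ data_lang/htm8-test.sh test-other-xml  # checks well-formedness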

# OK we have to skip the <script> tag! And <style>
#
# document.location = '#' + params.join('&');
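#
# Why that line breaks without a special case: RawData stops at & and <, and
# the "&'" sequence matches none of the entity rules (&#...; &#x...; &name;),
# so the lexer would emit Tok.Invalid right at the &.  A bare < in JavaScript,
# e.g. "a < b", fails the same way.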
49 changes: 28 additions & 21 deletions lazylex/html.py
@@ -17,7 +17,7 @@
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple
    from typing import List, Tuple, Optional


def log(msg, *args):
@@ -81,7 +81,7 @@ def Print(self, s):

# HTML Tokens
# CommentBegin and ProcessingBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData CDataStartTag CDataEndTag Invalid EndOfStream'.split(
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData Invalid EndOfStream'.split(
)


@@ -174,31 +174,28 @@ def MakeLexer(rules):
    #(r'<(?:script|style) [^>]+>', Tok.CDataStartTag),  # start <a>

    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    #
    # TODO: do something different for <script> and <style>. And maybe have a
    # mode to also understand the difference between <pre> <textarea> and say
    # <div>.
    (r'</ (%s) [^>]* >' % _NAME, Tok.EndTag),  # self-closing <br/> comes FIRST
    (r'</ (%s) [^>]* >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>]* >' % _NAME, Tok.StartTag),  # start <a>

    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& [a-zA-Z]+ ;', Tok.CharEntity),

    # HTML5 allows > in raw data - should we? But < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # TODO: I think we should disallow it, like XML does. There should be "one
    # way to do it". The distinction between <script> <style> and other tags
    # is stronger this way.
    (r'[^&<]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# TODO:
# - I think we should disallow unescaped <, like XML does. There should be
#   "one way to do it", and it should catch bugs.
# - End tags shouldn't allow any other data; it has to be </foo>, not </foo x=y>.

LEXER = MakeLexer(LEXER)
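
# Illustration (hypothetical inputs) of how the rules above tokenize,
# assuming the patterns are compiled in verbose mode as their spacing suggests:
#
#   '&#64;'   -> Tok.DecChar
#   '&#x40;'  -> Tok.HexChar
#   '&amp;'   -> Tok.CharEntity
#   'a > b'   -> Tok.RawData  (bare > is allowed; bare < is not)
#   '<br/>'   -> Tok.StartEndTag  (matched before the StartTag rule)
#   '<b>'     -> Tok.StartTag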


@@ -210,6 +207,9 @@ def __init__(self, s, left_pos=0, right_pos=-1):
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
@@ -220,6 +220,15 @@ def _Peek(self):

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # pos is the beginning of the end tag, e.g. </script>
            return Tok.CData, pos
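        # The end tag itself is left in the input, so the next _Peek() call
        # tokenizes it with the ordinary EndTag rule.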

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.
@@ -243,14 +252,12 @@
                    raise LexError(self.s, self.pos)
                return Tok.Processing, pos + 2  # ?>

            # TODO: we need to enter state so the NEXT call can be CData
            # And then the one after that must be CDataEndTag.
            if tok_id == Tok.CDataStartTag:
                end_tag = '</script>'
                pos = self.s.find(end_tag, self.pos)
                if pos == -1:
                    # unterminated </script>
                    raise LexError(self.s, self.pos)
            if tok_id == Tok.StartTag:
                tag_name = m.group(1)  # captured
                if tag_name == 'script':
                    self.search_state = '</script>'
                elif tag_name == 'style':
                    self.search_state = '</style>'
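                # Only search_state changes here; the *next* _Peek() call
                # takes the CData branch above and scans for the end tag.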

            return tok_id, m.end()
        else:
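The net effect, end to end. A sketch: ValidTokens stands in for whatever
iterator the test below constructs with next(lex), and positions are end
offsets as returned by _Peek():

    h = '<p>hi</p><script>var x = 1 < 2;</script>'
    lex = html.ValidTokens(h)  # assumed name for the (tok_id, end_pos) iterator
    # Expected stream:
    #   StartTag  @ 3    <p>
    #   RawData   @ 5    hi
    #   EndTag    @ 9    </p>
    #   StartTag  @ 17   <script>
    #   CData     @ 31   var x = 1 < 2;   (bare < is fine in CData)
    #   EndTag    @ 40   </script>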
12 changes: 4 additions & 8 deletions lazylex/html_test.py
@@ -179,26 +179,22 @@ def testScriptStyle(self):
        self.assertEqual(12, pos)
        self.assertEqual(Tok.RawData, tok_id)

        return

        # <script>
        tok_id, pos = next(lex)
        self.assertEqual(27, pos)
        self.assertEqual(Tok.CDataStartTag, tok_id)

        return
        self.assertEqual(Tok.StartTag, tok_id)

        # JavaScript code is CData
        tok_id, pos = next(lex)
        self.assertEqual(34, pos)
        log('tok %r', html.TokenName(tok_id))
        self.assertEqual(78, pos)
        self.assertEqual(Tok.CData, tok_id)

        # </script>
        tok_id, pos = next(lex)
        self.assertEqual(27, pos)
        log('tok %r', html.TokenName(tok_id))
        self.assertEqual(Tok.CDataEndTag, tok_id)
        self.assertEqual(87, pos)
        self.assertEqual(Tok.EndTag, tok_id)
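        # Note on the numbers: the lexer yields *end* positions.  _Peek()
        # returns m.end() for regex tokens, and for CData it returns the start
        # of the end tag, so 87 == 78 + len('</script>').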

    def testValid(self):
        Tok = html.Tok
