Skip to content

Commit

Permalink
[lazylex] Working on <script> <style>
Browse files Browse the repository at this point in the history
The data inside them is CDATA.  This is actually a lexer mode ...

We just skip everything until the matching </script> or </style>.

There can't be any other tags nested inside them.  We also allow unescaped
< > & in the raw contents, although we still have to respect the escaped
form &lt;script> too.

I noticed that > is allowed in raw data, but not <.  Hm.
  • Loading branch information
Andy C committed Jan 7, 2025
1 parent 39ea432 commit 09e8d9a
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 34 deletions.
16 changes: 14 additions & 2 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def Print(self, s):

# HTML Tokens
# CommentBegin and ProcessingBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData Invalid EndOfStream'.split(
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData CDataStartTag CDataEndTag Invalid EndOfStream'.split(
)


Expand Down Expand Up @@ -168,14 +168,17 @@ def MakeLexer(rules):

# NOTE: < is allowed in these.
(r'<! [^>]+ >', Tok.Decl), # <!DOCTYPE html>
(r'<(?:script|style) [^>]+>', Tok.CDataStartTag), # start <a>
(r'</ [^>]+ >', Tok.EndTag), # self-closing <br/> comes FIRST
(r'< [^>]+ />', Tok.StartEndTag), # end </a>
(r'< [^>]+ >', Tok.StartTag), # start <a>
(r'&\# [0-9]+ ;', Tok.DecChar),
(r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
(r'& [a-zA-Z]+ ;', Tok.CharEntity),

# Note: > is allowed in raw data.
# HTML5 allows > in raw data - should we? It's apparently not allowed in
# XML.
# But < is not allowed.
# https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
(r'[^&<]+', Tok.RawData),
(r'.', Tok.Invalid), # error!
Expand Down Expand Up @@ -225,6 +228,15 @@ def _Peek(self):
raise LexError(self.s, self.pos)
return Tok.Processing, pos + 2 # ?>

# TODO: we need to enter state so the NEXT call can be CData
# And then the one after that must be CDataEndTag.
if tok_id == Tok.CDataStartTag:
end_tag = '</script>'
pos = self.s.find(end_tag, self.pos)
if pos == -1:
# unterminated </script>
raise LexError(self.s, self.pos)

return tok_id, m.end()
else:
raise AssertionError('Tok.Invalid rule should have matched')
Expand Down
84 changes: 52 additions & 32 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,39 @@ def testProcessingInstruction(self):
log('tok %r', html.TokenName(tok_id))
self.assertEqual(Tok.EndOfStream, tok_id)

    def testScriptStyle(self):
        """Lex <script> contents: CDataStartTag, then (eventually) CData / CDataEndTag.

        Only the tokens up to CDataStartTag are checked today; the lexer
        doesn't yet emit CData or CDataEndTag (see the early return below).
        """

        Tok = html.Tok
        # NOTE(review): indentation inside this triple-quoted string is
        # significant — the asserted positions (12, 27) imply 8 leading
        # spaces before 'hi'.  Reconstructed from a diff view; confirm
        # against the original file.
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        print(repr(h))
        lex = html.ValidTokens(h)

        # Leading raw data: the newline + 8 spaces + 'hi ' before '<script'.
        tok_id, pos = next(lex)
        self.assertEqual(12, pos)
        self.assertEqual(Tok.RawData, tok_id)

        # <script> start tag lexes as the new CDataStartTag token.
        tok_id, pos = next(lex)
        self.assertEqual(27, pos)
        self.assertEqual(Tok.CDataStartTag, tok_id)

        # Early exit: CData / CDataEndTag lexing isn't implemented yet,
        # so the assertions below are aspirational dead code for now.
        return

        # JavaScript code is CData
        tok_id, pos = next(lex)
        self.assertEqual(34, pos)
        log('tok %r', html.TokenName(tok_id))
        self.assertEqual(Tok.CData, tok_id)

        # </script>
        tok_id, pos = next(lex)
        self.assertEqual(27, pos)
        log('tok %r', html.TokenName(tok_id))
        self.assertEqual(Tok.CDataEndTag, tok_id)

def testValid(self):
Tok = html.Tok

Expand Down Expand Up @@ -205,38 +238,25 @@ def testValid(self):
def testInvalid(self):
Tok = html.Tok

lex = html.ValidTokens('<a>&')

tok_id, pos = next(lex)
self.assertEqual(3, pos)
self.assertEqual(Tok.StartTag, tok_id)

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')

# Comment
lex = html.ValidTokens('<!-- unfinished comment')

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')

# Processing
lex = html.ValidTokens('<? unfinished processing')

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')
INVALID = [
# Should be &amp;
'<a>&',
# Hm > is allowed?
#'a > b',
'a < b',
'<!-- unfinished comment',
'<? unfinished processing',
]

for s in INVALID:
lex = html.ValidTokens(s)
try:
for i in xrange(10):
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')


if __name__ == '__main__':
Expand Down

0 comments on commit 09e8d9a

Please sign in to comment.