[lazylex/html] Canonicalize tag names to lower case

- For tag matching
- For VOID elements

We still have the problem of finding </SCRIPT> or </SCRipt>. Python's HTMLParser.py has set_cdata_mode(), which does DYNAMIC compilation of regexes. We don't want to / can't really do that in C++.
oils-for-unix · Jan 13, 2025 · fb42513 · fb42513
1 parent 07ebeba
commit fb42513
Show file tree

Hide file tree

Showing 2 changed files with 29 additions and 14 deletions.
diff --git a/lazylex/html.py b/lazylex/html.py
@@ -282,6 +282,7 @@ def _Peek(self):
         assert self.pos < self.right_pos, self.pos
 
         if self.search_state is not None and not self.no_special_tags:
+            # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
             pos = self.s.find(self.search_state, self.pos)
             if pos == -1:
                 # unterminated <script> or <style>
@@ -341,17 +342,22 @@ def TagNameEquals(self, expected):
         assert self.tag_pos_left != -1, self.tag_pos_left
         assert self.tag_pos_right != -1, self.tag_pos_right
 
-        # TODO: In C++, this does not need an allocation
-        # TODO: conditionally lower() case here (maybe not in XML mode)
-        return expected == self.s[self.tag_pos_left:self.tag_pos_right]
+        # TODO: In C++, this does not need an allocation.  Can we test
+        # directly?
+        return expected == self.CanonicalTagName()
 
-    def TagName(self):
+    def CanonicalTagName(self):
         # type: () -> None
         assert self.tag_pos_left != -1, self.tag_pos_left
         assert self.tag_pos_right != -1, self.tag_pos_right
 
-        # TODO: conditionally lower() case here (maybe not in XML mode)
-        return self.s[self.tag_pos_left:self.tag_pos_right]
+        tag_name = self.s[self.tag_pos_left:self.tag_pos_right]
+        # Most tags are already lower case, so avoid allocation with this conditional
+        # TODO: this could go in the mycpp runtime?
+        if tag_name.islower():
+            return tag_name
+        else:
+            return tag_name.lower()
 
     def Read(self):
         # type: () -> Tuple[int, int]
@@ -866,7 +872,7 @@ def Validate(contents, flags, counters):
             counters.debug_attrs.extend(all_attrs)
 
             if flags & BALANCED_TAGS:
-                tag_name = lx.TagName()
+                tag_name = lx.CanonicalTagName()
                 if flags & NO_SPECIAL_TAGS:
                     tag_stack.append(tag_name)
                 else:
@@ -885,7 +891,7 @@ def Validate(contents, flags, counters):
                                      s=contents,
                                      start_pos=start_pos)
 
-                actual = lx.TagName()
+                actual = lx.CanonicalTagName()
                 if expected != actual:
                     raise ParseError(
                         'Got unexpected closing tag %r; opening tag was %r' %

diff --git a/lazylex/html_test.py b/lazylex/html_test.py
@@ -407,8 +407,19 @@ def testValid(self):
     '<a href=foo.html></a>',
     '<foo x="&"></foo>',
 
-    # TODO: capitalization should be allowed
-    #'<META><a></a>',
+    # caps
+    '<foo></FOO>',
+    '<Foo></fOO>',
+
+    # capital VOID tag
+    '<META><a></a>',
+
+    '<script><</script>',
+    '<SCRipt><</script>',
+
+    # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
+    # flag to handle this!  Gah I want something faster.
+    #'<script><</SCRIPT>',
 
     # TODO: Test <svg> and <math> ?
 ]
@@ -420,11 +431,9 @@ def testValid(self):
 INVALID_TAG_LEX = [
     # not allowed, but 3 < 4 is allowed
     '<p x="3 > 4"></p>',
+    # same thing
+    '<a href=">"></a>',
     '<a foo=bar !></a>',  # bad attr
-
-    # should be escaped
-    #'<a href="&"></a>',
-    #'<a href=">"></a>',
 ]