Skip to content

Commit

Permalink
[lazylex/html] Canonicalize tag names to lower case
Browse files Browse the repository at this point in the history
- For tag matching
- For VOID elements

We still have the problem of finding the closing tag case-insensitively, e.g. </SCRIPT> or </SCRipt>.

Python HTMLParser.py has set_cdata_mode(), which does DYNAMIC
compilation of regexes.  We don't want to / can't really do that in C++.
  • Loading branch information
Andy C committed Jan 13, 2025
1 parent 07ebeba commit fb42513
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 14 deletions.
22 changes: 14 additions & 8 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,7 @@ def _Peek(self):
assert self.pos < self.right_pos, self.pos

if self.search_state is not None and not self.no_special_tags:
# TODO: case-insensitive search for closing tags like </SCRIPT> or </SCRipt>?
pos = self.s.find(self.search_state, self.pos)
if pos == -1:
# unterminated <script> or <style>
Expand Down Expand Up @@ -341,17 +342,22 @@ def TagNameEquals(self, expected):
assert self.tag_pos_left != -1, self.tag_pos_left
assert self.tag_pos_right != -1, self.tag_pos_right

# TODO: In C++, this does not need an allocation
# TODO: conditionally lower() case here (maybe not in XML mode)
return expected == self.s[self.tag_pos_left:self.tag_pos_right]
# TODO: In C++, this does not need an allocation. Can we test
# directly?
return expected == self.CanonicalTagName()

def TagName(self):
def CanonicalTagName(self):
# type: () -> str
assert self.tag_pos_left != -1, self.tag_pos_left
assert self.tag_pos_right != -1, self.tag_pos_right

# TODO: conditionally lower() case here (maybe not in XML mode)
return self.s[self.tag_pos_left:self.tag_pos_right]
tag_name = self.s[self.tag_pos_left:self.tag_pos_right]
# Most tags are already lower case, so avoid allocation with this conditional
# TODO: this could go in the mycpp runtime?
if tag_name.islower():
return tag_name
else:
return tag_name.lower()

def Read(self):
# type: () -> Tuple[int, int]
Expand Down Expand Up @@ -866,7 +872,7 @@ def Validate(contents, flags, counters):
counters.debug_attrs.extend(all_attrs)

if flags & BALANCED_TAGS:
tag_name = lx.TagName()
tag_name = lx.CanonicalTagName()
if flags & NO_SPECIAL_TAGS:
tag_stack.append(tag_name)
else:
Expand All @@ -885,7 +891,7 @@ def Validate(contents, flags, counters):
s=contents,
start_pos=start_pos)

actual = lx.TagName()
actual = lx.CanonicalTagName()
if expected != actual:
raise ParseError(
'Got unexpected closing tag %r; opening tag was %r' %
Expand Down
21 changes: 15 additions & 6 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,8 +407,19 @@ def testValid(self):
'<a href=foo.html></a>',
'<foo x="&"></foo>',

# TODO: capitalization should be allowed
#'<META><a></a>',
# caps
'<foo></FOO>',
'<Foo></fOO>',

# capital VOID tag
'<META><a></a>',

'<script><</script>',
'<SCRipt><</script>',

# Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
# flag to handle this! Gah I want something faster.
#'<script><</SCRIPT>',

# TODO: Test <svg> and <math> ?
]
Expand All @@ -420,11 +431,9 @@ def testValid(self):
INVALID_TAG_LEX = [
# not allowed, but 3 < 4 is allowed
'<p x="3 > 4"></p>',
# same thing
'<a href=">"></a>',
'<a foo=bar !></a>', # bad attr

# should be escaped
#'<a href="&"></a>',
#'<a href=">"></a>',
]


Expand Down

0 comments on commit fb42513

Please sign in to comment.