Skip to content

Commit

Permalink
[lazylex/html] Check for balanced tags
Browse files Browse the repository at this point in the history
Had to take into account VOID elements; otherwise there were too many
errors.

This uncovered many bugs.

1. Fix HTML gen bug in test/wild_report.py
2. Fix HTML gen bug in Soil

We still have to fix the rest.
  • Loading branch information
Andy C committed Jan 10, 2025
1 parent 2c267d2 commit 1b8213e
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 14 deletions.
16 changes: 16 additions & 0 deletions doc/ysh-doc-processing.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,22 @@ Safe HTML subset

If you want to take user HTML, then you first use an HTML5 -> HX8 converter.

## Algorithms

### Emitting HX8 as HTML5

Just emit it! This always works.

### Converting HX8 to XML

- Always quote all attributes
- Always quote `>` - are we allowing this in HX8?
- Do something with `<script>` and `<style>`
- I guess turn them into normal tags, with escaping?
- Or maybe just disallow them?
- Maybe validate any other declarations, like `<!DOCTYPE foo>`
- Add XML header `<?xml version=>`, remove `<!DOCTYPE html>`

## Related

- [table-object-doc.html](table-object-doc.html)
83 changes: 72 additions & 11 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,13 @@ def MakeLexer(rules):
# Not necessary in HTML5, but occurs in XML
(r'<!\[CDATA\[', Tok.CDataBegin), # <![CDATA[

# NOTE: < is allowed in these?
(r'<! [^>]+ >', Tok.Decl), # <!DOCTYPE html>
# Markup declarations
# - In HTML5, there is only <!DOCTYPE html>
# - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
# - these seem to be part of DTD
# - it's useful to skip these, and be able to parse the rest of the document
# - Note: < is allowed?
(r'<! [^>]+ >', Tok.Decl),

# Tags
# Notes:
Expand Down Expand Up @@ -237,6 +242,12 @@ def __init__(self, s, left_pos=0, right_pos=-1):
# either </script> or </style> - we search until we see that
self.search_state = None # type: Optional[str]

# Position of tag name, if applicable
# - Set after you get a StartTag, EndTag, or StartEndTag
# - Unset on other tags
self.tag_pos_left = -1
self.tag_pos_right = -1

def _Peek(self):
# type: () -> Tuple[int, int]
"""
Expand All @@ -263,6 +274,14 @@ def _Peek(self):
for pat, tok_id in LEXER:
m = pat.match(self.s, self.pos)
if m:
if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
self.tag_pos_left = m.start(1)
self.tag_pos_right = m.end(1)
else:
# Reset state
self.tag_pos_left = -1
self.tag_pos_right = -1

if tok_id == Tok.CommentBegin:
pos = self.s.find('-->', self.pos)
if pos == -1:
Expand All @@ -285,16 +304,30 @@ def _Peek(self):
return Tok.CData, pos + 3 # ]]>

if tok_id == Tok.StartTag:
tag_name = m.group(1) # captured
if tag_name == 'script':
if self.TagNameEquals('script'):
self.search_state = '</script>'
elif tag_name == 'style':
elif self.TagNameEquals('style'):
self.search_state = '</style>'

return tok_id, m.end()
else:
raise AssertionError('Tok.Invalid rule should have matched')

def TagNameEquals(self, expected):
    # type: (str) -> bool
    """Return True if the current tag's name equals *expected*.

    Valid only after reading a StartTag, EndTag, or StartEndTag, which
    set tag_pos_left / tag_pos_right.
    """
    assert self.tag_pos_left != -1, self.tag_pos_left
    assert self.tag_pos_right != -1, self.tag_pos_right

    # TODO: In C++, this comparison can be done without an allocation
    name = self.s[self.tag_pos_left:self.tag_pos_right]
    return name == expected

def TagName(self):
    # type: () -> str
    """Return the name of the most recently read tag.

    Valid only after reading a StartTag, EndTag, or StartEndTag, which
    set tag_pos_left / tag_pos_right.

    Fix: the type comment previously said '() -> None', but the method
    returns the tag-name slice of the input string.
    """
    assert self.tag_pos_left != -1, self.tag_pos_left
    assert self.tag_pos_right != -1, self.tag_pos_right

    return self.s[self.tag_pos_left:self.tag_pos_right]

def Read(self):
# type: () -> Tuple[int, int]
tok_id, end_pos = self._Peek()
Expand Down Expand Up @@ -607,6 +640,25 @@ def ToText(s, left_pos=0, right_pos=-1):
return f.getvalue()


# HTML5 "void elements" have no closing tag, so balanced-tag checking
# must not push them onto the tag stack.
# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
VOID_ELEMENTS = [
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
    'input', 'link', 'meta', 'param', 'source', 'track', 'wbr',
]


def main(argv):
action = argv[1]

Expand All @@ -625,13 +677,21 @@ def main(argv):
contents = f.read()

tag_lexer = TagLexer(contents)
lx = ValidTokens(contents)
lx = Lexer(contents)
tokens = []
start_pos = 0
tag_stack = []
try:
for tok_id, end_pos in lx:
while True:
tok_id, end_pos = lx.Read()

if tok_id == Tok.Invalid:
raise LexError(contents, start_pos)
if tok_id == Tok.EndOfStream:
break

tokens.append((tok_id, end_pos))

if tok_id == Tok.StartEndTag:
num_start_end_tags += 1
if action in ('lex-attrs', 'lex-attr-values',
Expand All @@ -646,8 +706,10 @@ def main(argv):
tag_lexer.Reset(start_pos, end_pos)
all_attrs = tag_lexer.AllAttrsRaw()

# TODO: we need to get the tag name here
tag_stack.append('TODO')
tag_name = lx.TagName()
# Don't bother to check
if tag_name not in VOID_ELEMENTS:
tag_stack.append(tag_name)
max_tag_stack = max(max_tag_stack, len(tag_stack))
elif tok_id == Tok.EndTag:
try:
Expand All @@ -657,8 +719,7 @@ def main(argv):
s=contents,
start_pos=start_pos)

# TODO: we need to get the tag name here
actual = 'TODO'
actual = lx.TagName()
if expected != actual:
raise ParseError(
'Expected closing tag %r, got %r' %
Expand Down
2 changes: 0 additions & 2 deletions soil/web-worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,6 @@ EOF
<a href="image-layers.txt">image-layers.txt</a> <br/>
<a href="image-layers.tsv">image-layers.tsv</a> <br/>
</body>
</html>
EOF

table-sort-end image-layers
Expand Down
2 changes: 1 addition & 1 deletion test/wild_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,7 @@ def MakeHtmlGroup(title_str, body_str):
<a class="fail" href="#stderr_parse_{name}">FAIL</a>
<td>{parse_proc_secs}</td>
{.or}
<span class="ok">OK</a>
<span class="ok">OK</span>
<td>{parse_proc_secs}</td>
{.end}
</td>
Expand Down

0 comments on commit 1b8213e

Please sign in to comment.