From 1b8213e291b3fdc968c88e8a0d553583d255c70b Mon Sep 17 00:00:00 2001 From: Andy C Date: Fri, 10 Jan 2025 13:19:30 -0500 Subject: [PATCH] [lazylex/html] Check for balanced tags Had to take into account VOID elements; otherwise there were too many errors. This uncovered many bugs. 1. Fix HTML gen bug in test/wild_report.py 2. Fix HTML gen bug in Soil We still have to fix the rest. --- doc/ysh-doc-processing.md | 16 ++++++++ lazylex/html.py | 83 +++++++++++++++++++++++++++++++++------ soil/web-worker.sh | 2 - test/wild_report.py | 2 +- 4 files changed, 89 insertions(+), 14 deletions(-) diff --git a/doc/ysh-doc-processing.md b/doc/ysh-doc-processing.md index c54dc25f4..95fc2cd5d 100644 --- a/doc/ysh-doc-processing.md +++ b/doc/ysh-doc-processing.md @@ -134,6 +134,22 @@ Safe HTML subset If you want to take user HTML, then you first use an HTML5 -> HT8 converter. +## Algorithms + +### Emitting HX8 as HTML5 + +Just emit it! This always works. + +### Converting HX8 to XML + +- Always quote all attributes +- Always quote `>` - are we allowing this in HX8? 
+- Do something with ` or - we search until we see that self.search_state = None # type: Optional[str] + # Position of tag name, if applicable + # - Set after you get a StartTag, EndTag, or StartEndTag + # - Unset on other tags + self.tag_pos_left = -1 + self.tag_pos_right = -1 + def _Peek(self): # type: () -> Tuple[int, int] """ @@ -263,6 +274,14 @@ def _Peek(self): for pat, tok_id in LEXER: m = pat.match(self.s, self.pos) if m: + if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag): + self.tag_pos_left = m.start(1) + self.tag_pos_right = m.end(1) + else: + # Reset state + self.tag_pos_left = -1 + self.tag_pos_right = -1 + if tok_id == Tok.CommentBegin: pos = self.s.find('-->', self.pos) if pos == -1: @@ -285,16 +304,30 @@ def _Peek(self): return Tok.CData, pos + 3 # ]]> if tok_id == Tok.StartTag: - tag_name = m.group(1) # captured - if tag_name == 'script': + if self.TagNameEquals('script'): self.search_state = '' - elif tag_name == 'style': + elif self.TagNameEquals('style'): self.search_state = '' return tok_id, m.end() else: raise AssertionError('Tok.Invalid rule should have matched') + def TagNameEquals(self, expected): + # type: (str) -> bool + assert self.tag_pos_left != -1, self.tag_pos_left + assert self.tag_pos_right != -1, self.tag_pos_right + + # TODO: In C++, this does not need an allocation + return expected == self.s[self.tag_pos_left:self.tag_pos_right] + + def TagName(self): + # type: () -> str + assert self.tag_pos_left != -1, self.tag_pos_left + assert self.tag_pos_right != -1, self.tag_pos_right + + return self.s[self.tag_pos_left:self.tag_pos_right] + def Read(self): # type: () -> Tuple[int, int] tok_id, end_pos = self._Peek() @@ -607,6 +640,25 @@ def ToText(s, left_pos=0, right_pos=-1): return f.getvalue() +# https://developer.mozilla.org/en-US/docs/Glossary/Void_element +VOID_ELEMENTS = [ + 'area', + 'base', + 'br', + 'col', + 'embed', + 'hr', + 'img', + 'input', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr', +] + + def 
main(argv): action = argv[1] @@ -625,13 +677,21 @@ def main(argv): contents = f.read() tag_lexer = TagLexer(contents) - lx = ValidTokens(contents) + lx = Lexer(contents) tokens = [] start_pos = 0 tag_stack = [] try: - for tok_id, end_pos in lx: + while True: + tok_id, end_pos = lx.Read() + + if tok_id == Tok.Invalid: + raise LexError(contents, start_pos) + if tok_id == Tok.EndOfStream: + break + tokens.append((tok_id, end_pos)) + if tok_id == Tok.StartEndTag: num_start_end_tags += 1 if action in ('lex-attrs', 'lex-attr-values', @@ -646,8 +706,10 @@ def main(argv): tag_lexer.Reset(start_pos, end_pos) all_attrs = tag_lexer.AllAttrsRaw() - # TODO: we need to get the tag name here - tag_stack.append('TODO') + tag_name = lx.TagName() + # Don't bother to check + if tag_name not in VOID_ELEMENTS: + tag_stack.append(tag_name) max_tag_stack = max(max_tag_stack, len(tag_stack)) elif tok_id == Tok.EndTag: try: @@ -657,8 +719,7 @@ def main(argv): s=contents, start_pos=start_pos) - # TODO: we need to get the tag name here - actual = 'TODO' + actual = lx.TagName() if expected != actual: raise ParseError( 'Expected closing tag %r, got %r' % diff --git a/soil/web-worker.sh b/soil/web-worker.sh index a8acecc62..86a4223d2 100755 --- a/soil/web-worker.sh +++ b/soil/web-worker.sh @@ -223,8 +223,6 @@ EOF image-layers.txt
image-layers.tsv
- - EOF table-sort-end image-layers diff --git a/test/wild_report.py b/test/wild_report.py index b69981d50..3e9a33a12 100755 --- a/test/wild_report.py +++ b/test/wild_report.py @@ -281,7 +281,7 @@ def MakeHtmlGroup(title_str, body_str): FAIL {parse_proc_secs} {.or} - OK + OK {parse_proc_secs} {.end}