From 1b8213e291b3fdc968c88e8a0d553583d255c70b Mon Sep 17 00:00:00 2001
From: Andy C <andy@oilshell.org>
Date: Fri, 10 Jan 2025 13:19:30 -0500
Subject: [PATCH] [lazylex/html] Check for balanced tags

Had to take into account VOID elements; otherwise there were too many
errors.

This uncovered many bugs.

1. Fix HTML gen bug in test/wild_report.py
2. Fix HTML gen bug in Soil

We still have to fix the rest.
---
 doc/ysh-doc-processing.md | 16 ++++++++
 lazylex/html.py           | 83 +++++++++++++++++++++++++++++++++------
 soil/web-worker.sh        |  2 -
 test/wild_report.py       |  2 +-
 4 files changed, 89 insertions(+), 14 deletions(-)
diff --git a/doc/ysh-doc-processing.md b/doc/ysh-doc-processing.md
index c54dc25f4..95fc2cd5d 100644
--- a/doc/ysh-doc-processing.md
+++ b/doc/ysh-doc-processing.md
@@ -134,6 +134,22 @@ Safe HTML subset
 
 If you want to take user HTML, then you first use an HTML5 -> HT8 converter.
 
+## Algorithms
+
+### Emitting HX8 as HTML5
+
+Just emit it!  This always works.
+
+### Converting HX8 to XML
+
+- Always quote all attributes
+- Always quote `>` - are we allowing this in HX8?
+- Do something with `<script>` and `<style>`
+  - I guess turn them into normal tags, with escaping?
+  - Or maybe just disallow them?
+- Maybe validate any other declarations, like `<!DOCTYPE foo>`
+- Add XML header `<?xml version=>`, remove `<!DOCTYPE html>`
+
 ## Related
 
 - [table-object-doc.html](table-object-doc.html)
diff --git a/lazylex/html.py b/lazylex/html.py
index 1072971ec..2cd332f12 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -176,8 +176,13 @@ def MakeLexer(rules):
     # Not necessary in HTML5, but occurs in XML
     (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[
 
-    # NOTE: < is allowed in these?
-    (r'<! [^>]+ >', Tok.Decl),  # <!DOCTYPE html>
+    # Markup declarations
+    # - In HTML5, there is only <!DOCTYPE html>
+    # - XML has 4 more declarations: <!ELEMENT ...> ATTLIST ENTITY NOTATION
+    #   - these seem to be part of DTD
+    #   - it's useful to skip these, and be able to parse the rest of the document
+    # - Note: < is allowed?
+    (r'<! [^>]+ >', Tok.Decl),
 
     # Tags
     # Notes:
@@ -237,6 +242,12 @@ def __init__(self, s, left_pos=0, right_pos=-1):
         # either </script> or </style> - we search until we see that
         self.search_state = None  # type: Optional[str]
 
+        # Position of tag name, if applicable
+        # - Set after you get a StartTag, EndTag, or StartEndTag
+        # - Unset on other tags
+        self.tag_pos_left = -1
+        self.tag_pos_right = -1
+
     def _Peek(self):
         # type: () -> Tuple[int, int]
         """
@@ -263,6 +274,14 @@ def _Peek(self):
         for pat, tok_id in LEXER:
             m = pat.match(self.s, self.pos)
             if m:
+                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
+                    self.tag_pos_left = m.start(1)
+                    self.tag_pos_right = m.end(1)
+                else:
+                    # Reset state
+                    self.tag_pos_left = -1
+                    self.tag_pos_right = -1
+
                 if tok_id == Tok.CommentBegin:
                     pos = self.s.find('-->', self.pos)
                     if pos == -1:
@@ -285,16 +304,30 @@ def _Peek(self):
                     return Tok.CData, pos + 3  # ]]>
 
                 if tok_id == Tok.StartTag:
-                    tag_name = m.group(1)  # captured
-                    if tag_name == 'script':
+                    if self.TagNameEquals('script'):
                         self.search_state = '</script>'
-                    elif tag_name == 'style':
+                    elif self.TagNameEquals('style'):
                         self.search_state = '</style>'
 
                 return tok_id, m.end()
         else:
             raise AssertionError('Tok.Invalid rule should have matched')
 
+    def TagNameEquals(self, expected):
+        # type: (str) -> bool
+        assert self.tag_pos_left != -1, self.tag_pos_left
+        assert self.tag_pos_right != -1, self.tag_pos_right
+
+        # TODO: In C++, this does not need an allocation
+        return expected == self.s[self.tag_pos_left:self.tag_pos_right]
+
+    def TagName(self):
+        # type: () -> str
+        assert self.tag_pos_left != -1, self.tag_pos_left
+        assert self.tag_pos_right != -1, self.tag_pos_right
+
+        return self.s[self.tag_pos_left:self.tag_pos_right]
+
     def Read(self):
         # type: () -> Tuple[int, int]
         tok_id, end_pos = self._Peek()
@@ -607,6 +640,25 @@ def ToText(s, left_pos=0, right_pos=-1):
     return f.getvalue()
 
 
+# https://developer.mozilla.org/en-US/docs/Glossary/Void_element
+VOID_ELEMENTS = [
+    'area',
+    'base',
+    'br',
+    'col',
+    'embed',
+    'hr',
+    'img',
+    'input',
+    'link',
+    'meta',
+    'param',
+    'source',
+    'track',
+    'wbr',
+]
+
+
 def main(argv):
     action = argv[1]
 
@@ -625,13 +677,21 @@ def main(argv):
                 contents = f.read()
 
             tag_lexer = TagLexer(contents)
-            lx = ValidTokens(contents)
+            lx = Lexer(contents)
             tokens = []
             start_pos = 0
             tag_stack = []
             try:
-                for tok_id, end_pos in lx:
+                while True:
+                    tok_id, end_pos = lx.Read()
+
+                    if tok_id == Tok.Invalid:
+                        raise LexError(contents, start_pos)
+                    if tok_id == Tok.EndOfStream:
+                        break
+
                     tokens.append((tok_id, end_pos))
+
                     if tok_id == Tok.StartEndTag:
                         num_start_end_tags += 1
                         if action in ('lex-attrs', 'lex-attr-values',
@@ -646,8 +706,10 @@ def main(argv):
                             tag_lexer.Reset(start_pos, end_pos)
                             all_attrs = tag_lexer.AllAttrsRaw()
 
-                            # TODO: we need to get the tag name here
-                            tag_stack.append('TODO')
+                            tag_name = lx.TagName()
+                            # Don't bother to check
+                            if tag_name not in VOID_ELEMENTS:
+                                tag_stack.append(tag_name)
                             max_tag_stack = max(max_tag_stack, len(tag_stack))
                     elif tok_id == Tok.EndTag:
                         try:
@@ -657,8 +719,7 @@ def main(argv):
                                              s=contents,
                                              start_pos=start_pos)
 
-                        # TODO: we need to get the tag name here
-                        actual = 'TODO'
+                        actual = lx.TagName()
                         if expected != actual:
                             raise ParseError(
                                 'Expected closing tag %r, got %r' %
diff --git a/soil/web-worker.sh b/soil/web-worker.sh
index a8acecc62..86a4223d2 100755
--- a/soil/web-worker.sh
+++ b/soil/web-worker.sh
@@ -223,8 +223,6 @@ EOF
 
     <a href="image-layers.txt">image-layers.txt</a> <br/>
     <a href="image-layers.tsv">image-layers.tsv</a> <br/>
-  </body>
-</html>
 EOF
 
   table-sort-end image-layers
diff --git a/test/wild_report.py b/test/wild_report.py
index b69981d50..3e9a33a12 100755
--- a/test/wild_report.py
+++ b/test/wild_report.py
@@ -281,7 +281,7 @@ def MakeHtmlGroup(title_str, body_str):
           <a class="fail" href="#stderr_parse_{name}">FAIL</a>
           <td>{parse_proc_secs}</td>
         {.or}
-          <span class="ok">OK</a>
+          <span class="ok">OK</span>
           <td>{parse_proc_secs}</td>
         {.end}
       </td>