[lazylex/html] Options for lexing/parsing
- NO_SPECIAL_TAGS: this is XML mode, basically
- BALANCED_TAGS: makes the balanced-tags check optional, so we can skip it
  - e.g. for old HTML with bugs, which I fixed at HEAD
Andy C committed Jan 11, 2025
1 parent 708b785 commit eeecaf6
Showing 3 changed files with 126 additions and 69 deletions.
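In short, Validate() is now driven by bit flags, and the new htm8-tool actions compose them. A minimal sketch of the three modes, using names from the diff below (the import path is an assumption; Validate() raises LexError or ParseError on bad input):

    from lazylex import html  # assumed import path for lazylex/html.py

    contents = '<p>hello <b>world</b></p>'
    base = html.LEX_ATTRS | html.LEX_QUOTED_VALUES

    # lex-htm8: lex tags, attributes, and quoted values; no balance check
    html.Validate(contents, base, html.Counters())

    # parse-htm8: also check that tags are balanced
    html.Validate(contents, base | html.BALANCED_TAGS, html.Counters())

    # parse-xml: additionally treat <script> and <style> as ordinary elements
    html.Validate(contents,
                  base | html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                  html.Counters())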
57 changes: 36 additions & 21 deletions data_lang/htm8-test.sh
@@ -4,42 +4,53 @@
# data_lang/htm8-test.sh
#
# TODO:
# - Refactor Validate(): take FLAGS, return stats optionally
# - add LEX_QUOTED_VALUES
# - and then re-run all the tests
# - Rename to data_lang/htm8.py
# - it has NO_SPECIAL_TAGS mode for XML
#
# - Soil
# - Validate all the HTML in the repo - well-formed check
# - this should go in the CI
# - Automate some more tests:
# - Validate()
# - add LEX_QUOTED_VALUES, along with counter for it
# - and then re-run all the tests - make sure they pass
# - site oils.pub, site oilshell.org
# - XML on my machine - turn that into a 'WILD' corpus for HTML/XML?
# - Rename to data_lang/htm8.py
# - it has NO_SPECIAL_TAGS mode for XML
# - put iterators at a higher level in doctools/ ?
#
# - statically type it
# - revive pyannotate
# - translate to C++
-# - what to do about all the regexes? Port to re2c directly?
+# - how to handle the regexes in the lexer? Port to re2c directly?
# - for find(), do we need a C++ primitive for it?
# - no allocation for TagName()
# - ASDL file for Tok.Foo?
# - refactor TagName() API - remove it from the TagLexer?
# - that is really the AttrLexer()
#
# - build a DOM with objects in YSH?
# - rewrite ul-table in that?
# Not working yet:
# - understanding all entities &zz;
# - there are over 2000 of them, not sure I want to build them all into the Oils binaries
# - capital letters <TR/> - I guess we can normalize the case
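#   (For scale, an aside not from this commit: Python's stdlib ships the
#   full HTML5 entity table, so one way to count the names is
#
#       python3 -c 'import html.entities; print(len(html.entities.html5))'
#
#   which prints 2231 - consistent with "over 2000" above.)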
#
# YSH API
# - Generating HTML/HTM8 is much more common than parsing it
# - although maybe we can do RemoveComments as a demo?
# - that is the lowest level "sed" model
# - For parsing, a minimum idea is:
# - lexer-based algorithms for query by tag, class name, and id
-#   - and then toTree()
+#   - and then toTree() - this is a DOM
# - .tag and .attrs?
# - .innerHTML() and .outerHTML() perhaps
# - and maybe you can mutate it directly
# - rewrite ul-table in that?
# - does that mean you mutate it, or construct text?
# - I think you can set the innerHTML probably
#
# - Testing of html.ysh aka htm8.ysh in the stdlib
#
# Cases:
# html 'hello <b>world</b>'
# html "hello <b>$name</b>"html
# html ["hello <b>$name</b>"] # hm this isn't bad, it's an unevaluated expression?
# commonmark 'hello **world**'
# md 'hello **world**'
# md ['hello **$escape**'] ? We don't have a good escaping algorithm


REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)

@@ -96,10 +107,14 @@ test-site() {
# - test that each quoted attribute lexes
# - test that tags are balanced

+  local dir
+  local action
   if test -n "$new_site"; then
     dir='../oils.pub__deploy'
+    action='parse-htm8'
   else
     dir='../../oilshell/oilshell.org__deploy'
+    action='lex-htm8'
   fi

pushd $dir
@@ -108,7 +123,7 @@ test-site() {
# site-files | xargs wc -l | grep total

# Not using xargs
-  time site-files | $REPO_ROOT/$0 htm8-tool validate
+  time site-files | $REPO_ROOT/$0 htm8-tool $action

popd
}
@@ -145,7 +160,7 @@ tree-wwz() {
test-wwz() {
pushd $WWZ_DIR

-  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool validate
+  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool parse-htm8

popd
}
@@ -157,21 +172,21 @@ find-xml() {
test-other-xml() {
# problem with &ent1;
# CDATA support! haha OK
-  time cat _tmp/xml-files.txt | $REPO_ROOT/$0 htm8-tool validate
+  time cat _tmp/xml-files.txt | $REPO_ROOT/$0 htm8-tool parse-xml
}

test-repo-xml() {
# OK these parse
time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
-    | $REPO_ROOT/$0 htm8-tool validate
+    | $REPO_ROOT/$0 htm8-tool parse-xml
}

test-repo-html() {
-  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool validate
+  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool parse-htm8
}

test-docs() {
-  time find _release/VERSION -name '*.html' | $REPO_ROOT/$0 htm8-tool validate
+  time find _release/VERSION -name '*.html' | $REPO_ROOT/$0 htm8-tool parse-htm8
}
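
# The test functions above exercise the new htm8-tool actions end to end.
# Roughly, the intended invocations are (assuming htm8-tool forwards to
# lazylex/html.py's main(), as the $REPO_ROOT/$0 calls suggest; filenames
# are read from stdin):
#
#   echo foo.html | data_lang/htm8-test.sh htm8-tool lex-htm8    # lex only
#   echo foo.html | data_lang/htm8-test.sh htm8-tool parse-htm8  # + BALANCED_TAGS
#   echo foo.xml  | data_lang/htm8-test.sh htm8-tool parse-xml   # + NO_SPECIAL_TAGS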

soil-run() {
106 changes: 68 additions & 38 deletions lazylex/html.py
@@ -267,10 +267,12 @@ def MakeLexer(rules):

class Lexer(object):

-    def __init__(self, s, left_pos=0, right_pos=-1):
+    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
         self.s = s
         self.pos = left_pos
         self.right_pos = len(s) if right_pos == -1 else right_pos
+        self.no_special_tags = no_special_tags
+
         self.cache = {}  # string -> compiled regex pattern object

# either </script> or </style> - we search until we see that
@@ -292,7 +294,7 @@ def _Peek(self):

assert self.pos < self.right_pos, self.pos

-        if self.search_state is not None:
+        if self.search_state is not None and not self.no_special_tags:
pos = self.s.find(self.search_state, self.pos)
if pos == -1:
# unterminated <script> or <style>
@@ -353,13 +355,15 @@ def TagNameEquals(self, expected):
assert self.tag_pos_right != -1, self.tag_pos_right

# TODO: In C++, this does not need an allocation
+        # TODO: conditionally lower() case here (maybe not in XML mode)
return expected == self.s[self.tag_pos_left:self.tag_pos_right]

def TagName(self):
# type: () -> str
assert self.tag_pos_left != -1, self.tag_pos_left
assert self.tag_pos_right != -1, self.tag_pos_right

+        # TODO: conditionally lower() case here (maybe not in XML mode)
return self.s[self.tag_pos_left:self.tag_pos_right]

def Read(self):
@@ -409,6 +413,23 @@ def ValidTokens(s, left_pos=0, right_pos=-1):
pos = end_pos


+def ValidTokenList(s, no_special_tags=False):
+    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""
+
+    start_pos = 0
+    tokens = []
+    lx = Lexer(s, no_special_tags=no_special_tags)
+    while True:
+        tok_id, end_pos = lx.Read()
+        tokens.append((tok_id, end_pos))
+        if tok_id == Tok.EndOfStream:
+            break
+        if tok_id == Tok.Invalid:
+            raise LexError(s, start_pos)
+        start_pos = end_pos
+    return tokens
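
An aside, not part of the diff: for an input like '<p>hi</p>', ValidTokenList() returns (tok_id, end_pos) pairs along the lines of

    [(Tok.StartTag, 3), (Tok.RawData, 5), (Tok.EndTag, 9), (Tok.EndOfStream, 9)]

where each end_pos is an offset into s, and LexError is raised on invalid input. The exact ids follow the token rules elsewhere in this file, so treat the values as illustrative.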


# Tag names:
# Match <a or </a
# Match <h2, but not <2h
@@ -695,16 +716,15 @@ def ToText(s, left_pos=0, right_pos=-1):
LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2 # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3 # <script> <style>, VOID tags, etc.
-CHECK_TAGS = 1 << 4  # balancing tags
+BALANCED_TAGS = 1 << 4  # are tags balanced?


-def Validate(contents, flags, counters=None):
-    # type: (str, int, Optional[Dict[str, int]]) -> None
-
-    action = 'well-formed'
+def Validate(contents, flags, counters):
+    # type: (str, int, Counters) -> None

tag_lexer = TagLexer(contents)
-    lx = Lexer(contents)
+    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
+    lx = Lexer(contents, no_special_tags=no_special_tags)
tokens = []
start_pos = 0
tag_stack = []
@@ -720,39 +740,45 @@ def Validate(contents, flags, counters=None):

if tok_id == Tok.StartEndTag:
counters.num_start_end_tags += 1
-            if action in ('lex-attrs', 'lex-attr-values', 'well-formed'):
-                tag_lexer.Reset(start_pos, end_pos)
-                all_attrs = tag_lexer.AllAttrsRaw()
-                counters.num_attrs += len(all_attrs)
+
+            tag_lexer.Reset(start_pos, end_pos)
+            all_attrs = tag_lexer.AllAttrsRaw()
+            counters.num_attrs += len(all_attrs)

elif tok_id == Tok.StartTag:
counters.num_start_tags += 1
-            if action in ('lex-attrs', 'lex-attr-values', 'well-formed'):
-                tag_lexer.Reset(start_pos, end_pos)
-                all_attrs = tag_lexer.AllAttrsRaw()
-                counters.num_attrs += len(all_attrs)
+
+            tag_lexer.Reset(start_pos, end_pos)
+            all_attrs = tag_lexer.AllAttrsRaw()
+            counters.num_attrs += len(all_attrs)

-            tag_name = lx.TagName()
-            # Don't bother to check
-            if tag_name not in VOID_ELEMENTS:
-                tag_stack.append(tag_name)
-
-            counters.max_tag_stack = max(counters.max_tag_stack,
-                                         len(tag_stack))
+            if flags & BALANCED_TAGS:
+                tag_name = lx.TagName()
+                if flags & NO_SPECIAL_TAGS:
+                    tag_stack.append(tag_name)
+                else:
+                    # e.g. <meta> is considered self-closing, like <meta/>
+                    if tag_name not in VOID_ELEMENTS:
+                        tag_stack.append(tag_name)
+
+                counters.max_tag_stack = max(counters.max_tag_stack,
+                                             len(tag_stack))
elif tok_id == Tok.EndTag:
-            try:
-                expected = tag_stack.pop()
-            except IndexError:
-                raise ParseError('Tag stack empty',
-                                 s=contents,
-                                 start_pos=start_pos)
-
-            actual = lx.TagName()
-            if expected != actual:
-                raise ParseError(
-                    'Got unexpected closing tag %r; opening tag was %r' %
-                    (contents[start_pos:end_pos], expected),
-                    s=contents,
-                    start_pos=start_pos)
+            if flags & BALANCED_TAGS:
+                try:
+                    expected = tag_stack.pop()
+                except IndexError:
+                    raise ParseError('Tag stack empty',
+                                     s=contents,
+                                     start_pos=start_pos)
+
+                actual = lx.TagName()
+                if expected != actual:
+                    raise ParseError(
+                        'Got unexpected closing tag %r; opening tag was %r' %
+                        (contents[start_pos:end_pos], expected),
+                        s=contents,
+                        start_pos=start_pos)

start_pos = end_pos
counters.num_tokens += len(tokens)
@@ -789,21 +815,25 @@ def main(argv):

return 0

-    elif action == 'validate':
+    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

errors = []
counters = Counters()

+        flags = LEX_ATTRS | LEX_QUOTED_VALUES
+        if action.startswith('parse-'):
+            flags |= BALANCED_TAGS
+        if action == 'parse-xml':
+            flags |= NO_SPECIAL_TAGS

i = 0
for line in sys.stdin:
filename = line.strip()
with open(filename) as f:
contents = f.read()

-            # TODO: xml version with NO_SPECIAL_TAGS
             try:
-                Validate(contents, LEX_ATTRS | LEX_QUOTED_VALUES | CHECK_TAGS,
-                         counters)
+                Validate(contents, flags, counters)
except LexError as e:
log('Lex error in %r: %s', filename, e)
errors.append((filename, e))
32 changes: 22 additions & 10 deletions lazylex/html_test.py
@@ -105,10 +105,9 @@ def testAllAttrs(self):
self.assertEqual([('href', '?foo=1&amp;bar=2')], lex.AllAttrsRaw())


-def Lex(h):
+def Lex(h, no_special_tags=False):
     print(repr(h))
-    lex = html.ValidTokens(h)
-    tokens = list(lex)
+    tokens = html.ValidTokenList(h, no_special_tags=no_special_tags)
start_pos = 0
for tok_id, end_pos in tokens:
frag = h[start_pos:end_pos]
@@ -132,10 +131,7 @@ def testPstrip(self):

def testCommentParse(self):
n = len(TEST_HTML)
-        for tok_id, end_pos in html._Tokens(TEST_HTML, 0, n):
-            if tok_id == html.Invalid:
-                raise RuntimeError()
-            print(tok_id)
+        tokens = Lex(TEST_HTML)

def testCommentParse2(self):

@@ -187,6 +183,24 @@ def testScriptStyle(self):
],
tokens)

+    def testScriptStyleXml(self):
+        Tok = html.Tok
+        h = 'hi <script src=""> &lt; </script>'
+        # XML mode: no special raw-text treatment for <script>
+        tokens = Lex(h, no_special_tags=True)
+
+        self.assertEqual(
+            [
+                (Tok.RawData, 3),  # 'hi '
+                (Tok.StartTag, 18),  # <script src="">
+                (Tok.RawData, 19),  # space
+                (Tok.CharEntity, 23),  # &lt;
+                (Tok.RawData, 24),  # space
+                (Tok.EndTag, 33),  # </script>
+                (Tok.EndOfStream, 33),
+            ],
+            tokens)

def testCData(self):
Tok = html.Tok

@@ -279,10 +293,8 @@ def testInvalid(self):
]

for s in INVALID:
-            lex = html.ValidTokens(s)
             try:
-                for i in xrange(5):
-                    tok_id, pos = next(lex)
+                tokens = html.ValidTokenList(s)
except html.LexError as e:
print(e)
else:
