[lazylex/html] Validate our HTML in Soil

All errors have been fixed! Refactor into Validate() function. Still needs more polish.
oils-for-unix · Jan 11, 2025 · 708b785 · 708b785
1 parent 72d3784
commit 708b785
Show file tree

Hide file tree

Showing 4 changed files with 167 additions and 98 deletions.
diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh
@@ -4,20 +4,42 @@
 #   data_lang/htm8-test.sh
 #
 # TODO:
-# - Rename to DML8?  Because it can handle XML
-# - CDATA in XML, which is not a script
+# - Refactor Validate(): take FLAGS, return stats optionally
+#   - add LEX_QUOTED_VALUES
+#   - and then re-run all the tests
+# - Rename to data_lang/htm8.py
+#   - it has NO_SPECIAL_TAGS mode for XML
 #
-# Operations / Levels:
+# - Soil
+#   - Validate all the HTML in the repo - well-formed check
+#     - this should go in the CI
+#   - Automate some more tests:
+#     - site oils.pub, site oilshell.org
+#     - XML on my machine - turn that into a 'WILD' corpus for HTML/XML?
 #
-# - Lexing
-#   - lex-tags
-#   - lex-attrs - validate all Start tags, all StartEnd tags
-#   - lex-quoted-values - unescaping, etc.
-#     - are there invalid entities?
-# - Parsing
-#   - well-formed / tag balance check
-# - Schema
-#   - not sure if we check the HTML schema or not - it might be too restrictive
+# - statically type it
+#   - revive pyannotate
+# - translate to C++
+#   - what to do about all the regexes?  Port to re2c directly?
+#   - for find(), do we need a C++ primitive for it?
+#   - no allocation for TagName()
+#   - ASDL file for Tok.Foo?
+# - refactor TagName() API - remove it from the TagLexer?
+#   - that is really the AttrLexer()
+#
+# - build a DOM with objects in YSH?
+#   - rewrite ul-table in that?
+#
+# YSH API
+# - Generating HTML/HTM8 is much more common than parsing it
+#   - although maybe we can do RemoveComments as a demo?
+#   - that is the lowest level "sed" model
+# - For parsing, a minimum idea is:
+#   - lexer-based algorithms for query by tag, class name, and id
+#   - and then toTree()
+#     - .tag and .attrs?
+#     - .innerHTML() and .outerHTML() perhaps
+#     - and maybe you can mutate it directly
 
 REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
 
@@ -42,7 +64,7 @@ site-files() {
 #   - can we change that with [.\n]*?
 # - nongreedy match for --> and ?>
 
-ht8-tool() {
+htm8-tool() {
   PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
     $REPO_ROOT/lazylex/html.py "$@"
 }
@@ -52,7 +74,7 @@ test-well-formed() {
 unfinished <!--
 hi && bye
 EOF
-  echo '_tmp/bad.html' | ht8-tool well-formed 
+  echo '_tmp/bad.html' | htm8-tool well-formed 
 }
 
 # site errors
@@ -86,12 +108,12 @@ test-site() {
   # site-files | xargs wc -l | grep total
 
   # Not using xargs
-  time site-files | $REPO_ROOT/$0 ht8-tool well-formed
+  time site-files | $REPO_ROOT/$0 htm8-tool validate
 
   popd
 }
 
-readonly SOIL_ID=8915
+readonly SOIL_ID=8917
 readonly WWZ_DIR=_tmp/$SOIL_ID
 
 sync-wwz() {
@@ -123,7 +145,7 @@ tree-wwz() {
 test-wwz() {
   pushd $WWZ_DIR
 
-  time find . -name '*.html' | $REPO_ROOT/$0 ht8-tool well-formed
+  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool validate
 
   popd
 }
@@ -135,13 +157,25 @@ find-xml() {
 test-other-xml() {
   # problem with &ent1;
   # CDATA support!  haha OK
-  time cat _tmp/xml-files.txt | $REPO_ROOT/$0 ht8-tool well-formed
+  time cat _tmp/xml-files.txt | $REPO_ROOT/$0 htm8-tool validate
 }
 
 test-repo-xml() {
   # OK these parse
   time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
-    | $REPO_ROOT/$0 ht8-tool well-formed
+    | $REPO_ROOT/$0 htm8-tool validate
+}
+
+test-repo-html() {
+  # Validate every *.html file found under the current directory
+  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool validate
+}
+
+test-docs() {
+  # Validate every *.html file found under _release/VERSION
+  time find _release/VERSION -name '*.html' | $REPO_ROOT/$0 htm8-tool validate
+}
+
+soil-run() {
+  # Entry point for the 'check-docs' task listed in soil/worker.sh
+  test-docs
+}
 
 # OK we have to skip the <script> tag!  And <style>

diff --git a/lazylex/html.py b/lazylex/html.py
@@ -4,9 +4,6 @@
 
 See lazylex/README.md for details.
 
-TODO: This should be an Oils library eventually.  It's a "lazily-parsed data
-structure" like TSV8
-
 Conflicts between HTML5 and XML:
 
 - In XML, <source> is like any tag, and must be closed,
@@ -18,6 +15,12 @@
 - The header is different - <!DOCTYPE html> vs.  <?xml version= ... ?>
 
 So do have a mode for <script> <style> and void tags?  Upgrade HX8 into HTM8?
+
+TODO:
+
+- Are there special rules for <svg> and <math>?
+- Do we need to know about <textarea> <pre>?  Those don't have the same
+  whitespace rules
 """
 from __future__ import print_function
 
@@ -29,7 +32,7 @@
 import sys
 
 if sys.version_info.major == 2:
-    from typing import List, Tuple, Optional
+    from typing import List, Tuple, Optional, Dict
 
 
 def log(msg, *args):
@@ -689,6 +692,81 @@ def ToText(s, left_pos=0, right_pos=-1):
     'wbr',
 ]
 
+LEX_ATTRS = 1 << 1
+LEX_QUOTED_VALUES = 1 << 2  # href="?x=42&amp;y=99"
+NO_SPECIAL_TAGS = 1 << 3  # <script> <style>, VOID tags, etc.
+CHECK_TAGS = 1 << 4  # balancing tags
+
+
+def Validate(contents, flags, counters=None):
+    # type: (str, int, Optional[Counters]) -> None
+    """Lex `contents` and check that its start and end tags are balanced.
+
+    Args:
+      contents: the whole document as a string
+      flags: bitwise OR of LEX_ATTRS, LEX_QUOTED_VALUES, NO_SPECIAL_TAGS and
+        CHECK_TAGS.  Accepted for the planned API but not consulted yet:
+        attributes are always lexed and tags are always checked.
+      counters: optional Counters instance that accumulates statistics across
+        calls.  When omitted, the statistics are discarded.
+
+    Raises:
+      LexError: the lexer returned Tok.Invalid
+      ParseError: an end tag appeared with no open start tag, or it does not
+        match the innermost open start tag
+
+    Note: start tags that are still open at the end of the stream are not
+    reported.
+    """
+    if counters is None:
+        # The declared default used to crash with AttributeError, because the
+        # code below updates the counters unconditionally.  Count into a
+        # throwaway object instead.
+        counters = Counters()
+
+    tag_lexer = TagLexer(contents)
+    lx = Lexer(contents)
+    num_tokens = 0
+    start_pos = 0
+    tag_stack = []  # names of the open, non-void elements
+    while True:
+        tok_id, end_pos = lx.Read()
+
+        if tok_id == Tok.Invalid:
+            raise LexError(contents, start_pos)
+        if tok_id == Tok.EndOfStream:
+            break
+
+        num_tokens += 1
+
+        if tok_id == Tok.StartEndTag:
+            counters.num_start_end_tags += 1
+
+            tag_lexer.Reset(start_pos, end_pos)
+            all_attrs = tag_lexer.AllAttrsRaw()
+            counters.num_attrs += len(all_attrs)
+
+        elif tok_id == Tok.StartTag:
+            counters.num_start_tags += 1
+
+            tag_lexer.Reset(start_pos, end_pos)
+            all_attrs = tag_lexer.AllAttrsRaw()
+            counters.num_attrs += len(all_attrs)
+
+            tag_name = lx.TagName()
+            # Void elements like <wbr> are not pushed, so no end tag is
+            # expected for them
+            if tag_name not in VOID_ELEMENTS:
+                tag_stack.append(tag_name)
+
+            counters.max_tag_stack = max(counters.max_tag_stack,
+                                         len(tag_stack))
+
+        elif tok_id == Tok.EndTag:
+            try:
+                expected = tag_stack.pop()
+            except IndexError:
+                raise ParseError('Tag stack empty',
+                                 s=contents,
+                                 start_pos=start_pos)
+
+            actual = lx.TagName()
+            if expected != actual:
+                raise ParseError(
+                    'Got unexpected closing tag %r; opening tag was %r' %
+                    (contents[start_pos:end_pos], expected),
+                    s=contents,
+                    start_pos=start_pos)
+
+        start_pos = end_pos
+
+    # Only reached when no error was raised, so documents that fail to
+    # validate do not contribute to the token count
+    counters.num_tokens += num_tokens
+
+
+class Counters(object):
+    """Statistics that Validate() accumulates, possibly over many documents."""
+
+    def __init__(self):
+        # type: () -> None
+        self.num_tokens = 0  # tokens in documents that validated without error
+        self.num_start_tags = 0  # Tok.StartTag tokens seen
+        self.num_start_end_tags = 0  # Tok.StartEndTag tokens seen
+        self.num_attrs = 0  # attributes returned by AllAttrsRaw() for those tags
+        self.max_tag_stack = 0  # deepest stack of open tags seen
+
 
 def main(argv):
     action = argv[1]
@@ -711,98 +789,54 @@ def main(argv):
 
         return 0
 
-    elif action in ('lex-tags', 'lex-attrs', 'lex-attr-values', 'well-formed'):
-        num_tokens = 0
-        num_start_tags = 0
-        num_start_end_tags = 0
-        num_attrs = 0
-        max_tag_stack = 0
+    elif action == 'validate':
 
         errors = []
+        counters = Counters()
+
         i = 0
         for line in sys.stdin:
-            name = line.strip()
-            with open(name) as f:
+            filename = line.strip()
+            with open(filename) as f:
                 contents = f.read()
 
-            tag_lexer = TagLexer(contents)
-            lx = Lexer(contents)
-            tokens = []
-            start_pos = 0
-            tag_stack = []
+            # TODO: xml version with NO_SPECIAL_TAGS
             try:
-                while True:
-                    tok_id, end_pos = lx.Read()
-
-                    if tok_id == Tok.Invalid:
-                        raise LexError(contents, start_pos)
-                    if tok_id == Tok.EndOfStream:
-                        break
-
-                    tokens.append((tok_id, end_pos))
-
-                    if tok_id == Tok.StartEndTag:
-                        num_start_end_tags += 1
-                        if action in ('lex-attrs', 'lex-attr-values',
-                                      'well-formed'):
-                            tag_lexer.Reset(start_pos, end_pos)
-                            all_attrs = tag_lexer.AllAttrsRaw()
-                            num_attrs += len(all_attrs)
-                    elif tok_id == Tok.StartTag:
-                        num_start_tags += 1
-                        if action in ('lex-attrs', 'lex-attr-values',
-                                      'well-formed'):
-                            tag_lexer.Reset(start_pos, end_pos)
-                            all_attrs = tag_lexer.AllAttrsRaw()
-
-                            tag_name = lx.TagName()
-                            # Don't bother to check
-                            if tag_name not in VOID_ELEMENTS:
-                                tag_stack.append(tag_name)
-
-                            max_tag_stack = max(max_tag_stack, len(tag_stack))
-                    elif tok_id == Tok.EndTag:
-                        try:
-                            expected = tag_stack.pop()
-                        except IndexError:
-                            raise ParseError('Tag stack empty',
-                                             s=contents,
-                                             start_pos=start_pos)
-
-                        actual = lx.TagName()
-                        if expected != actual:
-                            raise ParseError(
-                                'Got unexpected closing tag %r; opening tag was %r'
-                                % (contents[start_pos:end_pos], expected),
-                                s=contents,
-                                start_pos=start_pos)
-
-                    start_pos = end_pos
+                Validate(contents, LEX_ATTRS | LEX_QUOTED_VALUES | CHECK_TAGS,
+                         counters)
             except LexError as e:
-                log('Lex error in %r: %s', name, e)
-                errors.append((name, e))
+                log('Lex error in %r: %s', filename, e)
+                errors.append((filename, e))
             except ParseError as e:
-                log('Parse error in %r: %s', name, e)
-                errors.append((name, e))
-            else:
-                num_tokens += len(tokens)
-
-            #print('%d %s' % (len(tokens), name))
+                log('Parse error in %r: %s', filename, e)
+                errors.append((filename, e))
             i += 1
 
         log('')
         log(
             '  %d tokens, %d start/end tags, %d start tags, %d attrs, %d max tag stack depth in %d files',
-            num_tokens, num_start_end_tags, num_start_tags, num_attrs,
-            max_tag_stack, i)
+            counters.num_tokens, counters.num_start_end_tags,
+            counters.num_start_tags, counters.num_attrs,
+            counters.max_tag_stack, i)
         log('  %d errors', len(errors))
-        if 0:
-            for name, e in errors:
-                log('Error in %r: %s', name, e)
+        if len(errors):
+            return 1
+        return 0
+
+    elif action == 'todo':
+        # Other algorithms:
+        #
+        # - select first subtree with given ID
+        #   - this requires understanding the void tags I suppose
+        # - select all subtrees that have a class
+        # - materialize DOM
+
+        # Safe-HTM8?  This is a filter
+        return 0
 
     else:
         raise RuntimeError('Invalid action %r' % action)
 
 
 if __name__ == '__main__':
-    main(sys.argv)
+    sys.exit(main(sys.argv))
diff --git a/soil/web-init.sh b/soil/web-init.sh
@@ -155,7 +155,7 @@ deploy-data() {
 soil-web-manifest() {
   PYTHONPATH=. /usr/bin/env python2 \
     build/dynamic_deps.py py-manifest soil.web \
-  | grep oilshell/oil  # only stuff in the repo
+  | grep oils-for-unix/oils  # only stuff in the repo
 
   # Add a shell script
   echo $PWD/soil/web.sh soil/web.sh

diff --git a/soil/worker.sh b/soil/worker.sh
@@ -346,6 +346,7 @@ osh-usage         test/osh-usage.sh soil-run             -
 tools-deps        test/tools-deps.sh soil-run            -
 docs              build/doc.sh soil-run                  _release/VERSION/index.html
 doc-metrics       echo no-op                             _release/VERSION/doc/metrics.txt
+check-docs        data_lang/htm8-test.sh soil-run        -
 EOF
 # doc-metrics is a no-op, just for the link.  Because soil-run just runs the
 # release, which creates metrics.