[data_lang] Add test for HTM8 (aka HT8)

I might want to call it HT8 to reduce confusion, since HTM is a common file extension. I learned about the rules for: - <script> <style> <textarea> - CDATA vs. RCDATA. I think we may just skip those altogether. Or rather, we can have special lexing rules that treat them as opaque text: if we find one of those tags, then we just search for the ending </script> or </style>.
oils-for-unix · Jan 6, 2025 · 57b0b18 · 57b0b18
1 parent 549fa1a
commit 57b0b18
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 6 deletions.
diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh
@@ -3,14 +3,20 @@
 # Usage:
 #   data_lang/htm8-test.sh
 
-: ${LIB_OSH=stdlib/osh}
+REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
+
+# Special case: we need $REPO_ROOT
+: ${LIB_OSH=$REPO_ROOT/stdlib/osh}
 source $LIB_OSH/bash-strict.sh
 source $LIB_OSH/task-five.sh
 
 # parse with lazylex/html.py, or data_lang/htm8.py
 
 site-files() {
-  find ../../oilshell/oilshell.org__deploy -name '*.html'
+  #find ../../oilshell/oilshell.org__deploy -name '*.html'
+
+  # omit all the _ files
+  git ls-files | grep '\.html$'
 }
 
 # Issues with lazylex/html.py
@@ -20,19 +26,87 @@ site-files() {
 #   - can we change that with [.\n]*?
 # - nongreedy match for --> and ?>
 
+ht8-tool() {
+  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
+    $REPO_ROOT/lazylex/html.py "$@"
+}
+
+test-well-formed() {
+  cat >_tmp/bad.html <<EOF
+hi && bye
+EOF
+  echo '_tmp/bad.html' | ht8-tool well-formed 
+}
 
+# site errors
+#
+# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
+# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
+# 5833374 tokens in 4710 files
+#
+# The second is the "Woboq" browser, which has CDATA
+# Ah I wonder if we need that.
+
+# Takes ~13 seconds
 test-site() {
-  # 1.5 M lines of HTML - takes 3 xargs invocations!
-  # 
   # TODO: 
-  # - test that it lexes
+  # - test that the top level lexes
+  #   - test that each tag lexes
+  #     - test that each quoted attribute lexes
   # - test that tags are balanced
 
-  site-files | xargs wc -l
+  pushd ../../oilshell/oilshell.org__deploy 
+
+  # Too many files
+  # site-files | xargs wc -l | grep total
+
+  # Not using xargs
+  time site-files | $REPO_ROOT/$0 ht8-tool well-formed
+
+  popd
 }
 
 test-wwz() {
   echo 'TODO: download .wwz from CI'
 }
 
 task-five "$@"
+exit
+
+
+echo '
+In HTML5, instead of
+<script>
+<![CDATA[
+  if (x < y) { ... }
+]]>
+</script>
+
+You can write
+
+<script>
+ if (x < y) { ... }
+</script>
+
+<script> <style> <textarea>
+
+These have special escaping rules.  I guess we just do NOT lex them at all?
+We can totally SKIP them.
+
+CDATA vs. RCDATA
+
+<textarea>
+  &lt;p&gt;  <!-- This will show as: <p> -->
+  &amp;    <!-- This will show as: & -->
+</textarea>
+
+<script>
+  &lt;p&gt;  <!-- This will show literally as: &lt;p&gt; -->
+  &amp;     <!-- This will show literally as: &amp; -->
+</script>
+
+The main practical difference is that RCDATA processes HTML entities while
+CDATA treats them as literal text. Both modes ignore HTML tags (treating them
+as plain text) except for their own closing tag.  '
+'
+
diff --git a/lazylex/html.py b/lazylex/html.py
@@ -515,3 +515,39 @@ def ToText(s, left_pos=0, right_pos=-1):
 
     out.PrintTheRest()
     return f.getvalue()
+
+
+def main(argv):
+    action = argv[1]
+
+    if action == 'well-formed':
+        num_tokens = 0
+        errors = []
+        i = 0
+        for line in sys.stdin:
+            name = line.strip()
+            with open(name) as f:
+                contents = f.read()
+
+            lx = ValidTokens(contents)
+            try:
+                tokens = list(lx)
+            except LexError as e:
+                log('Error in %r: %s', name, e)
+                errors.append((name, e))
+            else:
+                num_tokens += len(tokens)
+            #print('%d %s' % (len(tokens), name))
+            i += 1
+
+        log('%d tokens in %d files', num_tokens, i)
+        if 0:
+            for name, e in errors:
+                log('Error in %r: %s', name, e)
+
+    else:
+        raise RuntimeError('Invalid action %r' % action)
+
+
+if __name__ == '__main__':
+    main(sys.argv)