From 57b0b186a91f4ac6802ec45bd90be137cbf26375 Mon Sep 17 00:00:00 2001
From: Andy C <andy@oilshell.org>
Date: Mon, 6 Jan 2025 14:39:55 -0500
Subject: [PATCH] [data_lang] Add test for HTM8 (aka HT8)

I might want to call it HT8 to reduce confusion, since HTM is already a
common file extension.

I learned about rules for:

- <script> <style> <textarea>
- CDATA vs RCDATA

I think we may just skip those altogether.  That is, we can have special
lexing rules that treat them as opaque text: if we find one of those
tags, then we just search for the ending </script> or </style>.
---
 data_lang/htm8-test.sh | 86 +++++++++++++++++++++++++++++++++++++++---
 lazylex/html.py        | 36 ++++++++++++++++++
 2 files changed, 116 insertions(+), 6 deletions(-)
 mode change 100644 => 100755 lazylex/html.py
diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh
index 16fee7de7..6c3c72367 100755
--- a/data_lang/htm8-test.sh
+++ b/data_lang/htm8-test.sh
@@ -3,14 +3,20 @@
 # Usage:
 #   data_lang/htm8-test.sh
 
-: ${LIB_OSH=stdlib/osh}
+REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
+
+# Special case: we need $REPO_ROOT
+: ${LIB_OSH=$REPO_ROOT/stdlib/osh}
 source $LIB_OSH/bash-strict.sh
 source $LIB_OSH/task-five.sh
 
 # parse with lazylex/html.py, or data_lang/htm8.py
 
 site-files() {
-  find ../../oilshell/oilshell.org__deploy -name '*.html'
+  #find ../../oilshell/oilshell.org__deploy -name '*.html'
+
+  # omit all the _ files
+  git ls-files | grep '\.html$'
 }
 
 # Issues with lazylex/html.py
@@ -20,15 +26,44 @@ site-files() {
 #   - can we change that with [.\n]*?
 # - nongreedy match for --> and ?>
 
+ht8-tool() {  # run lazylex/html.py with $REPO_ROOT and $REPO_ROOT/vendor on PYTHONPATH
+  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
+    $REPO_ROOT/lazylex/html.py "$@"
+}
+
+test-well-formed() {  # feed a file containing a bare '&&' to the well-formed action
+  cat >_tmp/bad.html <<EOF
+hi && bye
+EOF
+  echo '_tmp/bad.html' | ht8-tool well-formed 
+}
 
+# site errors
+#
+# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
+# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
+# 5833374 tokens in 4710 files
+#
+# The second is the "Woboq" browser, which has CDATA
+# Ah I wonder if we need that.
+
+# Takes ~13 seconds
 test-site() {
-  # 1.5 M lines of HTML - takes 3 xargs invocations!
-  # 
   # TODO: 
-  # - test that it lexes
+  # - test that the top level lexes
+  #   - test that each tag lexes
+  #     - test that each quoted attribute lexes
   # - test that tags are balanced
 
-  site-files | xargs wc -l
+  pushd ../../oilshell/oilshell.org__deploy 
+
+  # Too many files
+  # site-files | xargs wc -l | grep total
+
+  # Not using xargs
+  time site-files | $REPO_ROOT/$0 ht8-tool well-formed
+
+  popd
 }
 
 test-wwz() {
@@ -36,3 +71,42 @@ test-wwz() {
 }
 
 task-five "$@"
+exit
+
+
+echo '
+In HTML5, instead of
+<script>
+<![CDATA[
+  if (x < y) { ... }
+]]>
+</script>
+
+You can write
+
+<script>
+ if (x < y) { ... }
+</script>
+
+<script> <style> <textarea>
+
+These have special escaping rules.  I guess we just do NOT lex them at all?
+We can totally SKIP them.
+
+CDATA vs. RCDATA
+
+<textarea>
+  &lt;p&gt;  <!-- This will show as: <p> -->
+  &amp;    <!-- This will show as: & -->
+</textarea>
+
+<script>
+  &lt;p&gt;  <!-- This will show literally as: &lt;p&gt; -->
+  &amp;     <!-- This will show literally as: &amp; -->
+</script>
+
+The main practical difference is that RCDATA processes HTML entities while
+CDATA treats them as literal text. Both modes ignore HTML tags (treating them
+as plain text) except for their own closing tag.
+'
+
diff --git a/lazylex/html.py b/lazylex/html.py
old mode 100644
new mode 100755
index 03cab4355..f77311267
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -515,3 +515,39 @@ def ToText(s, left_pos=0, right_pos=-1):
 
     out.PrintTheRest()
     return f.getvalue()
+
+
+def main(argv):  # argv[1] names the action; 'well-formed' is the only one handled
+    action = argv[1]
+
+    if action == 'well-formed':
+        num_tokens = 0  # total tokens over the files that lexed without error
+        errors = []  # (file name, LexError) pairs
+        i = 0  # files processed, including ones that failed to lex
+        for line in sys.stdin:  # one file name per line
+            name = line.strip()
+            with open(name) as f:
+                contents = f.read()
+
+            lx = ValidTokens(contents)
+            try:
+                tokens = list(lx)  # materialize all tokens; a LexError raised while iterating is caught below
+            except LexError as e:
+                log('Error in %r: %s', name, e)
+                errors.append((name, e))
+            else:
+                num_tokens += len(tokens)
+            #print('%d %s' % (len(tokens), name))
+            i += 1
+
+        log('%d tokens in %d files', num_tokens, i)
+        if 0:  # disabled: would log every collected error again after the summary
+            for name, e in errors:
+                log('Error in %r: %s', name, e)
+
+    else:
+        raise RuntimeError('Invalid action %r' % action)
+
+
+if __name__ == '__main__':
+    main(sys.argv)