From 06efda8006e88f396193253570d7b307db5cd858 Mon Sep 17 00:00:00 2001
From: Andy C <andy@oilshell.org>
Date: Sat, 11 Jan 2025 11:55:39 -0500
Subject: [PATCH] [doctools/micro-syntax] Start HTM8 lexer.

The motivation was to test some HTM8 rules in re2c.

It works fine!  We can distinguish StartEndTag with /> vs. StartTag with
>.

re2c takes the longest match.  If the matches are equal length, then it
picks the first rule.

Our rule in lazylex/html.py is different - it always takes the first match -
but it also works in this case.
---
 doctools/micro-syntax.sh     | 14 ++++++++++++
 doctools/micro_syntax.cc     | 19 +++++++++++++++-
 doctools/micro_syntax.re2c.h | 43 ++++++++++++++++++++++++++++++++++++
 lazylex/html.py              | 10 +++++----
 4 files changed, 81 insertions(+), 5 deletions(-)
diff --git a/doctools/micro-syntax.sh b/doctools/micro-syntax.sh
index f6af485738..7afc43b4e4 100755
--- a/doctools/micro-syntax.sh
+++ b/doctools/micro-syntax.sh
@@ -213,6 +213,13 @@ readonly -a R_TESTS=(
   3')"
 )
 
+readonly -a HTML_TESTS=(  # inputs passed to 'run-cases html'
+  '<p>hi</p>'              # start tag, text, end tag
+  'hi <br/>'               # text, then a self-closing tag
+  '<img src="foo"/>'       # self-closing tag with a quoted attribute
+  '<a href=foo>link</a>'   # start tag with an unquoted attribute value
+)
+
 run-cases() {
   local lang=$1
   shift
@@ -246,6 +253,12 @@ test-R() {
   run-cases R "${R_TESTS[@]}"
 }
 
+test-html() {  # calls build, then runs only the HTML_TESTS cases
+  build
+  run-cases html "${HTML_TESTS[@]}"
+}
+
+
 run-tests() {
   local bin=$BASE_DIR/micro_syntax
 
@@ -255,6 +268,7 @@ run-tests() {
   run-cases cpp "${CPP_TESTS[@]}"
   run-cases py "${PY_TESTS[@]}"
   run-cases R "${R_TESTS[@]}"
+  run-cases html "${HTML_TESTS[@]}"
 
   # No language specified
   echo '==== No language'
diff --git a/doctools/micro_syntax.cc b/doctools/micro_syntax.cc
index e3cb64de58..814169cae0 100644
--- a/doctools/micro_syntax.cc
+++ b/doctools/micro_syntax.cc
@@ -49,6 +49,7 @@ enum class lang_e {
   Py,
   Shell,
   Ysh,  // ''' etc.
+  Html,
   Asdl,
   R,  // uses # comments
 
@@ -328,6 +329,15 @@ class AnsiPrinter : public Printer {
       PrintColor(GREEN, p_start, num_bytes);
       break;
 
+    case Id::StartTag:
+    case Id::EndTag:
+      PrintColor(PURPLE, p_start, num_bytes);
+      break;
+
+    case Id::StartEndTag:
+      PrintColor(RED2, p_start, num_bytes);
+      break;
+
     case Id::Unknown:
       // Make errors red
       fputs(REVERSE, stdout);
@@ -955,6 +965,10 @@ int ScanFiles(const Flags& flag, std::vector<char*> files, OutputStream* out,
       status = ScanOne<R_mode_e>(reader, out, hook);
       break;
 
+    case lang_e::Html:
+      status = ScanOne<html_mode_e>(reader, out, hook);
+      break;
+
     default:
       assert(0);
     }
@@ -1040,6 +1054,9 @@ int main(int argc, char** argv) {
       } else if (strcmp(optarg, "yaml") == 0) {
         flag.lang = lang_e::PlainText;
 
+      } else if (strcmp(optarg, "html") == 0) {
+        flag.lang = lang_e::Html;
+
       } else if (strcmp(optarg, "txt") == 0) {
         flag.lang = lang_e::PlainText;
 
@@ -1047,7 +1064,7 @@ int main(int argc, char** argv) {
         flag.lang = lang_e::PlainText;
 
       } else {
-        Log("Expected -l LANG to be cpp|py|shell|asdl|R|js|css|md|yaml|txt, "
+        Log("Expected -l LANG to be cpp|py|shell|asdl|R|js|css|md|yaml|html|txt, "
             "got %s",
             optarg);
         return 2;
diff --git a/doctools/micro_syntax.re2c.h b/doctools/micro_syntax.re2c.h
index 7f847c5d76..5a218f63aa 100644
--- a/doctools/micro_syntax.re2c.h
+++ b/doctools/micro_syntax.re2c.h
@@ -41,6 +41,11 @@ enum class Id {
   HereBegin,
   HereEnd,
 
+  // Html
+  StartTag,
+  EndTag,
+  StartEndTag,
+
   // Zero-width token to detect #ifdef and Python INDENT/DEDENT
   // StartLine,
 
@@ -669,6 +674,44 @@ bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
   return false;
 }
 
+enum class html_mode_e {  // lexer modes for HTML
+  Outer,  // only mode so far; handled in Matcher<html_mode_e>::Match
+};
+
+// Lexes one HTML token at lexer->p_current; returns whether EOL was hit.
+template <>
+bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
+  const char* p = lexer->p_current;  // mutated by re2c
+  const char* YYMARKER = p;  // not used below; for the re2c-generated code
+  // html_mode_e::Outer is the only mode handled here.
+  switch (lexer->line_mode) {
+  case html_mode_e::Outer:
+    while (true) {  // NOTE(review): TOK() presumably breaks out -- confirm
+      /*!re2c
+        nul       { return true; }
+
+        // Like _NAME in HTM8
+        name = [a-zA-Z][a-zA-Z0-9:_-]* ;
+        // On equal-length matches the first rule wins, so '/>' precedes '>'.
+        '</' name '>' { TOK(Id::EndTag); }
+        '<' name [^>\x00]* '/>' { TOK(Id::StartEndTag); }
+        '<' name [^>\x00]* '>' { TOK(Id::StartTag); }
+
+        // TODO: Fill in the rest of the HTM8 lexer.
+
+        *                      { TOK(Id::Other); }
+
+      */
+    }
+    break;
+  }
+  // Record the token end relative to lexer->line_, then advance the lexer.
+  tok->end_col = p - lexer->line_;
+  lexer->p_current = p;
+  return false;
+}
+
+
 // TODO:
 // - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
 //   - same as C++ raw string, I think
diff --git a/lazylex/html.py b/lazylex/html.py
index a80fd65e31..abdff063ce 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -218,7 +218,8 @@ def MakeLexer(rules):
     #   That's done in the tag lexer.
     # - We don't allow leading whitespace
     (r'</ (%s) >' % _NAME, Tok.EndTag),
-    # self-closing <br/>  comes before StarttTag
+    # self-closing <br/>  comes before StartTag
+    # could/should these be collapsed into one rule?
     (r'<  (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # end </a>
     (r'<  (%s) [^>]* >' % _NAME, Tok.StartTag),  # start <a>
 
@@ -441,7 +442,9 @@ def ValidTokenList(s, no_special_tags=False):
 #
 # Allow - for td-attrs
 
-_ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens
+# allow underscore/hyphen.  what about colons, like _NAME?
+# what about href=$foo ?
+_UNQUOTED_VALUE = r'[a-zA-Z0-9_\-]+'
 
 # TODO: we don't need to capture the tag name here?  That's done at the top
 # level
@@ -459,10 +462,9 @@ def ValidTokenList(s, no_special_tags=False):
   (?:
     " ([^>"]*) "        # double quoted value
   | (%s)                # Attribute value
-                        # TODO: relax this?  for href=$foo
   )
 )?             
-''' % (_NAME, _ATTR_VALUE), re.VERBOSE)
+''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
 
 TagName, AttrName, UnquotedValue, QuotedValue = range(4)