From 6f81c12869d247528d0198770cf9e8165fe9940b Mon Sep 17 00:00:00 2001
From: Andy C
Date: Tue, 14 Jan 2025 12:29:40 -0500
Subject: [PATCH] [doctools refactor] Use token IDs in the new htm8.asdl

Update benchmarks2 job to build _devbuild.gen.htm8_asdl
---
 devtools/refactor.sh |  15 +++++
 doctools/cmark.py    |   9 ++-
 doctools/help_gen.py |   5 +-
 doctools/oils_doc.py |  16 +++---
 doctools/ul_table.py |  66 +++++++++++-----------
 lazylex/html.py      | 132 +++++++++++++++++++++----------------------
 lazylex/html_test.py | 101 +++++++++++++++++----------------
 soil/worker.sh       |   3 +
 8 files changed, 187 insertions(+), 160 deletions(-)

diff --git a/devtools/refactor.sh b/devtools/refactor.sh
index 306ce40433..72133e9605 100755
--- a/devtools/refactor.sh
+++ b/devtools/refactor.sh
@@ -292,4 +292,19 @@ singleton-primitive() {
   echo
 }
 
+htm8() {
+  for prefix in Tok html; do
+    for name in \
+      Decl Comment CommentBegin Processing ProcessingBegin \
+      CData CDataBegin \
+      StartTag StartEndTag EndTag \
+      DecChar HexChar CharEntity \
+      RawData HtmlCData \
+      BadAmpersand BadGreaterThan BadLessThan \
+      Invalid EndOfStream; do
+      sed -i "s/$prefix.$name/h8_id.$name/g" */*.py
+    done
+  done
+}
+
 task-five "$@"
diff --git a/doctools/cmark.py b/doctools/cmark.py
index 864af41289..c6bc412680 100755
--- a/doctools/cmark.py
+++ b/doctools/cmark.py
@@ -1,6 +1,11 @@
 #!/usr/bin/env python2
-"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
-and insert anchors.
+"""Convert Markdown to HTML, with our enhancements
+
+- Parse the HTML
+- insert a TOC
+- hack - this is obsolete with ul-table?
+- Expand $xref links
+- Highlight code blocks
 
 I started from cmark-0.28.3/wrappers/wrapper.py.
 """
diff --git a/doctools/help_gen.py b/doctools/help_gen.py
index 623945be3a..1fa069d6b3 100755
--- a/doctools/help_gen.py
+++ b/doctools/help_gen.py
@@ -35,6 +35,7 @@
 import re
 import sys
 
+from _devbuild.gen.htm8_asdl import h8_id
 from doctools import html_lib
 from doctools.util import log
 from lazylex import html
@@ -309,7 +310,7 @@ def ExtractBody(s):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
             if tag_lexer.TagName() == 'body':
                 body_start_right = end_pos  # right after <body>
@@ -364,7 +365,7 @@ def HelpTopics(s):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
             #log('%r', tag_lexer.TagString())
             #log('%r', tag_lexer.TagName())
diff --git a/doctools/oils_doc.py b/doctools/oils_doc.py
index e5de856c34..e90bc1f232 100755
--- a/doctools/oils_doc.py
+++ b/doctools/oils_doc.py
@@ -11,6 +11,8 @@
 """
 from __future__ import print_function
 
+from _devbuild.gen.htm8_asdl import h8_id
+
 import cgi
 from typing import Iterator
 from typing import Any
@@ -121,7 +123,7 @@ def ExpandLinks(s):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
 
             if tag_lexer.TagName() == 'a':
@@ -343,7 +345,7 @@ def SimpleHighlightCode(s):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
 
             if tag_lexer.TagName() == 'pre':
@@ -403,7 +405,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
 
             if tag_lexer.TagName() == 'pre':
@@ -416,7 +418,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
                     break
                 tag_lexer.Reset(pos, end_pos)
 
-                if tok_id == html.StartTag and tag_lexer.TagName() == 'code':
+                if tok_id == h8_id.StartTag and tag_lexer.TagName() == 'code':
                     css_class = tag_lexer.GetAttrRaw('class')
                     code_start_pos = end_pos
@@ -514,7 +516,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
                     except StopIteration:
                         break
                     tag_lexer.Reset(slash_code_right, end_pos)
-                    assert tok_id == html.EndTag, tok_id
+                    assert tok_id == h8_id.EndTag, tok_id
                     assert (tag_lexer.TagName() == 'pre'
                             ), tag_lexer.TagName()
                     slash_pre_right = end_pos
@@ -559,7 +561,7 @@ def ExtractCode(s, f):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
             if tag_lexer.TagName() == 'pre':
                 pre_start_pos = pos
@@ -571,7 +573,7 @@ def ExtractCode(s, f):
                     break
                 tag_lexer.Reset(pos, end_pos)
 
-                if tok_id == html.StartTag and tag_lexer.TagName() == 'code':
+                if tok_id == h8_id.StartTag and tag_lexer.TagName() == 'code':
                     css_class = tag_lexer.GetAttrRaw('class')
 
                     # Skip code blocks that look like ```foo
diff --git a/doctools/ul_table.py b/doctools/ul_table.py
index 53a8332e76..6c76f70423 100755
--- a/doctools/ul_table.py
+++ b/doctools/ul_table.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python2
 """ul_table.py: Markdown Tables Without New Syntax."""
 
+from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
+
 try:
     from cStringIO import StringIO
 except ImportError:
@@ -32,7 +34,7 @@ def RemoveComments(s):
     pos = 0
 
     for tok_id, end_pos in html.ValidTokens(s):
-        if tok_id == html.Comment:
+        if tok_id == h8_id.Comment:
             value = s[pos:end_pos]
             # doc/release-index.md has <!-- REPLACE ... --> comments, etc.
             if 'REPLACE' not in value:
@@ -54,7 +56,7 @@ def __init__(self, lexer, tag_lexer):
         self.lexer = lexer
         self.tag_lexer = tag_lexer
 
-        self.tok_id = html.Invalid
+        self.tok_id = h8_id.Invalid
         self.start_pos = 0
         self.end_pos = 0
@@ -73,7 +75,7 @@ def _Next(self, comment_ok=False):
         # Should have called RemoveComments() beforehand.  That can still
         # leave some REPLACE comments
-        if not comment_ok and self.tok_id == html.Comment:
+        if not comment_ok and self.tok_id == h8_id.Comment:
             raise html.ParseError('Unexpected HTML comment')
 
         if 0:
@@ -85,9 +87,9 @@ def _EatRawData(self, regex):
         """
         Assert that we got text data matching a regex, and advance
         """
-        if self.tok_id != html.RawData:
+        if self.tok_id != h8_id.RawData:
             raise html.ParseError('Expected RawData, got %s' %
-                                  html.TokenName(self.tok_id))
+                                  h8_id_str(self.tok_id))
         actual = self._CurrentString()
         m = re.match(regex, actual)  # could compile this
         if m is None:
@@ -101,16 +103,16 @@ def _Eat(self, expected_id, expected_tag):
         Assert that we got a start or end tag, with the given name, and advance
 
         Args:
-          expected_id: html.StartTag or html.EndTag
+          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.
""" - assert expected_id in (html.StartTag, - html.EndTag), html.TokenName(expected_id) + assert expected_id in (h8_id.StartTag, + h8_id.EndTag), h8_id_str(expected_id) if self.tok_id != expected_id: raise html.ParseError( 'Expected token %s, got %s' % - (html.TokenName(expected_id), html.TokenName(self.tok_id))) + (h8_id_str(expected_id), h8_id_str(self.tok_id))) self.tag_lexer.Reset(self.start_pos, self.end_pos) tag_name = self.tag_lexer.TagName() if expected_tag != tag_name: @@ -124,7 +126,7 @@ def _WhitespaceOk(self): """ Optional whitespace """ - if (self.tok_id == html.RawData and + if (self.tok_id == h8_id.RawData and _WHITESPACE_RE.match(self.lexer.s, self.start_pos)): self._Next() @@ -140,19 +142,19 @@ def FindUlTable(self): # Find first table while True: self._Next(comment_ok=True) - if self.tok_id == html.EndOfStream: + if self.tok_id == h8_id.EndOfStream: return -1 tag_lexer.Reset(self.start_pos, self.end_pos) - if (self.tok_id == html.StartTag and + if (self.tok_id == h8_id.StartTag and tag_lexer.TagName() == 'table'): while True: self._Next(comment_ok=True) - if self.tok_id != html.RawData: + if self.tok_id != h8_id.RawData: break tag_lexer.Reset(self.start_pos, self.end_pos) - if (self.tok_id == html.StartTag and + if (self.tok_id == h8_id.StartTag and tag_lexer.TagName() == 'ul'): return self.start_pos return -1 @@ -186,14 +188,14 @@ def _ListItem(self): """ self._WhitespaceOk() - if self.tok_id != html.StartTag: + if self.tok_id != h8_id.StartTag: return None, None inner_html = None td_attrs = None # Can we also have col-attrs? td_attrs_span = None - self._Eat(html.StartTag, 'li') + self._Eat(h8_id.StartTag, 'li') left = self.start_pos @@ -202,7 +204,7 @@ def _ListItem(self): # because cells can have bulleted lists balance = 0 while True: - if self.tok_id == html.StartEndTag: + if self.tok_id == h8_id.StartEndTag: self.tag_lexer.Reset(self.start_pos, self.end_pos) tag_name = self.tag_lexer.TagName() # TODO: remove td-attrs backward compat @@ -211,12 +213,12 @@ def _ListItem(self): td_attrs = self.tag_lexer.AllAttrsRaw() #log('CELL ATTRS %r', self._CurrentString()) - elif self.tok_id == html.StartTag: + elif self.tok_id == h8_id.StartTag: self.tag_lexer.Reset(self.start_pos, self.end_pos) if self.tag_lexer.TagName() == 'li': balance += 1 - elif self.tok_id == html.EndTag: + elif self.tok_id == h8_id.EndTag: self.tag_lexer.Reset(self.start_pos, self.end_pos) if self.tag_lexer.TagName() == 'li': balance -= 1 @@ -236,7 +238,7 @@ def _ListItem(self): inner_html = s[left:right] #log('RAW inner html %r', inner_html) - #self._Eat(html.EndTag, 'li') + #self._Eat(h8_id.EndTag, 'li') self._Next() return td_attrs, inner_html @@ -284,7 +286,7 @@ def _ParseTHead(self): cells = [] self._WhitespaceOk() - self._Eat(html.StartTag, 'li') + self._Eat(h8_id.StartTag, 'li') # In CommonMark, r'thead\n' is enough, because it strips trailing # whitespace. 
        # whitespace.  I'm not sure if other Markdown processors do that, so
@@ -292,7 +294,7 @@ def _ParseTHead(self):
         self._EatRawData(r'thead\s+')
 
         # This is the row data
-        self._Eat(html.StartTag, 'ul')
+        self._Eat(h8_id.StartTag, 'ul')
 
         while True:
             td_attrs, inner_html = self._ListItem()
@@ -301,10 +303,10 @@ def _ParseTHead(self):
             cells.append((td_attrs, inner_html))
         self._WhitespaceOk()
 
-        self._Eat(html.EndTag, 'ul')
+        self._Eat(h8_id.EndTag, 'ul')
         self._WhitespaceOk()
-        self._Eat(html.EndTag, 'li')
+        self._Eat(h8_id.EndTag, 'li')
 
         #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
         return cells
@@ -334,15 +336,15 @@ def _ParseTr(self):
         self._WhitespaceOk()
 
         # Could be a </ul>
-        if self.tok_id != html.StartTag:
+        if self.tok_id != h8_id.StartTag:
             return None, None
 
-        self._Eat(html.StartTag, 'li')
+        self._Eat(h8_id.StartTag, 'li')
 
         self._EatRawData(r'tr\s*')
 
         tr_attrs = None
-        if self.tok_id == html.StartEndTag:
+        if self.tok_id == h8_id.StartEndTag:
             self.tag_lexer.Reset(self.start_pos, self.end_pos)
             tag_name = self.tag_lexer.TagName()
             if tag_name != 'row-attrs':
@@ -352,7 +354,7 @@ def _ParseTr(self):
             self._WhitespaceOk()
 
         # This is the row data
-        self._Eat(html.StartTag, 'ul')
+        self._Eat(h8_id.StartTag, 'ul')
 
         while True:
             td_attrs, inner_html = self._ListItem()
@@ -363,10 +365,10 @@ def _ParseTr(self):
 
         self._WhitespaceOk()
 
-        self._Eat(html.EndTag, 'ul')
+        self._Eat(h8_id.EndTag, 'ul')
         self._WhitespaceOk()
-        self._Eat(html.EndTag, 'li')
+        self._Eat(h8_id.EndTag, 'li')
 
         #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
         return tr_attrs, cells
@@ -394,7 +396,7 @@ def ParseTable(self):
         table = {'tr': []}
 
         ul_start = self.start_pos
-        self._Eat(html.StartTag, 'ul')
+        self._Eat(h8_id.StartTag, 'ul')
 
         # Look ahead 2 or 3 tokens:
         if self.lexer.LookAhead(r'\s*<li>thead\s+'):
@@ -416,7 +418,7 @@ def ParseTable(self):
             #log('___ TR %s', tr)
             table['tr'].append((tr_attrs, tr))
 
-        self._Eat(html.EndTag, 'ul')
+        self._Eat(h8_id.EndTag, 'ul')
 
         self._WhitespaceOk()
 
diff --git a/lazylex/html.py b/lazylex/html.py
index 5704a22053..cb93c8201d 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -14,6 +14,8 @@
 """
 from __future__ import print_function
+
+from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
 from typing import Iterator
 from typing import Union
 from typing import Any
@@ -41,7 +43,7 @@ class LexError(Exception):
     """
     Examples of lex errors:
 
-    - Tok.Invalid, like <> or &&
+    - h8_id.Invalid, like <> or &&
     - Unclosed <!--
     """

-#(r'', Tok.Comment),
+#(r'', h8_id.Comment),
 # Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
-#(r'', Tok.Comment),
-#(r'', Tok.Comment),
+#(r'', h8_id.Comment),
+#(r'', h8_id.Comment),
 
 HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
@@ -306,7 +303,7 @@ def _Peek(self):
         Note: not using _Peek() now
         """
         if self.pos == self.right_pos:
-            return Tok.EndOfStream, self.pos
+            return h8_id.EndOfStream, self.pos
 
         assert self.pos < self.right_pos, self.pos
 
@@ -322,7 +319,7 @@ def _Peek(self):
                 raise LexError(self.s, self.pos)
 
             self.search_state = None
             # beginning
-            return Tok.HtmlCData, pos
+            return h8_id.HtmlCData, pos
 
         # Find the first match.
         # Note: frontend/match.py uses _LongestMatch(), which is different!
@@ -331,7 +328,7 @@ def _Peek(self):
         for pat, tok_id in HTM8_LEX_COMPILED:
             m = pat.match(self.s, self.pos)
             if m:
-                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
+                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                     self.tag_pos_left = m.start(1)
                     self.tag_pos_right = m.end(1)
                 else:
@@ -339,28 +336,28 @@ def _Peek(self):
                     self.tag_pos_left = -1
                     self.tag_pos_right = -1
 
-                if tok_id == Tok.CommentBegin:
+                if tok_id == h8_id.CommentBegin:
                     pos = self.s.find('-->', self.pos)
                     if pos == -1:
                         # unterminated <!--
                         raise LexError(self.s, self.pos)
-                    return Tok.Comment, pos + 3  # -->
+                    return h8_id.Comment, pos + 3  # -->
 
-                if tok_id == Tok.ProcessingBegin:
+                if tok_id == h8_id.ProcessingBegin:
                     pos = self.s.find('?>', self.pos)
                     if pos == -1:
                         # unterminated <?
                         raise LexError(self.s, self.pos)
-                    return Tok.Processing, pos + 2  # ?>
+                    return h8_id.Processing, pos + 2  # ?>
 
-                if tok_id == Tok.CDataBegin:
+                if tok_id == h8_id.CDataBegin:
                     pos = self.s.find(']]>', self.pos)
                     if pos == -1:
                         # unterminated <![CDATA[
                         raise LexError(self.s, self.pos)
-                    return Tok.CData, pos + 3  # ]]>
+                    return h8_id.CData, pos + 3  # ]]>
 
-                if tok_id == Tok.StartTag:
+                if tok_id == h8_id.StartTag:
                     # TODO: reduce allocations
                     if (self.TagNameEquals('script') or
                             self.TagNameEquals('style')):
@@ -369,7 +366,7 @@ def _Peek(self):
 
                 return tok_id, m.end()
         else:
-            raise AssertionError('Tok.Invalid rule should have matched')
+            raise AssertionError('h8_id.Invalid rule should have matched')
 
     def TagNameEquals(self, expected):
         # type: (str) -> bool
@@ -427,7 +424,7 @@ def _Tokens(s, left_pos, right_pos):
     while True:
         tok_id, pos = lx.Read()
         yield tok_id, pos
-        if tok_id == Tok.EndOfStream:
+        if tok_id == h8_id.EndOfStream:
             break
 
@@ -441,7 +438,7 @@ def ValidTokens(s, left_pos=0, right_pos=-1):
     """
     pos = left_pos
     for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
-        if tok_id == Tok.Invalid:
+        if tok_id == h8_id.Invalid:
             raise LexError(s, pos)
         yield tok_id, end_pos
         pos = end_pos
@@ -457,9 +454,9 @@ def ValidTokenList(s, no_special_tags=False):
     while True:
         tok_id, end_pos = lx.Read()
         tokens.append((tok_id, end_pos))
-        if tok_id == Tok.EndOfStream:
+        if tok_id == h8_id.EndOfStream:
             break
-        if tok_id == Tok.Invalid:
+        if tok_id == h8_id.Invalid:
             raise LexError(s, start_pos)
         start_pos = end_pos
     return tokens
@@ -572,7 +569,7 @@ def GetSpanForAttrValue(self, attr_name):
                 # The value should come next
                tok_id, start, end = next(events)
                 assert tok_id in (QuotedValue, UnquotedValue,
-                                  MissingValue), TokenName(tok_id)
+                                  MissingValue), h8_id_str(tok_id)
                 val = start, end
                 break
 
@@ -606,7 +603,7 @@ def AllAttrsRawSlice(self):
                 # The value should come next
                 tok_id, start, end = next(events)
                 assert tok_id in (QuotedValue, UnquotedValue,
-                                  MissingValue), TokenName(tok_id)
+                                  MissingValue), h8_id_str(tok_id)
                 # Note: quoted values may have &amp;
                 # We would need ANOTHER lexer to unescape them, but we
                 # don't need that for ul-table
@@ -691,8 +688,8 @@ def Tokens(self):
 # Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
 # &#x2122; are not allowed.  We could relax that?
 ATTR_VALUE_LEXER = CHAR_LEX + [
-    (r'[^>&\x00]+', Tok.RawData),
-    (r'.', Tok.Invalid),
+    (r'[^>&\x00]+', h8_id.RawData),
+    (r'.', h8_id.Invalid),
 ]
 
 ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)
@@ -725,7 +722,7 @@ def NumTokens(self):
         num_tokens = 0
         pos = self.start_pos
         for tok_id, end_pos in self.Tokens():
-            if tok_id == Tok.Invalid:
+            if tok_id == h8_id.Invalid:
                 raise LexError(self.s, pos)
             pos = end_pos
             #log('pos %d', pos)
@@ -751,7 +748,7 @@ def Tokens(self):
                 pos = end_pos
                 break
         else:
-            raise AssertionError('Tok.Invalid rule should have matched')
+            raise AssertionError('h8_id.Invalid rule should have matched')
 
 
 def ReadUntilStartTag(it, tag_lexer, tag_name):
@@ -768,7 +765,7 @@ def ReadUntilStartTag(it, tag_lexer, tag_name):
         except StopIteration:
             break
         tag_lexer.Reset(pos, end_pos)
-        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
+        if tok_id == h8_id.StartTag and tag_lexer.TagName() == tag_name:
             return pos, end_pos
 
         pos = end_pos
@@ -791,7 +788,7 @@ def ReadUntilEndTag(it, tag_lexer, tag_name):
         except StopIteration:
             break
         tag_lexer.Reset(pos, end_pos)
-        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
+        if tok_id == h8_id.EndTag and tag_lexer.TagName() == tag_name:
             return pos, end_pos
 
         pos = end_pos
@@ -828,12 +825,12 @@ def ToText(s, left_pos=0, right_pos=-1):
 
     pos = left_pos
     for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
-        if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan,
-                      Tok.BadLessThan):
+        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
+                      h8_id.BadLessThan):
             out.SkipTo(pos)
             out.PrintUntil(end_pos)
 
-        elif tok_id == Tok.CharEntity:  # &amp;
+        elif tok_id == h8_id.CharEntity:  # &amp;
 
             entity = s[pos + 1:end_pos - 1]
 
@@ -842,10 +839,10 @@ def ToText(s, left_pos=0, right_pos=-1):
             out.SkipTo(end_pos)
 
         # Not handling these yet
-        elif tok_id == Tok.HexChar:
+        elif tok_id == h8_id.HexChar:
             raise AssertionError('Hex Char %r' % s[pos:pos + 20])
 
-        elif tok_id == Tok.DecChar:
+        elif tok_id == h8_id.DecChar:
             raise AssertionError('Dec Char %r' % s[pos:pos + 20])
 
         else:
@@ -895,16 +892,16 @@ def Validate(contents, flags, counters):
     tag_stack = []
     while True:
         tok_id, end_pos = lx.Read()
-        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])
+        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])
 
-        if tok_id == Tok.Invalid:
+        if tok_id == h8_id.Invalid:
             raise LexError(contents, start_pos)
-        if tok_id == Tok.EndOfStream:
+        if tok_id == h8_id.EndOfStream:
             break
 
         tokens.append((tok_id, end_pos))
 
-        if tok_id == Tok.StartEndTag:
+        if tok_id == h8_id.StartEndTag:
             counters.num_start_end_tags += 1
 
             tag_lexer.Reset(start_pos, end_pos)
@@ -916,7 +913,7 @@ def Validate(contents, flags, counters):
 
             counters.debug_attrs.extend(all_attrs)
 
-        elif tok_id == Tok.StartTag:
+        elif tok_id == h8_id.StartTag:
             counters.num_start_tags += 1
 
             tag_lexer.Reset(start_pos, end_pos)
@@ -939,7 +936,7 @@ def Validate(contents, flags, counters):
             counters.max_tag_stack = max(counters.max_tag_stack,
                                          len(tag_stack))
-        elif tok_id == Tok.EndTag:
+        elif tok_id == h8_id.EndTag:
             if flags & BALANCED_TAGS:
                 try:
                     expected = tag_stack.pop()
@@ -991,14 +988,15 @@ def ToXml(htm8_str):
     while True:
         tok_id, end_pos = lx.Read()
-        if tok_id == Tok.Invalid:
+        if tok_id == h8_id.Invalid:
             raise LexError(htm8_str, pos)
-        if tok_id == Tok.EndOfStream:
+        if tok_id == h8_id.EndOfStream:
             break
 
-        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
+        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
+                      h8_id.DecChar):
             out.PrintUntil(end_pos)
-        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
+        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
             tag_lexer.Reset(pos, end_pos)
             # TODO: reduce allocations here
             all_attrs = tag_lexer.AllAttrsRawSlice()
@@ -1014,16 +1012,16 @@ def ToXml(htm8_str):
             # Missing: add ="", so missing becomes missing=""
             tag_name = lx.CanonicalTagName()
-            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
+            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                 # TODO: instead of closing >, print />
                 pass
 
-        elif tok_id == Tok.BadAmpersand:
+        elif tok_id == h8_id.BadAmpersand:
             #out.SkipTo(pos)
             out.Print('&amp;')
             out.SkipTo(end_pos)
 
-        elif tok_id == Tok.BadGreaterThan:
+        elif tok_id == h8_id.BadGreaterThan:
             #out.SkipTo(pos)
             out.Print('&gt;')
             out.SkipTo(end_pos)
@@ -1060,13 +1058,13 @@ def main(argv):
         start_pos = 0
         while True:
             tok_id, end_pos = lx.Read()
-            if tok_id == Tok.Invalid:
+            if tok_id == h8_id.Invalid:
                 raise LexError(contents, start_pos)
-            if tok_id == Tok.EndOfStream:
+            if tok_id == h8_id.EndOfStream:
                 break
 
             frag = contents[start_pos:end_pos]
-            log('%d %s %r', end_pos, TokenName(tok_id), frag)
+            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
             start_pos = end_pos
 
     return 0
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index ebe0677b17..f91cad7bf1 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -3,9 +3,10 @@
 
 import unittest
 
+from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
 from lazylex import html  # module under test
 log = html.log
-from typing import List
-from typing import Tuple
+
+from typing import List, Tuple
 
 log = html.log
@@ -184,7 +185,7 @@ def Lex(h, no_special_tags=False):
     start_pos = 0
     for tok_id, end_pos in tokens:
         frag = h[start_pos:end_pos]
-        log('%d %s %r', end_pos, html.TokenName(tok_id), frag)
+        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
         start_pos = end_pos
     return tokens
 
@@ -219,10 +220,10 @@ def testCommentParse2(self):
 
         self.assertEqual(
             [
-                (Tok.RawData, 12),
-                (Tok.Comment, 50),  #
-                (Tok.StartEndTag, 55),
-                (Tok.EndOfStream, 55),
+                (h8_id.RawData, 12),
+                (h8_id.Comment, 50),  #
+                (h8_id.StartEndTag, 55),
+                (h8_id.EndOfStream, 55),
             ],
             tokens)
 
@@ -235,9 +236,9 @@ def testProcessingInstruction(self):
 
         self.assertEqual(
             [
-                (Tok.RawData, 3),
-                (Tok.Processing, 12),  #
-                (Tok.EndOfStream, 12),
+                (h8_id.RawData, 3),
+                (h8_id.Processing, 12),  #
+                (h8_id.EndOfStream, 12),
             ],
             tokens)
 
@@ -251,12 +252,12 @@ def testScriptStyle(self):
         tokens = Lex(h)
 
         expected = [
-            (Tok.RawData, 12),
-            (Tok.StartTag, 27),  #
-            (Tok.RawData, 96),  # \n
-            (Tok.EndOfStream, 96),  # \n
+            (h8_id.RawData, 12),
+            (h8_id.StartTag, 27),  #
+            (h8_id.RawData, 96),  # \n
+            (h8_id.EndOfStream, 96),  # \n
         ]
         self.assertEqual(expected, tokens)
 
@@ -273,13 +274,13 @@ def testScriptStyleXml(self):
 
         self.assertEqual(
             [
-                (Tok.RawData, 3),
-                (Tok.StartTag, 18),  #
-                (Tok.RawData, 24),  # \n
-                (Tok.EndTag, 33),  # \n
-                (Tok.EndOfStream, 33),  # \n
+                (h8_id.RawData, 3),
+                (h8_id.StartTag, 18),  #
+                (h8_id.RawData, 24),  # \n
+                (h8_id.EndTag, 33),  # \n
+                (h8_id.EndOfStream, 33),  # \n
             ],
             tokens)
 
@@ -293,10 +294,10 @@ def testCData(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.StartTag, 9),
-            (Tok.CData, 61),
-            (Tok.EndTag, 71),
-            (Tok.EndOfStream, 71),
+            (h8_id.StartTag, 9),
+            (h8_id.CData, 61),
+            (h8_id.EndTag, 71),
+            (h8_id.EndOfStream, 71),
         ], tokens)
 
     def testEntity(self):
@@ -310,11 +311,11 @@ def testEntity(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.CharEntity, 6),
-            (Tok.RawData, 8),
-            (Tok.CharEntity, 14),
-            (Tok.RawData, 15),
-            (Tok.EndOfStream, 15),
+            (h8_id.CharEntity, 6),
+            (h8_id.RawData, 8),
+            (h8_id.CharEntity, 14),
+            (h8_id.RawData, 15),
+            (h8_id.EndOfStream, 15),
         ], tokens)
 
     def testStartTag(self):
@@ -325,10 +326,10 @@ def testStartTag(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.StartTag, 3),
-            (Tok.RawData, 5),
-            (Tok.EndTag, 9),
-            (Tok.EndOfStream, 9),
+            (h8_id.StartTag, 3),
+            (h8_id.RawData, 5),
+            (h8_id.EndTag, 9),
+            (h8_id.EndOfStream, 9),
         ], tokens)
 
         # Make sure we don't consume too much
@@ -337,12 +338,12 @@ def testStartTag(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.StartTag, 3),
-            (Tok.StartTag, 11),
-            (Tok.RawData, 14),
-            (Tok.EndTag, 23),
-            (Tok.EndTag, 27),
-            (Tok.EndOfStream, 27),
+            (h8_id.StartTag, 3),
+            (h8_id.StartTag, 11),
+            (h8_id.RawData, 14),
+            (h8_id.EndTag, 23),
+            (h8_id.EndTag, 27),
+            (h8_id.EndOfStream, 27),
        ], tokens)
 
         return
@@ -355,10 +356,10 @@ def testStartTag(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.RawData, 9),
-            (Tok.StartTag, 24),
-            (Tok.RawData, 9),
-            (Tok.EndOfStream, 9),
+            (h8_id.RawData, 9),
+            (h8_id.StartTag, 24),
+            (h8_id.RawData, 9),
+            (h8_id.EndOfStream, 9),
         ], tokens)
 
     def testBad(self):
@@ -369,16 +370,16 @@ def testBad(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.BadAmpersand, 1),
-            (Tok.EndOfStream, 1),
+            (h8_id.BadAmpersand, 1),
+            (h8_id.EndOfStream, 1),
         ], tokens)
 
         h = '>'
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.BadGreaterThan, 1),
-            (Tok.EndOfStream, 1),
+            (h8_id.BadGreaterThan, 1),
+            (h8_id.EndOfStream, 1),
         ], tokens)
 
     def testInvalid(self):
diff --git a/soil/worker.sh b/soil/worker.sh
index 802e24fea5..75960d2f70 100755
--- a/soil/worker.sh
+++ b/soil/worker.sh
@@ -232,9 +232,12 @@
 os-info           soil/diagnose.sh             os-info          -
 dump-env          soil/diagnose.sh             dump-env         -
 wait-for-tarball  soil/wait.sh                 for-cpp-tarball  -
 test-tar          devtools/release-native.sh   test-tar         -
+build-minimal     build/py.sh                  minimal          -
 uftrace           benchmarks/uftrace.sh        soil-run         _tmp/uftrace/index.html
 gc-cachegrind     benchmarks/gc-cachegrind.sh  soil-run         _tmp/gc-cachegrind/index.html
 EOF
+  # 2025-01: added build-minimal because benchmarks use cmark.py, which uses
+  # htm8_asdl to ExpandLinks() and so forth
 }
 
 cpp-spec-tasks() {
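
After this rename, doctools callers compare token IDs against the generated
h8_id constants and render their names with h8_id_str(), instead of the old
html.TokenName() / Tok.* pair. A minimal sketch of the idiom, modeled on
RemoveComments() in doctools/ul_table.py; it assumes only what the diff shows
(html.ValidTokens() yields (tok_id, end_pos) pairs, and _devbuild.gen.htm8_asdl
has been built, e.g. by the new build-minimal soil task):

    from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
    from lazylex import html

    def DumpComments(s):
        # type: (str) -> None
        """Print the span and text of each HTML comment in an HTM8 string."""
        pos = 0
        for tok_id, end_pos in html.ValidTokens(s):
            if tok_id == h8_id.Comment:
                # h8_id_str() renders the integer token ID as a readable name
                print('%s %d-%d %r' % (h8_id_str(tok_id), pos, end_pos,
                                       s[pos:end_pos]))
            pos = end_pos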
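
The full set of h8_id variants can be read off the rename loop in
devtools/refactor.sh above. A quick self-check along those lines, assuming
only that the generated module exposes one constant per name (the integer
values themselves are a detail of the ASDL code generator):

    from _devbuild.gen.htm8_asdl import h8_id

    # Names copied from the devtools/refactor.sh rename loop in this patch
    _NAMES = [
        'Decl', 'Comment', 'CommentBegin', 'Processing', 'ProcessingBegin',
        'CData', 'CDataBegin',
        'StartTag', 'StartEndTag', 'EndTag',
        'DecChar', 'HexChar', 'CharEntity',
        'RawData', 'HtmlCData',
        'BadAmpersand', 'BadGreaterThan', 'BadLessThan',
        'Invalid', 'EndOfStream',
    ]

    for name in _NAMES:
        assert hasattr(h8_id, name), name  # every renamed constant must exist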