From 31ae20f723efdb2715c2413c373dac799bd44036 Mon Sep 17 00:00:00 2001
From: Andy C <andy@oilshell.org>
Date: Tue, 14 Jan 2025 13:53:40 -0500
Subject: [PATCH] [lazylex refactor] Move another enum to htm8.asdl

Also fix more type errors in doctools/
---
 data_lang/htm8.asdl   |  8 ++++++
 doctools/oils_doc.py  |  1 +
 doctools/split_doc.py |  3 ++-
 doctools/src_tree.py  |  4 +++
 lazylex/html.py       | 57 ++++++++++++++-----------------------------
 lazylex/html_test.py  | 12 ---------
 6 files changed, 33 insertions(+), 52 deletions(-)

diff --git a/data_lang/htm8.asdl b/data_lang/htm8.asdl
index 70bdbeae0..1b8c9045c 100644
--- a/data_lang/htm8.asdl
+++ b/data_lang/htm8.asdl
@@ -21,4 +21,12 @@ module htm8
   | Invalid
   | EndOfStream
   generate [no_namespace_suffix]  # cosmetic: call it h8_id, not h8_id_e
+
+
+  h8_tag_id =  # IDs of the tokens yielded by TagLexer.Tokens() in lazylex/html.py
+    TagName
+  | AttrName
+  | UnquotedValue | QuotedValue | MissingValue 
+  generate [no_namespace_suffix]  # cosmetic: call it h8_tag_id, not h8_tag_id_e
 }
+
diff --git a/doctools/oils_doc.py b/doctools/oils_doc.py
index e90bc1f23..ef97313fb 100755
--- a/doctools/oils_doc.py
+++ b/doctools/oils_doc.py
@@ -38,6 +38,7 @@
 class _Abbrev(object):
 
     def __init__(self, fmt):
+        # type: (str) -> None
         self.fmt = fmt
 
     def __call__(self, value):
diff --git a/doctools/split_doc.py b/doctools/split_doc.py
index b0f3865cb..2fbd7a373 100755
--- a/doctools/split_doc.py
+++ b/doctools/split_doc.py
@@ -6,7 +6,7 @@
 import optparse
 import re
 import sys
-from typing import Dict, IO
+from typing import List, Dict, IO
 
 DATE_RE = re.compile(r'(\d\d\d\d) / (\d\d) / (\d\d)', re.VERBOSE)
 
@@ -126,6 +126,7 @@ def Options():
 
 
 def main(argv):
+    # type: (List[str]) -> None
     o = Options()
     opts, argv = o.parse_args(argv)
 
diff --git a/doctools/src_tree.py b/doctools/src_tree.py
index 1440a77d3..d2d742b62 100755
--- a/doctools/src_tree.py
+++ b/doctools/src_tree.py
@@ -22,6 +22,8 @@
 import shutil
 import sys
 
+from vendor.typing import IO
+
 from doctools.util import log
 from doctools import html_head
 from test import wild_report
@@ -259,6 +261,7 @@ class DirNode:
     """
 
     def __init__(self):
+        # type: () -> None
         self.files = {}  # filename -> attrs dict
         self.dirs = {}  # subdir name -> DirNode object
 
@@ -378,6 +381,7 @@ def WriteDirsHtml(node, out_dir, rel_path='', base_url=''):
 
 
 def ReadNetString(in_f):
+    # type: (IO[str]) -> str
 
     digits = []
     for i in xrange(10):  # up to 10 digits
diff --git a/lazylex/html.py b/lazylex/html.py
index 3710d2f7f..40db7feec 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -15,7 +15,8 @@
 """
 from __future__ import print_function
 
-from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
+from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, h8_tag_id,
+                                     h8_tag_id_t, h8_tag_id_str)
 from typing import Dict, Iterator, Any, IO
 
 try:
@@ -135,28 +136,6 @@ def Print(self, s):
         self.f.write(s)
 
 
-# HTML Tokens
-# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
-TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split(
-)
-
-
-class Tok(object):
-    """
-    Avoid lint errors by using these aliases
-    """
-    pass
-
-
-TOKEN_NAMES = [None] * len(TOKENS)  # type: List[str]
-
-this_module = sys.modules[__name__]
-for i, tok_str in enumerate(TOKENS):
-    setattr(this_module, tok_str, i)
-    setattr(Tok, tok_str, i)
-    TOKEN_NAMES[i] = tok_str
-
-
 def MakeLexer(rules):
     return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]
 
@@ -424,7 +403,7 @@ def _Tokens(s, left_pos, right_pos):
 
 
 def ValidTokens(s, left_pos=0, right_pos=-1):
-    # type: (str, int, int) -> Iterator[Tuple[int, int]]
+    # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
     """Wrapper around _Tokens to prevent callers from having to handle Invalid.
 
     I'm not combining the two functions because I might want to do a
@@ -509,8 +488,6 @@ def ValidTokenList(s, no_special_tags=False):
 )?             
 ''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)
 
-TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)
-
 
 class TagLexer(object):
     """
@@ -560,13 +537,14 @@ def GetSpanForAttrValue(self, attr_name):
         try:
             while True:
                 tok_id, start, end = next(events)
-                if tok_id == AttrName:
+                if tok_id == h8_tag_id.AttrName:
                     name = self.s[start:end]
                     if name == attr_name:
                         # The value should come next
                         tok_id, start, end = next(events)
-                        assert tok_id in (QuotedValue, UnquotedValue,
-                                          MissingValue), h8_id_str(tok_id)
+                        assert tok_id in (
+                            h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
+                            h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                         val = start, end
                         break
 
@@ -594,13 +572,14 @@ def AllAttrsRawSlice(self):
         try:
             while True:
                 tok_id, start, end = next(events)
-                if tok_id == AttrName:
+                if tok_id == h8_tag_id.AttrName:
                     name = self.s[start:end]
 
                     # The value should come next
                     tok_id, start, end = next(events)
-                    assert tok_id in (QuotedValue, UnquotedValue,
-                                      MissingValue), h8_id_str(tok_id)
+                    assert tok_id in (
+                        h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
+                        h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
                     # Note: quoted values may have &amp;
                     # We would need ANOTHER lexer to unescape them, but we
                     # don't need that for ul-table
@@ -624,7 +603,7 @@ def AllAttrsRaw(self):
         return pairs
 
     def Tokens(self):
-        # type: () -> Iterator[Tuple[int, int, int]]
+        # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
         """
         Yields a sequence of tokens: Tag (AttrName AttrValue?)*
 
@@ -637,7 +616,7 @@ def Tokens(self):
         if not m:
             raise RuntimeError("Couldn't find HTML tag in %r" %
                                self.TagString())
-        yield TagName, m.start(1), m.end(1)
+        yield h8_tag_id.TagName, m.start(1), m.end(1)
 
         pos = m.end(0)
         #log('POS %d', pos)
@@ -650,21 +629,21 @@ def Tokens(self):
                 break
             #log('AttrName %r', m.group(1))
 
-            yield AttrName, m.start(1), m.end(1)
+            yield h8_tag_id.AttrName, m.start(1), m.end(1)
 
             #log('m.groups() %r', m.groups())
             if m.group(2) is not None:
                 # double quoted
-                yield QuotedValue, m.start(2), m.end(2)
+                yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
             elif m.group(3) is not None:
                 # single quoted - TODO: could have different token types
-                yield QuotedValue, m.start(3), m.end(3)
+                yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
             elif m.group(4) is not None:
-                yield UnquotedValue, m.start(4), m.end(4)
+                yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
             else:
                 # <button disabled>
                 end = m.end(0)
-                yield MissingValue, end, end
+                yield h8_tag_id.MissingValue, end, end
 
             # Skip past the "
             pos = m.end(0)
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index ca89e9f48..cad73dd5d 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -211,8 +211,6 @@ def testCommentParse(self):
 
     def testCommentParse2(self):
         # type: () -> None
-
-        Tok = html.Tok
         h = '''
         hi <!-- line 1
                 line 2 --><br/>'''
@@ -230,7 +228,6 @@ def testCommentParse2(self):
     def testProcessingInstruction(self):
         # type: () -> None
         # <?xml ?> header
-        Tok = html.Tok
         h = 'hi <? err ?>'
         tokens = Lex(h)
 
@@ -244,7 +241,6 @@ def testProcessingInstruction(self):
 
     def testScriptStyle(self):
         # type: () -> None
-        Tok = html.Tok
         h = '''
         hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
         </script>
@@ -267,7 +263,6 @@ def testScriptStyle(self):
 
     def testScriptStyleXml(self):
         # type: () -> None
-        Tok = html.Tok
         h = 'hi <script src=""> &lt; </script>'
         # XML mode
         tokens = Lex(h, no_special_tags=True)
@@ -286,7 +281,6 @@ def testScriptStyleXml(self):
 
     def testCData(self):
         # type: () -> None
-        Tok = html.Tok
 
         # from
         # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
@@ -302,7 +296,6 @@ def testCData(self):
 
     def testEntity(self):
         # type: () -> None
-        Tok = html.Tok
 
         # from
         # /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
@@ -320,7 +313,6 @@ def testEntity(self):
 
     def testStartTag(self):
         # type: () -> None
-        Tok = html.Tok
 
         h = '<a>hi</a>'
         tokens = Lex(h)
@@ -364,8 +356,6 @@ def testStartTag(self):
 
     def testBad(self):
         # type: () -> None
-        Tok = html.Tok
-
         h = '&'
         tokens = Lex(h)
 
@@ -384,8 +374,6 @@ def testBad(self):
 
     def testInvalid(self):
         # type: () -> None
-        Tok = html.Tok
-
         for s in INVALID_LEX:
             try:
                 tokens = html.ValidTokenList(s)