From 31ae20f723efdb2715c2413c373dac799bd44036 Mon Sep 17 00:00:00 2001 From: Andy C Date: Tue, 14 Jan 2025 13:53:40 -0500 Subject: [PATCH] [lazylex refactor] Move another enum to htm8.asdl Also fix more type errors in doctools/ --- data_lang/htm8.asdl | 8 ++++++ doctools/oils_doc.py | 1 + doctools/split_doc.py | 3 ++- doctools/src_tree.py | 4 +++ lazylex/html.py | 57 ++++++++++++++----------------------------- lazylex/html_test.py | 12 --------- 6 files changed, 33 insertions(+), 52 deletions(-) diff --git a/data_lang/htm8.asdl b/data_lang/htm8.asdl index 70bdbeae0..1b8c9045c 100644 --- a/data_lang/htm8.asdl +++ b/data_lang/htm8.asdl @@ -21,4 +21,12 @@ module htm8 | Invalid | EndOfStream generate [no_namespace_suffix] # cosmetic: call it h8_id, not h8_id_e + + + h8_tag_id = + TagName + | AttrName + | UnquotedValue | QuotedValue | MissingValue + generate [no_namespace_suffix] } + diff --git a/doctools/oils_doc.py b/doctools/oils_doc.py index e90bc1f23..ef97313fb 100755 --- a/doctools/oils_doc.py +++ b/doctools/oils_doc.py @@ -38,6 +38,7 @@ class _Abbrev(object): def __init__(self, fmt): + # type: (str) -> None self.fmt = fmt def __call__(self, value): diff --git a/doctools/split_doc.py b/doctools/split_doc.py index b0f3865cb..2fbd7a373 100755 --- a/doctools/split_doc.py +++ b/doctools/split_doc.py @@ -6,7 +6,7 @@ import optparse import re import sys -from typing import Dict, IO +from typing import List, Dict, IO DATE_RE = re.compile(r'(\d\d\d\d) / (\d\d) / (\d\d)', re.VERBOSE) @@ -126,6 +126,7 @@ def Options(): def main(argv): + # type: (List[str]) -> None o = Options() opts, argv = o.parse_args(argv) diff --git a/doctools/src_tree.py b/doctools/src_tree.py index 1440a77d3..d2d742b62 100755 --- a/doctools/src_tree.py +++ b/doctools/src_tree.py @@ -22,6 +22,8 @@ import shutil import sys +from vendor.typing import IO + from doctools.util import log from doctools import html_head from test import wild_report @@ -259,6 +261,7 @@ class DirNode: """ def __init__(self): + # type: () -> None self.files = {} # filename -> attrs dict self.dirs = {} # subdir name -> DirNode object @@ -378,6 +381,7 @@ def WriteDirsHtml(node, out_dir, rel_path='', base_url=''): def ReadNetString(in_f): + # type: (IO[str]) -> str digits = [] for i in xrange(10): # up to 10 digits diff --git a/lazylex/html.py b/lazylex/html.py index 3710d2f7f..40db7feec 100755 --- a/lazylex/html.py +++ b/lazylex/html.py @@ -15,7 +15,8 @@ """ from __future__ import print_function -from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str +from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, h8_tag_id, + h8_tag_id_t, h8_tag_id_str) from typing import Dict, Iterator, Any, IO try: @@ -135,28 +136,6 @@ def Print(self, s): self.f.write(s) -# HTML Tokens -# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible -TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split( -) - - -class Tok(object): - """ - Avoid lint errors by using these aliases - """ - pass - - -TOKEN_NAMES = [None] * len(TOKENS) # type: List[str] - -this_module = sys.modules[__name__] -for i, tok_str in enumerate(TOKENS): - setattr(this_module, tok_str, i) - setattr(Tok, tok_str, i) - TOKEN_NAMES[i] = tok_str - - def MakeLexer(rules): return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules] @@ -424,7 +403,7 @@ def _Tokens(s, left_pos, right_pos): def ValidTokens(s, left_pos=0, right_pos=-1): - # type: (str, int, int) -> Iterator[Tuple[int, int]] + # type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]] """Wrapper around _Tokens to prevent callers from having to handle Invalid. I'm not combining the two functions because I might want to do a @@ -509,8 +488,6 @@ def ValidTokenList(s, no_special_tags=False): )? ''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE) -TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5) - class TagLexer(object): """ @@ -560,13 +537,14 @@ def GetSpanForAttrValue(self, attr_name): try: while True: tok_id, start, end = next(events) - if tok_id == AttrName: + if tok_id == h8_tag_id.AttrName: name = self.s[start:end] if name == attr_name: # The value should come next tok_id, start, end = next(events) - assert tok_id in (QuotedValue, UnquotedValue, - MissingValue), h8_id_str(tok_id) + assert tok_id in ( + h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue, + h8_tag_id.MissingValue), h8_tag_id_str(tok_id) val = start, end break @@ -594,13 +572,14 @@ def AllAttrsRawSlice(self): try: while True: tok_id, start, end = next(events) - if tok_id == AttrName: + if tok_id == h8_tag_id.AttrName: name = self.s[start:end] # The value should come next tok_id, start, end = next(events) - assert tok_id in (QuotedValue, UnquotedValue, - MissingValue), h8_id_str(tok_id) + assert tok_id in ( + h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue, + h8_tag_id.MissingValue), h8_tag_id_str(tok_id) # Note: quoted values may have & # We would need ANOTHER lexer to unescape them, but we # don't need that for ul-table @@ -624,7 +603,7 @@ def AllAttrsRaw(self): return pairs def Tokens(self): - # type: () -> Iterator[Tuple[int, int, int]] + # type: () -> Iterator[Tuple[h8_tag_id_t, int, int]] """ Yields a sequence of tokens: Tag (AttrName AttrValue?)* @@ -637,7 +616,7 @@ def Tokens(self): if not m: raise RuntimeError("Couldn't find HTML tag in %r" % self.TagString()) - yield TagName, m.start(1), m.end(1) + yield h8_tag_id.TagName, m.start(1), m.end(1) pos = m.end(0) #log('POS %d', pos) @@ -650,21 +629,21 @@ def Tokens(self): break #log('AttrName %r', m.group(1)) - yield AttrName, m.start(1), m.end(1) + yield h8_tag_id.AttrName, m.start(1), m.end(1) #log('m.groups() %r', m.groups()) if m.group(2) is not None: # double quoted - yield QuotedValue, m.start(2), m.end(2) + yield h8_tag_id.QuotedValue, m.start(2), m.end(2) elif m.group(3) is not None: # single quoted - TODO: could have different token types - yield QuotedValue, m.start(3), m.end(3) + yield h8_tag_id.QuotedValue, m.start(3), m.end(3) elif m.group(4) is not None: - yield UnquotedValue, m.start(4), m.end(4) + yield h8_tag_id.UnquotedValue, m.start(4), m.end(4) else: #