From 6f81c12869d247528d0198770cf9e8165fe9940b Mon Sep 17 00:00:00 2001
From: Andy C
Date: Tue, 14 Jan 2025 12:29:40 -0500
Subject: [PATCH] [doctools refactor] Use token IDs in the new htm8.asdl

Update benchmarks2 job to build _devbuild.gen.htm8_asdl
---
 devtools/refactor.sh |  15 +++++
 doctools/cmark.py    |   9 ++-
 doctools/help_gen.py |   5 +-
 doctools/oils_doc.py |  16 +++---
 doctools/ul_table.py |  66 +++++++++++-----------
 lazylex/html.py      | 132 +++++++++++++++++++++----------------------
 lazylex/html_test.py | 101 +++++++++++++++++----------------
 soil/worker.sh       |   3 +
 8 files changed, 187 insertions(+), 160 deletions(-)

diff --git a/devtools/refactor.sh b/devtools/refactor.sh
index 306ce40433..72133e9605 100755
--- a/devtools/refactor.sh
+++ b/devtools/refactor.sh
@@ -292,4 +292,19 @@ singleton-primitive() {
   echo
 }
 
+htm8() {
+  for prefix in Tok html; do
+    for name in \
+      Decl Comment CommentBegin Processing ProcessingBegin \
+      CData CDataBegin \
+      StartTag StartEndTag EndTag \
+      DecChar HexChar CharEntity \
+      RawData HtmlCData \
+      BadAmpersand BadGreaterThan BadLessThan \
+      Invalid EndOfStream; do
+      sed -i "s/$prefix.$name/h8_id.$name/g" */*.py
+    done
+  done
+}
+
 task-five "$@"
diff --git a/doctools/cmark.py b/doctools/cmark.py
index 864af41289..c6bc412680 100755
--- a/doctools/cmark.py
+++ b/doctools/cmark.py
@@ -1,6 +1,11 @@
 #!/usr/bin/env python2
-"""Convert markdown to HTML, then parse the HTML, generate and insert a TOC,
-and insert anchors.
+"""Convert Markdown to HTML, with our enhancements
+
+- Parse the HTML
+- insert a TOC
+- hack - this is obsolete with ul-table?
+- Expand $xref links
+- Highlight code blocks
 
 I started from cmark-0.28.3/wrappers/wrapper.py.
 """
diff --git a/doctools/help_gen.py b/doctools/help_gen.py
index 623945be3a..1fa069d6b3 100755
--- a/doctools/help_gen.py
+++ b/doctools/help_gen.py
@@ -35,6 +35,7 @@
 import re
 import sys
 
+from _devbuild.gen.htm8_asdl import h8_id
 from doctools import html_lib
 from doctools.util import log
 from lazylex import html
@@ -309,7 +310,7 @@ def ExtractBody(s):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
             if tag_lexer.TagName() == 'body':
                 body_start_right = end_pos  # right after <body>
@@ -364,7 +365,7 @@ def HelpTopics(s):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
             #log('%r', tag_lexer.TagString())
             #log('%r', tag_lexer.TagName())
diff --git a/doctools/oils_doc.py b/doctools/oils_doc.py
index e5de856c34..e90bc1f232 100755
--- a/doctools/oils_doc.py
+++ b/doctools/oils_doc.py
@@ -11,6 +11,8 @@
 """
 from __future__ import print_function
 
+from _devbuild.gen.htm8_asdl import h8_id
+
 import cgi
 from typing import Iterator
 from typing import Any
@@ -121,7 +123,7 @@ def ExpandLinks(s):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
 
             if tag_lexer.TagName() == 'a':
@@ -343,7 +345,7 @@ def SimpleHighlightCode(s):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
 
             if tag_lexer.TagName() == 'pre':
@@ -403,7 +405,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
 
             if tag_lexer.TagName() == 'pre':
@@ -416,7 +418,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
                     break
                 tag_lexer.Reset(pos, end_pos)
 
-                if tok_id == html.StartTag and tag_lexer.TagName() == 'code':
+                if tok_id == h8_id.StartTag and tag_lexer.TagName() == 'code':
                     css_class = tag_lexer.GetAttrRaw('class')
                     code_start_pos = end_pos
@@ -514,7 +516,7 @@ def HighlightCode(s, default_highlighter, debug_out=None):
                     except StopIteration:
                         break
                     tag_lexer.Reset(slash_code_right, end_pos)
-                    assert tok_id == html.EndTag, tok_id
+                    assert tok_id == h8_id.EndTag, tok_id
                     assert (tag_lexer.TagName() == 'pre'
                             ), tag_lexer.TagName()
                     slash_pre_right = end_pos
@@ -559,7 +561,7 @@ def ExtractCode(s, f):
         except StopIteration:
             break
 
-        if tok_id == html.StartTag:
+        if tok_id == h8_id.StartTag:
             tag_lexer.Reset(pos, end_pos)
             if tag_lexer.TagName() == 'pre':
                 pre_start_pos = pos
@@ -571,7 +573,7 @@ def ExtractCode(s, f):
                     break
                 tag_lexer.Reset(pos, end_pos)
 
-                if tok_id == html.StartTag and tag_lexer.TagName() == 'code':
+                if tok_id == h8_id.StartTag and tag_lexer.TagName() == 'code':
                     css_class = tag_lexer.GetAttrRaw('class')
 
                     # Skip code blocks that look like ```foo
diff --git a/doctools/ul_table.py b/doctools/ul_table.py
index 53a8332e76..6c76f70423 100755
--- a/doctools/ul_table.py
+++ b/doctools/ul_table.py
@@ -1,6 +1,8 @@
 #!/usr/bin/env python2
 """ul_table.py: Markdown Tables Without New Syntax."""
 
+from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
+
 try:
     from cStringIO import StringIO
 except ImportError:
@@ -32,7 +34,7 @@ def RemoveComments(s):
     pos = 0
 
     for tok_id, end_pos in html.ValidTokens(s):
-        if tok_id == html.Comment:
+        if tok_id == h8_id.Comment:
             value = s[pos:end_pos]
             # doc/release-index.md has <!-- REPLACE ... --> comments, etc.
             if 'REPLACE' not in value:
@@ -54,7 +56,7 @@ def __init__(self, lexer, tag_lexer):
         self.lexer = lexer
         self.tag_lexer = tag_lexer
 
-        self.tok_id = html.Invalid
+        self.tok_id = h8_id.Invalid
         self.start_pos = 0
         self.end_pos = 0
@@ -73,7 +75,7 @@ def _Next(self, comment_ok=False):
         # Should have called RemoveComments() beforehand.  That can still
         # leave some REPLACE comments
-        if not comment_ok and self.tok_id == html.Comment:
+        if not comment_ok and self.tok_id == h8_id.Comment:
             raise html.ParseError('Unexpected HTML comment')
 
         if 0:
@@ -85,9 +87,9 @@ def _EatRawData(self, regex):
         """
         Assert that we got text data matching a regex, and advance
         """
-        if self.tok_id != html.RawData:
+        if self.tok_id != h8_id.RawData:
             raise html.ParseError('Expected RawData, got %s' %
-                                  html.TokenName(self.tok_id))
+                                  h8_id_str(self.tok_id))
         actual = self._CurrentString()
         m = re.match(regex, actual)  # could compile this
         if m is None:
@@ -101,16 +103,16 @@ def _Eat(self, expected_id, expected_tag):
         Assert that we got a start or end tag, with the given name, and advance
 
         Args:
-          expected_id: html.StartTag or html.EndTag
+          expected_id: h8_id.StartTag or h8_id.EndTag
          expected_tag: 'a', 'span', etc.
""" - assert expected_id in (html.StartTag, - html.EndTag), html.TokenName(expected_id) + assert expected_id in (h8_id.StartTag, + h8_id.EndTag), h8_id_str(expected_id) if self.tok_id != expected_id: raise html.ParseError( 'Expected token %s, got %s' % - (html.TokenName(expected_id), html.TokenName(self.tok_id))) + (h8_id_str(expected_id), h8_id_str(self.tok_id))) self.tag_lexer.Reset(self.start_pos, self.end_pos) tag_name = self.tag_lexer.TagName() if expected_tag != tag_name: @@ -124,7 +126,7 @@ def _WhitespaceOk(self): """ Optional whitespace """ - if (self.tok_id == html.RawData and + if (self.tok_id == h8_id.RawData and _WHITESPACE_RE.match(self.lexer.s, self.start_pos)): self._Next() @@ -140,19 +142,19 @@ def FindUlTable(self): # Find first table while True: self._Next(comment_ok=True) - if self.tok_id == html.EndOfStream: + if self.tok_id == h8_id.EndOfStream: return -1 tag_lexer.Reset(self.start_pos, self.end_pos) - if (self.tok_id == html.StartTag and + if (self.tok_id == h8_id.StartTag and tag_lexer.TagName() == 'table'): while True: self._Next(comment_ok=True) - if self.tok_id != html.RawData: + if self.tok_id != h8_id.RawData: break tag_lexer.Reset(self.start_pos, self.end_pos) - if (self.tok_id == html.StartTag and + if (self.tok_id == h8_id.StartTag and tag_lexer.TagName() == 'ul'): return self.start_pos return -1 @@ -186,14 +188,14 @@ def _ListItem(self): """ self._WhitespaceOk() - if self.tok_id != html.StartTag: + if self.tok_id != h8_id.StartTag: return None, None inner_html = None td_attrs = None # Can we also have col-attrs? td_attrs_span = None - self._Eat(html.StartTag, 'li') + self._Eat(h8_id.StartTag, 'li') left = self.start_pos @@ -202,7 +204,7 @@ def _ListItem(self): # because cells can have bulleted lists balance = 0 while True: - if self.tok_id == html.StartEndTag: + if self.tok_id == h8_id.StartEndTag: self.tag_lexer.Reset(self.start_pos, self.end_pos) tag_name = self.tag_lexer.TagName() # TODO: remove td-attrs backward compat @@ -211,12 +213,12 @@ def _ListItem(self): td_attrs = self.tag_lexer.AllAttrsRaw() #log('CELL ATTRS %r', self._CurrentString()) - elif self.tok_id == html.StartTag: + elif self.tok_id == h8_id.StartTag: self.tag_lexer.Reset(self.start_pos, self.end_pos) if self.tag_lexer.TagName() == 'li': balance += 1 - elif self.tok_id == html.EndTag: + elif self.tok_id == h8_id.EndTag: self.tag_lexer.Reset(self.start_pos, self.end_pos) if self.tag_lexer.TagName() == 'li': balance -= 1 @@ -236,7 +238,7 @@ def _ListItem(self): inner_html = s[left:right] #log('RAW inner html %r', inner_html) - #self._Eat(html.EndTag, 'li') + #self._Eat(h8_id.EndTag, 'li') self._Next() return td_attrs, inner_html @@ -284,7 +286,7 @@ def _ParseTHead(self): cells = [] self._WhitespaceOk() - self._Eat(html.StartTag, 'li') + self._Eat(h8_id.StartTag, 'li') # In CommonMark, r'thead\n' is enough, because it strips trailing # whitespace. 
        # whitespace.  I'm not sure if other Markdown processors do that, so
@@ -292,7 +294,7 @@ def _ParseTHead(self):
         self._EatRawData(r'thead\s+')
 
         # This is the row data
-        self._Eat(html.StartTag, 'ul')
+        self._Eat(h8_id.StartTag, 'ul')
 
         while True:
             td_attrs, inner_html = self._ListItem()
@@ -301,10 +303,10 @@ def _ParseTHead(self):
             cells.append((td_attrs, inner_html))
         self._WhitespaceOk()
 
-        self._Eat(html.EndTag, 'ul')
+        self._Eat(h8_id.EndTag, 'ul')
         self._WhitespaceOk()
-        self._Eat(html.EndTag, 'li')
+        self._Eat(h8_id.EndTag, 'li')
 
         #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
         return cells
@@ -334,15 +336,15 @@ def _ParseTr(self):
         self._WhitespaceOk()
 
         # Could be a </ul>
-        if self.tok_id != html.StartTag:
+        if self.tok_id != h8_id.StartTag:
             return None, None
 
-        self._Eat(html.StartTag, 'li')
+        self._Eat(h8_id.StartTag, 'li')
 
         self._EatRawData(r'tr\s*')
 
         tr_attrs = None
-        if self.tok_id == html.StartEndTag:
+        if self.tok_id == h8_id.StartEndTag:
             self.tag_lexer.Reset(self.start_pos, self.end_pos)
             tag_name = self.tag_lexer.TagName()
             if tag_name != 'row-attrs':
@@ -352,7 +354,7 @@ def _ParseTr(self):
             self._WhitespaceOk()
 
         # This is the row data
-        self._Eat(html.StartTag, 'ul')
+        self._Eat(h8_id.StartTag, 'ul')
 
         while True:
             td_attrs, inner_html = self._ListItem()
@@ -363,10 +365,10 @@ def _ParseTr(self):
 
         self._WhitespaceOk()
 
-        self._Eat(html.EndTag, 'ul')
+        self._Eat(h8_id.EndTag, 'ul')
         self._WhitespaceOk()
-        self._Eat(html.EndTag, 'li')
+        self._Eat(h8_id.EndTag, 'li')
 
         #log('_ParseTHead %s ', html.TOKEN_NAMES[self.tok_id])
         return tr_attrs, cells
@@ -394,7 +396,7 @@ def ParseTable(self):
         table = {'tr': []}
 
         ul_start = self.start_pos
-        self._Eat(html.StartTag, 'ul')
+        self._Eat(h8_id.StartTag, 'ul')
 
         # Look ahead 2 or 3 tokens:
         if self.lexer.LookAhead(r'\s*<li>thead\s+'):
@@ -416,7 +418,7 @@ def ParseTable(self):
             #log('___ TR %s', tr)
             table['tr'].append((tr_attrs, tr))
 
-        self._Eat(html.EndTag, 'ul')
+        self._Eat(h8_id.EndTag, 'ul')
 
         self._WhitespaceOk()
 
diff --git a/lazylex/html.py b/lazylex/html.py
index 5704a22053..cb93c8201d 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -14,6 +14,8 @@
 """
 from __future__ import print_function
+
+from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
 from typing import Iterator
 from typing import Union
 from typing import Any
@@ -41,7 +43,7 @@ class LexError(Exception):
     """
     Examples of lex errors:
 
-    - Tok.Invalid, like <> or &&
+    - h8_id.Invalid, like <> or &&
     - Unclosed <!--
     """

-#(r'', Tok.Comment),
+#(r'', h8_id.Comment),
 # Hack from Claude: \s\S instead of re.DOTALL.  I don't like this
-#(r'', Tok.Comment),
-#(r'', Tok.Comment),
+#(r'', h8_id.Comment),
+#(r'', h8_id.Comment),
 
 HTM8_LEX_COMPILED = MakeLexer(HTM8_LEX)
@@ -306,7 +303,7 @@ def _Peek(self):
         Note: not using _Peek() now
         """
         if self.pos == self.right_pos:
-            return Tok.EndOfStream, self.pos
+            return h8_id.EndOfStream, self.pos
 
         assert self.pos < self.right_pos, self.pos
 
@@ -322,7 +319,7 @@ def _Peek(self):
                 raise LexError(self.s, self.pos)
 
             self.search_state = None
             # beginning
-            return Tok.HtmlCData, pos
+            return h8_id.HtmlCData, pos
 
         # Find the first match.
         # Note: frontend/match.py uses _LongestMatch(), which is different!
@@ -331,7 +328,7 @@ def _Peek(self):
         for pat, tok_id in HTM8_LEX_COMPILED:
             m = pat.match(self.s, self.pos)
             if m:
-                if tok_id in (Tok.StartTag, Tok.EndTag, Tok.StartEndTag):
+                if tok_id in (h8_id.StartTag, h8_id.EndTag, h8_id.StartEndTag):
                     self.tag_pos_left = m.start(1)
                     self.tag_pos_right = m.end(1)
                 else:
@@ -339,28 +336,28 @@ def _Peek(self):
                     self.tag_pos_left = -1
                     self.tag_pos_right = -1
 
-                if tok_id == Tok.CommentBegin:
+                if tok_id == h8_id.CommentBegin:
                     pos = self.s.find('-->', self.pos)
                     if pos == -1:
                         # unterminated <!--
                         raise LexError(self.s, self.pos)
-                    return Tok.Comment, pos + 3  # -->
+                    return h8_id.Comment, pos + 3  # -->
 
-                if tok_id == Tok.ProcessingBegin:
+                if tok_id == h8_id.ProcessingBegin:
                     pos = self.s.find('?>', self.pos)
                     if pos == -1:
                         # unterminated <?
                         raise LexError(self.s, self.pos)
-                    return Tok.Processing, pos + 2  # ?>
+                    return h8_id.Processing, pos + 2  # ?>
 
-                if tok_id == Tok.CDataBegin:
+                if tok_id == h8_id.CDataBegin:
                     pos = self.s.find(']]>', self.pos)
                     if pos == -1:
                         # unterminated <![CDATA[
                         raise LexError(self.s, self.pos)
-                    return Tok.CData, pos + 3  # ]]>
+                    return h8_id.CData, pos + 3  # ]]>
 
-                if tok_id == Tok.StartTag:
+                if tok_id == h8_id.StartTag:
                     # TODO: reduce allocations
                     if (self.TagNameEquals('script') or
                             self.TagNameEquals('style')):
@@ -369,7 +366,7 @@ def _Peek(self):
 
                 return tok_id, m.end()
         else:
-            raise AssertionError('Tok.Invalid rule should have matched')
+            raise AssertionError('h8_id.Invalid rule should have matched')
 
     def TagNameEquals(self, expected):
         # type: (str) -> bool
@@ -427,7 +424,7 @@ def _Tokens(s, left_pos, right_pos):
     while True:
         tok_id, pos = lx.Read()
         yield tok_id, pos
-        if tok_id == Tok.EndOfStream:
+        if tok_id == h8_id.EndOfStream:
             break
 
@@ -441,7 +438,7 @@ def ValidTokens(s, left_pos=0, right_pos=-1):
     """
     pos = left_pos
     for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
-        if tok_id == Tok.Invalid:
+        if tok_id == h8_id.Invalid:
             raise LexError(s, pos)
         yield tok_id, end_pos
         pos = end_pos
@@ -457,9 +454,9 @@ def ValidTokenList(s, no_special_tags=False):
     while True:
         tok_id, end_pos = lx.Read()
         tokens.append((tok_id, end_pos))
-        if tok_id == Tok.EndOfStream:
+        if tok_id == h8_id.EndOfStream:
             break
-        if tok_id == Tok.Invalid:
+        if tok_id == h8_id.Invalid:
             raise LexError(s, start_pos)
         start_pos = end_pos
     return tokens
@@ -572,7 +569,7 @@ def GetSpanForAttrValue(self, attr_name):
                 # The value should come next
                tok_id, start, end = next(events)
                 assert tok_id in (QuotedValue, UnquotedValue,
-                                  MissingValue), TokenName(tok_id)
+                                  MissingValue), h8_id_str(tok_id)
                 val = start, end
                 break
 
@@ -606,7 +603,7 @@ def AllAttrsRawSlice(self):
                 # The value should come next
                 tok_id, start, end = next(events)
                 assert tok_id in (QuotedValue, UnquotedValue,
-                                  MissingValue), TokenName(tok_id)
+                                  MissingValue), h8_id_str(tok_id)
                 # Note: quoted values may have &amp;
                 # We would need ANOTHER lexer to unescape them, but we
                 # don't need that for ul-table
@@ -691,8 +688,8 @@ def Tokens(self):
 # Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
 # &#x2122; are not allowed.  We could relax that?
 ATTR_VALUE_LEXER = CHAR_LEX + [
-    (r'[^>&\x00]+', Tok.RawData),
-    (r'.', Tok.Invalid),
+    (r'[^>&\x00]+', h8_id.RawData),
+    (r'.', h8_id.Invalid),
 ]
 
 ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)
@@ -725,7 +722,7 @@ def NumTokens(self):
         num_tokens = 0
         pos = self.start_pos
         for tok_id, end_pos in self.Tokens():
-            if tok_id == Tok.Invalid:
+            if tok_id == h8_id.Invalid:
                 raise LexError(self.s, pos)
             pos = end_pos
             #log('pos %d', pos)
@@ -751,7 +748,7 @@ def Tokens(self):
                 pos = end_pos
                 break
         else:
-            raise AssertionError('Tok.Invalid rule should have matched')
+            raise AssertionError('h8_id.Invalid rule should have matched')
 
 
 def ReadUntilStartTag(it, tag_lexer, tag_name):
@@ -768,7 +765,7 @@ def ReadUntilStartTag(it, tag_lexer, tag_name):
         except StopIteration:
             break
         tag_lexer.Reset(pos, end_pos)
-        if tok_id == Tok.StartTag and tag_lexer.TagName() == tag_name:
+        if tok_id == h8_id.StartTag and tag_lexer.TagName() == tag_name:
             return pos, end_pos
 
         pos = end_pos
@@ -791,7 +788,7 @@ def ReadUntilEndTag(it, tag_lexer, tag_name):
         except StopIteration:
             break
         tag_lexer.Reset(pos, end_pos)
-        if tok_id == Tok.EndTag and tag_lexer.TagName() == tag_name:
+        if tok_id == h8_id.EndTag and tag_lexer.TagName() == tag_name:
             return pos, end_pos
 
         pos = end_pos
@@ -828,12 +825,12 @@ def ToText(s, left_pos=0, right_pos=-1):
 
     pos = left_pos
     for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
-        if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan,
-                      Tok.BadLessThan):
+        if tok_id in (h8_id.RawData, h8_id.BadAmpersand, h8_id.BadGreaterThan,
+                      h8_id.BadLessThan):
             out.SkipTo(pos)
             out.PrintUntil(end_pos)
 
-        elif tok_id == Tok.CharEntity:  # &amp;
+        elif tok_id == h8_id.CharEntity:  # &amp;
 
             entity = s[pos + 1:end_pos - 1]
 
@@ -842,10 +839,10 @@ def ToText(s, left_pos=0, right_pos=-1):
             out.SkipTo(end_pos)
 
         # Not handling these yet
-        elif tok_id == Tok.HexChar:
+        elif tok_id == h8_id.HexChar:
             raise AssertionError('Hex Char %r' % s[pos:pos + 20])
 
-        elif tok_id == Tok.DecChar:
+        elif tok_id == h8_id.DecChar:
             raise AssertionError('Dec Char %r' % s[pos:pos + 20])
 
         else:
@@ -895,16 +892,16 @@ def Validate(contents, flags, counters):
     tag_stack = []
     while True:
         tok_id, end_pos = lx.Read()
-        #log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])
+        #log('TOP %s %r', h8_id_str(tok_id), contents[start_pos:end_pos])
 
-        if tok_id == Tok.Invalid:
+        if tok_id == h8_id.Invalid:
             raise LexError(contents, start_pos)
-        if tok_id == Tok.EndOfStream:
+        if tok_id == h8_id.EndOfStream:
             break
 
         tokens.append((tok_id, end_pos))
 
-        if tok_id == Tok.StartEndTag:
+        if tok_id == h8_id.StartEndTag:
             counters.num_start_end_tags += 1
 
             tag_lexer.Reset(start_pos, end_pos)
@@ -916,7 +913,7 @@ def Validate(contents, flags, counters):
 
             counters.debug_attrs.extend(all_attrs)
 
-        elif tok_id == Tok.StartTag:
+        elif tok_id == h8_id.StartTag:
             counters.num_start_tags += 1
 
             tag_lexer.Reset(start_pos, end_pos)
@@ -939,7 +936,7 @@ def Validate(contents, flags, counters):
             counters.max_tag_stack = max(counters.max_tag_stack,
                                          len(tag_stack))
-        elif tok_id == Tok.EndTag:
+        elif tok_id == h8_id.EndTag:
             if flags & BALANCED_TAGS:
                 try:
                     expected = tag_stack.pop()
@@ -991,14 +988,15 @@ def ToXml(htm8_str):
     while True:
         tok_id, end_pos = lx.Read()
-        if tok_id == Tok.Invalid:
+        if tok_id == h8_id.Invalid:
             raise LexError(htm8_str, pos)
-        if tok_id == Tok.EndOfStream:
+        if tok_id == h8_id.EndOfStream:
             break
 
-        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
+        if tok_id in (h8_id.RawData, h8_id.CharEntity, h8_id.HexChar,
+                      h8_id.DecChar):
             out.PrintUntil(end_pos)
-        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
+        elif tok_id in (h8_id.StartTag, h8_id.StartEndTag):
             tag_lexer.Reset(pos, end_pos)
             # TODO: reduce allocations here
             all_attrs = tag_lexer.AllAttrsRawSlice()
@@ -1014,16 +1012,16 @@ def ToXml(htm8_str):
             # Missing: add ="", so missing becomes missing=""
             tag_name = lx.CanonicalTagName()
-            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
+            if tok_id == h8_id.StartTag and tag_name in VOID_ELEMENTS:
                 # TODO: instead of closing >, print />
                 pass
 
-        elif tok_id == Tok.BadAmpersand:
+        elif tok_id == h8_id.BadAmpersand:
             #out.SkipTo(pos)
             out.Print('&amp;')
             out.SkipTo(end_pos)
 
-        elif tok_id == Tok.BadGreaterThan:
+        elif tok_id == h8_id.BadGreaterThan:
             #out.SkipTo(pos)
             out.Print('&gt;')
             out.SkipTo(end_pos)
@@ -1060,13 +1058,13 @@ def main(argv):
         start_pos = 0
         while True:
             tok_id, end_pos = lx.Read()
-            if tok_id == Tok.Invalid:
+            if tok_id == h8_id.Invalid:
                 raise LexError(contents, start_pos)
-            if tok_id == Tok.EndOfStream:
+            if tok_id == h8_id.EndOfStream:
                 break
 
             frag = contents[start_pos:end_pos]
-            log('%d %s %r', end_pos, TokenName(tok_id), frag)
+            log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
             start_pos = end_pos
 
     return 0
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index ebe0677b17..f91cad7bf1 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -3,9 +3,10 @@
 
 import unittest
 
+from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
 from lazylex import html  # module under test
 log = html.log
-from typing import List
-from typing import Tuple
+
+from typing import List, Tuple
 
 log = html.log
@@ -184,7 +185,7 @@ def Lex(h, no_special_tags=False):
     start_pos = 0
     for tok_id, end_pos in tokens:
         frag = h[start_pos:end_pos]
-        log('%d %s %r', end_pos, html.TokenName(tok_id), frag)
+        log('%d %s %r', end_pos, h8_id_str(tok_id), frag)
         start_pos = end_pos
     return tokens
 
@@ -219,10 +220,10 @@ def testCommentParse2(self):
 
         self.assertEqual(
             [
-                (Tok.RawData, 12),
-                (Tok.Comment, 50),  #
-                (Tok.StartEndTag, 55),
-                (Tok.EndOfStream, 55),
+                (h8_id.RawData, 12),
+                (h8_id.Comment, 50),  #
+                (h8_id.StartEndTag, 55),
+                (h8_id.EndOfStream, 55),
             ],
             tokens)
 
@@ -235,9 +236,9 @@ def testProcessingInstruction(self):
 
         self.assertEqual(
             [
-                (Tok.RawData, 3),
-                (Tok.Processing, 12),  #
-                (Tok.EndOfStream, 12),
+                (h8_id.RawData, 3),
+                (h8_id.Processing, 12),  #
+                (h8_id.EndOfStream, 12),
             ],
             tokens)
 
@@ -251,12 +252,12 @@ def testScriptStyle(self):
         tokens = Lex(h)
 
         expected = [
-            (Tok.RawData, 12),
-            (Tok.StartTag, 27),  #
-            (Tok.RawData, 96),  # \n
-            (Tok.EndOfStream, 96),  # \n
+            (h8_id.RawData, 12),
+            (h8_id.StartTag, 27),  #
+            (h8_id.RawData, 96),  # \n
+            (h8_id.EndOfStream, 96),  # \n
         ]
         self.assertEqual(expected, tokens)
 
@@ -273,13 +274,13 @@ def testScriptStyleXml(self):
 
         self.assertEqual(
             [
-                (Tok.RawData, 3),
-                (Tok.StartTag, 18),  #
-                (Tok.RawData, 24),  # \n
-                (Tok.EndTag, 33),  # \n
-                (Tok.EndOfStream, 33),  # \n
+                (h8_id.RawData, 3),
+                (h8_id.StartTag, 18),  #
+                (h8_id.RawData, 24),  # \n
+                (h8_id.EndTag, 33),  # \n
+                (h8_id.EndOfStream, 33),  # \n
             ],
             tokens)
 
@@ -293,10 +294,10 @@ def testCData(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.StartTag, 9),
-            (Tok.CData, 61),
-            (Tok.EndTag, 71),
-            (Tok.EndOfStream, 71),
+            (h8_id.StartTag, 9),
+            (h8_id.CData, 61),
+            (h8_id.EndTag, 71),
+            (h8_id.EndOfStream, 71),
         ], tokens)
 
     def testEntity(self):
@@ -310,11 +311,11 @@ def testEntity(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.CharEntity, 6),
-            (Tok.RawData, 8),
-            (Tok.CharEntity, 14),
-            (Tok.RawData, 15),
-            (Tok.EndOfStream, 15),
+            (h8_id.CharEntity, 6),
+            (h8_id.RawData, 8),
+            (h8_id.CharEntity, 14),
+            (h8_id.RawData, 15),
+            (h8_id.EndOfStream, 15),
         ], tokens)
 
     def testStartTag(self):
@@ -325,10 +326,10 @@ def testStartTag(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.StartTag, 3),
-            (Tok.RawData, 5),
-            (Tok.EndTag, 9),
-            (Tok.EndOfStream, 9),
+            (h8_id.StartTag, 3),
+            (h8_id.RawData, 5),
+            (h8_id.EndTag, 9),
+            (h8_id.EndOfStream, 9),
         ], tokens)
 
         # Make sure we don't consume too much
@@ -337,12 +338,12 @@ def testStartTag(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.StartTag, 3),
-            (Tok.StartTag, 11),
-            (Tok.RawData, 14),
-            (Tok.EndTag, 23),
-            (Tok.EndTag, 27),
-            (Tok.EndOfStream, 27),
+            (h8_id.StartTag, 3),
+            (h8_id.StartTag, 11),
+            (h8_id.RawData, 14),
+            (h8_id.EndTag, 23),
+            (h8_id.EndTag, 27),
+            (h8_id.EndOfStream, 27),
        ], tokens)
 
         return
@@ -355,10 +356,10 @@ def testStartTag(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.RawData, 9),
-            (Tok.StartTag, 24),
-            (Tok.RawData, 9),
-            (Tok.EndOfStream, 9),
+            (h8_id.RawData, 9),
+            (h8_id.StartTag, 24),
+            (h8_id.RawData, 9),
+            (h8_id.EndOfStream, 9),
         ], tokens)
 
     def testBad(self):
@@ -369,16 +370,16 @@ def testBad(self):
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.BadAmpersand, 1),
-            (Tok.EndOfStream, 1),
+            (h8_id.BadAmpersand, 1),
+            (h8_id.EndOfStream, 1),
         ], tokens)
 
         h = '>'
         tokens = Lex(h)
 
         self.assertEqual([
-            (Tok.BadGreaterThan, 1),
-            (Tok.EndOfStream, 1),
+            (h8_id.BadGreaterThan, 1),
+            (h8_id.EndOfStream, 1),
         ], tokens)
 
     def testInvalid(self):
diff --git a/soil/worker.sh b/soil/worker.sh
index 802e24fea5..75960d2f70 100755
--- a/soil/worker.sh
+++ b/soil/worker.sh
@@ -232,9 +232,12 @@
 os-info           soil/diagnose.sh             os-info          -
 dump-env          soil/diagnose.sh             dump-env         -
 wait-for-tarball  soil/wait.sh                 for-cpp-tarball  -
 test-tar          devtools/release-native.sh   test-tar         -
+build-minimal     build/py.sh                  minimal          -
 uftrace           benchmarks/uftrace.sh        soil-run         _tmp/uftrace/index.html
 gc-cachegrind     benchmarks/gc-cachegrind.sh  soil-run         _tmp/gc-cachegrind/index.html
 EOF
+  # 2025-01: added build-minimal because benchmarks use cmark.py, which uses
+  # htm8_asdl to ExpandLinks() and so forth
 }
 
 cpp-spec-tasks() {
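
After this rename, doctools callers compare token IDs against the generated
h8_id constants and render their names with h8_id_str(), instead of the old
html.TokenName() / Tok.* pair. A minimal sketch of the idiom, modeled on
RemoveComments() in doctools/ul_table.py; it assumes only what the diff shows
(html.ValidTokens() yields (tok_id, end_pos) pairs, and _devbuild.gen.htm8_asdl
has been built, e.g. by the new build-minimal soil task):

    from _devbuild.gen.htm8_asdl import h8_id, h8_id_str
    from lazylex import html

    def DumpComments(s):
        # type: (str) -> None
        """Print the span and text of each HTML comment in an HTM8 string."""
        pos = 0
        for tok_id, end_pos in html.ValidTokens(s):
            if tok_id == h8_id.Comment:
                # h8_id_str() renders the integer token ID as a readable name
                print('%s %d-%d %r' % (h8_id_str(tok_id), pos, end_pos,
                                       s[pos:end_pos]))
            pos = end_pos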
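
The full set of h8_id variants can be read off the rename loop in
devtools/refactor.sh above. A quick self-check along those lines, assuming
only that the generated module exposes one constant per name (the integer
values themselves are a detail of the ASDL code generator):

    from _devbuild.gen.htm8_asdl import h8_id

    # Names copied from the devtools/refactor.sh rename loop in this patch
    _NAMES = [
        'Decl', 'Comment', 'CommentBegin', 'Processing', 'ProcessingBegin',
        'CData', 'CDataBegin',
        'StartTag', 'StartEndTag', 'EndTag',
        'DecChar', 'HexChar', 'CharEntity',
        'RawData', 'HtmlCData',
        'BadAmpersand', 'BadGreaterThan', 'BadLessThan',
        'Invalid', 'EndOfStream',
    ]

    for name in _NAMES:
        assert hasattr(h8_id, name), name  # every renamed constant must exist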