Skip to content

Commit

Permalink
[lazylex refactor] Move another enum to htm8.asdl
Browse files (browse the repository at this point in the history)
Also fix more type errors in doctools/
  • Loading branch information
Andy C committed Jan 14, 2025
1 parent db6f187 commit 31ae20f
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 52 deletions.
8 changes: 8 additions & 0 deletions data_lang/htm8.asdl
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,12 @@ module htm8
| Invalid
| EndOfStream
generate [no_namespace_suffix] # cosmetic: call it h8_id, not h8_id_e


h8_tag_id =
TagName
| AttrName
| UnquotedValue | QuotedValue | MissingValue
generate [no_namespace_suffix]
}

1 change: 1 addition & 0 deletions doctools/oils_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
class _Abbrev(object):

def __init__(self, fmt):
# type: (str) -> None
self.fmt = fmt

def __call__(self, value):
Expand Down
3 changes: 2 additions & 1 deletion doctools/split_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import optparse
import re
import sys
from typing import Dict, IO
from typing import List, Dict, IO

DATE_RE = re.compile(r'(\d\d\d\d) / (\d\d) / (\d\d)', re.VERBOSE)

Expand Down Expand Up @@ -126,6 +126,7 @@ def Options():


def main(argv):
# type: (List[str]) -> None
o = Options()
opts, argv = o.parse_args(argv)

Expand Down
4 changes: 4 additions & 0 deletions doctools/src_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
import shutil
import sys

from vendor.typing import IO

from doctools.util import log
from doctools import html_head
from test import wild_report
Expand Down Expand Up @@ -259,6 +261,7 @@ class DirNode:
"""

def __init__(self):
# type: () -> None
self.files = {} # filename -> attrs dict
self.dirs = {} # subdir name -> DirNode object

Expand Down Expand Up @@ -378,6 +381,7 @@ def WriteDirsHtml(node, out_dir, rel_path='', base_url=''):


def ReadNetString(in_f):
# type: (IO[str]) -> str

digits = []
for i in xrange(10): # up to 10 digits
Expand Down
57 changes: 18 additions & 39 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
"""
from __future__ import print_function

from _devbuild.gen.htm8_asdl import h8_id, h8_id_t, h8_id_str
from _devbuild.gen.htm8_asdl import (h8_id, h8_id_t, h8_id_str, h8_tag_id,
h8_tag_id_t, h8_tag_id_str)
from typing import Dict, Iterator, Any, IO

try:
Expand Down Expand Up @@ -135,28 +136,6 @@ def Print(self, s):
self.f.write(s)


# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split(
)


class Tok(object):
"""
Avoid lint errors by using these aliases
"""
pass


TOKEN_NAMES = [None] * len(TOKENS) # type: List[str]

this_module = sys.modules[__name__]
for i, tok_str in enumerate(TOKENS):
setattr(this_module, tok_str, i)
setattr(Tok, tok_str, i)
TOKEN_NAMES[i] = tok_str


def MakeLexer(rules):
return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]

Expand Down Expand Up @@ -424,7 +403,7 @@ def _Tokens(s, left_pos, right_pos):


def ValidTokens(s, left_pos=0, right_pos=-1):
# type: (str, int, int) -> Iterator[Tuple[int, int]]
# type: (str, int, int) -> Iterator[Tuple[h8_id_t, int]]
"""Wrapper around _Tokens to prevent callers from having to handle Invalid.
I'm not combining the two functions because I might want to do a
Expand Down Expand Up @@ -509,8 +488,6 @@ def ValidTokenList(s, no_special_tags=False):
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
"""
Expand Down Expand Up @@ -560,13 +537,14 @@ def GetSpanForAttrValue(self, attr_name):
try:
while True:
tok_id, start, end = next(events)
if tok_id == AttrName:
if tok_id == h8_tag_id.AttrName:
name = self.s[start:end]
if name == attr_name:
# The value should come next
tok_id, start, end = next(events)
assert tok_id in (QuotedValue, UnquotedValue,
MissingValue), h8_id_str(tok_id)
assert tok_id in (
h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
val = start, end
break

Expand Down Expand Up @@ -594,13 +572,14 @@ def AllAttrsRawSlice(self):
try:
while True:
tok_id, start, end = next(events)
if tok_id == AttrName:
if tok_id == h8_tag_id.AttrName:
name = self.s[start:end]

# The value should come next
tok_id, start, end = next(events)
assert tok_id in (QuotedValue, UnquotedValue,
MissingValue), h8_id_str(tok_id)
assert tok_id in (
h8_tag_id.QuotedValue, h8_tag_id.UnquotedValue,
h8_tag_id.MissingValue), h8_tag_id_str(tok_id)
# Note: quoted values may have &
# We would need ANOTHER lexer to unescape them, but we
# don't need that for ul-table
Expand All @@ -624,7 +603,7 @@ def AllAttrsRaw(self):
return pairs

def Tokens(self):
# type: () -> Iterator[Tuple[int, int, int]]
# type: () -> Iterator[Tuple[h8_tag_id_t, int, int]]
"""
Yields a sequence of tokens: Tag (AttrName AttrValue?)*
Expand All @@ -637,7 +616,7 @@ def Tokens(self):
if not m:
raise RuntimeError("Couldn't find HTML tag in %r" %
self.TagString())
yield TagName, m.start(1), m.end(1)
yield h8_tag_id.TagName, m.start(1), m.end(1)

pos = m.end(0)
#log('POS %d', pos)
Expand All @@ -650,21 +629,21 @@ def Tokens(self):
break
#log('AttrName %r', m.group(1))

yield AttrName, m.start(1), m.end(1)
yield h8_tag_id.AttrName, m.start(1), m.end(1)

#log('m.groups() %r', m.groups())
if m.group(2) is not None:
# double quoted
yield QuotedValue, m.start(2), m.end(2)
yield h8_tag_id.QuotedValue, m.start(2), m.end(2)
elif m.group(3) is not None:
# single quoted - TODO: could have different token types
yield QuotedValue, m.start(3), m.end(3)
yield h8_tag_id.QuotedValue, m.start(3), m.end(3)
elif m.group(4) is not None:
yield UnquotedValue, m.start(4), m.end(4)
yield h8_tag_id.UnquotedValue, m.start(4), m.end(4)
else:
# <button disabled>
end = m.end(0)
yield MissingValue, end, end
yield h8_tag_id.MissingValue, end, end

# Skip past the "
pos = m.end(0)
Expand Down
12 changes: 0 additions & 12 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,6 @@ def testCommentParse(self):

def testCommentParse2(self):
# type: () -> None

Tok = html.Tok
h = '''
hi <!-- line 1
line 2 --><br/>'''
Expand All @@ -230,7 +228,6 @@ def testCommentParse2(self):
def testProcessingInstruction(self):
# type: () -> None
# <?xml ?> header
Tok = html.Tok
h = 'hi <? err ?>'
tokens = Lex(h)

Expand All @@ -244,7 +241,6 @@ def testProcessingInstruction(self):

def testScriptStyle(self):
# type: () -> None
Tok = html.Tok
h = '''
hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
</script>
Expand All @@ -267,7 +263,6 @@ def testScriptStyle(self):

def testScriptStyleXml(self):
# type: () -> None
Tok = html.Tok
h = 'hi <script src=""> &lt; </script>'
# XML mode
tokens = Lex(h, no_special_tags=True)
Expand All @@ -286,7 +281,6 @@ def testScriptStyleXml(self):

def testCData(self):
# type: () -> None
Tok = html.Tok

# from
# /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
Expand All @@ -302,7 +296,6 @@ def testCData(self):

def testEntity(self):
# type: () -> None
Tok = html.Tok

# from
# /home/andy/src/Python-3.12.4/Lib/test/xmltestdata/c14n-20/inC14N5.xml
Expand All @@ -320,7 +313,6 @@ def testEntity(self):

def testStartTag(self):
# type: () -> None
Tok = html.Tok

h = '<a>hi</a>'
tokens = Lex(h)
Expand Down Expand Up @@ -364,8 +356,6 @@ def testStartTag(self):

def testBad(self):
# type: () -> None
Tok = html.Tok

h = '&'
tokens = Lex(h)

Expand All @@ -384,8 +374,6 @@ def testBad(self):

def testInvalid(self):
# type: () -> None
Tok = html.Tok

for s in INVALID_LEX:
try:
tokens = html.ValidTokenList(s)
Expand Down

0 comments on commit 31ae20f

Please sign in to comment.