[lazylex] Using faster mystr.find() instead of regex
- No re.DOTALL
- No non-greedy match
Andy C committed Jan 7, 2025
1 parent 445b0d9 commit 39ea432
Showing 3 changed files with 45 additions and 12 deletions.
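The gist of the change: instead of matching an entire comment or processing instruction with a single non-greedy regex (which needs a DOTALL-style workaround so that . can cross newlines), the lexer now matches only the fixed opener and locates the closer with str.find(). A minimal sketch of the two approaches, for illustration only, not the committed code:

import re

# Old style: one non-greedy regex over the whole comment.  '.' does not
# match newline, hence the (?:.|[\n]) workaround (or re.DOTALL).
COMMENT_RE = re.compile(r'<!--(?:.|[\n])*?-->')

def comment_end_regex(s, pos):
    m = COMMENT_RE.match(s, pos)
    return m.end() if m else -1

# New style: match only the opener, then scan for '-->' with str.find(),
# a plain linear substring search.
def comment_end_find(s, pos):
    if not s.startswith('<!--', pos):
        return -1
    end = s.find('-->', pos)
    if end == -1:
        return -1  # unterminated; the committed code raises LexError here
    return end + 3  # token includes the closing '-->'

s = 'a <!-- multi\nline --> b'
assert comment_end_regex(s, 2) == comment_end_find(s, 2) == 21

Both return the end position of the token; the difference is that the find() version never runs the regex engine over the comment body.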
data_lang/htm8-test.sh (3 changes: 2 additions & 1 deletion)
@@ -33,6 +33,7 @@ ht8-tool() {

test-well-formed() {
cat >_tmp/bad.html <<EOF
unfinished <!--
hi && bye
EOF
echo '_tmp/bad.html' | ht8-tool well-formed
@@ -94,7 +95,7 @@ tree-wwz() {
tree $WWZ_DIR
}

check-wwz() {
test-wwz() {
pushd $WWZ_DIR

find . -name '*.html' | $REPO_ROOT/$0 ht8-tool well-formed
lazylex/html.py (34 changes: 23 additions & 11 deletions)
@@ -80,7 +80,8 @@ def Print(self, s):


# HTML Tokens
TOKENS = 'Decl Comment Processing StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData Invalid EndOfStream'.split(
# CommentBegin and ProcessingBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData Invalid EndOfStream'.split(
)


@@ -91,8 +92,6 @@ class Tok(object):
pass


assert len(TOKENS) == 12, TOKENS

TOKEN_NAMES = [None] * len(TOKENS) # type: List[str]

this_module = sys.modules[__name__]
@@ -140,10 +139,7 @@ def MakeLexer(rules):
# EntityRef = / '&' dot{* N} ';' /

LEXER = [
# TODO: instead of nongreedy matches, the loop can just do .find('-->') and
# .find('?>')

# Actually non-greedy matches are regular and can be matched in linear time
# Note non-greedy matches are regular and can be matched in linear time
# with RE2.
#
# https://news.ycombinator.com/item?id=27099798
@@ -153,11 +149,13 @@ def MakeLexer(rules):
# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Discarded options
#(r'<!-- .*? -->', Tok.Comment),

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

#(r'<!-- .*? -->', Tok.Comment),
#(r'<!-- (?:.|[\n])*? -->', Tok.Comment),
(r'<!--', Tok.CommentBegin),

# Processing instruction are XML only, but they are treated like a comment
# in HTML:
@@ -166,7 +164,7 @@ def MakeLexer(rules):
#
# We don't want to confuse them with start tags, so we recognize them at
# the top level.
(r'<\? (?:.|\n)*? \?>', Tok.Processing),
(r'<\?', Tok.ProcessingBegin),

# NOTE: < is allowed in these.
(r'<! [^>]+ >', Tok.Decl), # <!DOCTYPE html>
@@ -213,6 +211,20 @@ def _Peek(self):
for pat, tok_id in LEXER:
m = pat.match(self.s, self.pos)
if m:
if tok_id == Tok.CommentBegin:
pos = self.s.find('-->', self.pos)
if pos == -1:
# unterminated <!--
raise LexError(self.s, self.pos)
return Tok.Comment, pos + 3 # -->

if tok_id == Tok.ProcessingBegin:
pos = self.s.find('?>', self.pos)
if pos == -1:
# unterminated <?
raise LexError(self.s, self.pos)
return Tok.Processing, pos + 2 # ?>

return tok_id, m.end()
else:
raise AssertionError('Tok.Invalid rule should have matched')
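The claim in the commit title is easy to sanity-check with a throwaway micro-benchmark. A sketch with a synthetic document (timings vary by machine, and these are not numbers from the commit):

import re
import timeit

doc = '<!-- ' + 'x' * 100000 + ' -->'
pat = re.compile(r'<!--(?:.|[\n])*?-->')

def with_regex():
    return pat.match(doc, 0).end()

def with_find():
    return doc.find('-->', 0) + 3

assert with_regex() == with_find()
print('regex:', timeit.timeit(with_regex, number=200))
print('find :', timeit.timeit(with_find, number=200))

The non-greedy loop advances one character at a time through the alternation (?:.|[\n]), while str.find() delegates the whole scan to a tuned substring search.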
lazylex/html_test.py (20 changes: 20 additions & 0 deletions)
@@ -218,6 +218,26 @@ def testInvalid(self):
else:
self.fail('Expected LexError')

# Comment
lex = html.ValidTokens('<!-- unfinished comment')

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')

# Processing
lex = html.ValidTokens('<? unfinished processing')

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')


if __name__ == '__main__':
unittest.main()
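
The new error paths can also be exercised directly. This assumes only the lazylex.html API visible in the diff above: ValidTokens takes a string and yields (tok_id, end_pos) pairs, raising LexError on malformed input.

from lazylex import html

# A well-formed document lexes to completion.
for tok_id, end_pos in html.ValidTokens('<p>hi</p> <!-- ok -->'):
    print(tok_id, end_pos)

# An unterminated comment is now rejected in the Tok.CommentBegin branch,
# when s.find('-->') returns -1.
try:
    list(html.ValidTokens('unfinished <!--'))
except html.LexError as e:
    print('rejected:', e)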
