Skip to content

Commit

Permalink
[lazylex] Working on HTM8 lexer
Browse files Browse the repository at this point in the history
Found two workarounds to get rid of re.DOTALL:

    (?:.|\n)
    [\s\S]

But I also want to get rid of the nongreedy operator *?

So we will probably add another primitive: find substring.

I think our re2c code gen can be modified to handle that too.
  • Loading branch information
Andy C committed Jan 5, 2025
1 parent a40c0eb commit 10ec7ae
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 10 deletions.
8 changes: 4 additions & 4 deletions data_lang/TEST.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
# Usage:
# data_lang/TEST.sh <function name>

set -o nounset
set -o pipefail
set -o errexit
: ${LIB_OSH=stdlib/osh}
source $LIB_OSH/bash-strict.sh
source $LIB_OSH/task-five.sh

REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)

Expand All @@ -32,4 +32,4 @@ unit() {
done
}

"$@"
task-five "$@"
38 changes: 38 additions & 0 deletions data_lang/htm8-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env bash
#
# Usage:
# data_lang/htm8-test.sh

# Default the library directory to a repo-relative path; a LIB_OSH that is
# already set in the environment is left untouched.
: ${LIB_OSH=stdlib/osh}
# NOTE(review): presumably turns on the strict shell options (nounset,
# pipefail, errexit) - confirm in bash-strict.sh.
source $LIB_OSH/bash-strict.sh
# NOTE(review): presumably defines the task-five function invoked on the last
# line of this script - confirm in task-five.sh.
source $LIB_OSH/task-five.sh

# parse with lazylex/html.py, or data_lang/htm8.py

site-files() {
  # Print the path of every *.html file under the deployed site checkout,
  # which is expected to live two directories above the current one.
  local deploy_dir=../../oilshell/oilshell.org__deploy

  find "$deploy_dir" -name '*.html'
}

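# Issues with lazylex/html.py
#
# - Token ID is annoying to express in Python
# - re.DOTALL is needed so that . also matches newlines
# - can we drop it by using (?:.|\n) or [\s\S] instead?
# (note: [.\n] only matches a literal dot or a newline)
# - nongreedy match (*?) is needed to find the closing --> and ?>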


test-site() {
# Count the lines of every HTML file printed by site-files, by feeding the
# paths to 'wc -l' through xargs.
#
# 1.5 M lines of HTML - takes 3 xargs invocations!
# (i.e. the file list is long enough that xargs had to run wc three times)
#
# TODO:
# - test that it lexes
# - test that tags are balanced

site-files | xargs wc -l
}

test-wwz() {
  # Placeholder: only prints a reminder; no .wwz archive is fetched yet.
  printf '%s\n' 'TODO: download .wwz from CI'
}

# NOTE(review): task-five is not defined in this file; presumably it comes
# from task-five.sh and runs the function named by the first argument - confirm.
task-five "$@"
17 changes: 11 additions & 6 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,10 +107,7 @@ def TokenName(tok_id):


def MakeLexer(rules):
return [
# DOTALL is for the comment
(re.compile(pat, re.VERBOSE | re.DOTALL), i) for (pat, i) in rules
]
return [(re.compile(pat, re.VERBOSE), i) for (pat, i) in rules]


#
Expand Down Expand Up @@ -152,8 +149,16 @@ def MakeLexer(rules):
# https://news.ycombinator.com/item?id=27099798
#
# Maybe try combining all of these for speed.
(r'<!-- .*? -->', Tok.Comment),
(r'<\? .*? \?>', Tok.Processing),

# . is any char except newline
# https://re2c.org/manual/manual_c.html

# Hack from Claude: \s\S instead of re.DOTALL. I don't like this
#(r'<!-- [\s\S]*? -->', Tok.Comment),
(r'<!-- (?:.|[\n])*? -->', Tok.Comment),

#(r'<!-- .*? -->', Tok.Comment),
(r'<\? (?:.|\n)*? \?>', Tok.Processing),

# NOTE: < is allowed in these.
(r'<! [^>]+ >', Tok.Decl), # <!DOCTYPE html>
Expand Down
46 changes: 46 additions & 0 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,28 @@ def _PrintTokens(lex):
log('%s %r', tok, lex.s[start:end])


class RegexTest(unittest.TestCase):
    """Pin down the `re` behaviors that the HTML lexer patterns depend on."""

    def testDotAll(self):
        """Check how '.', re.DOTALL, and character classes treat a newline.

        Each match object is still printed for eyeballing, but the expected
        outcome is also asserted so that the test can actually fail.
        """
        import re

        # Without re.DOTALL, '.' matches any character EXCEPT a newline.
        p1 = re.compile(r'.')
        m1 = p1.match('\n')
        print(m1)
        self.assertIsNone(m1)

        # With re.DOTALL, '.' matches a newline as well.
        p2 = re.compile(r'.', re.DOTALL)
        m2 = p2.match('\n')
        print(m2)
        self.assertIsNotNone(m2)

        # Inside a character class, '.' is a literal dot.  So [.\n] matches a
        # dot or a newline, but it is NOT a substitute for "any character".
        #p3 = re.compile(r'[.\n]', re.VERBOSE)
        p3 = re.compile(r'[.\n]')
        m3 = p3.match('\n')
        print(m3)
        self.assertIsNotNone(m3)
        self.assertIsNotNone(p3.match('.'))
        self.assertIsNone(p3.match('a'))

        print('Negation')

        # A negated character class matches a newline without re.DOTALL.
        p4 = re.compile(r'[^>]')
        m4 = p4.match('\n')
        print(m4)
        self.assertIsNotNone(m4)
        self.assertIsNone(p4.match('>'))


class TagLexerTest(unittest.TestCase):

def testTagLexer(self):
Expand Down Expand Up @@ -94,6 +116,30 @@ def testCommentParse(self):
raise RuntimeError()
print(tok_id)

Tok = html.Tok
h = '''
hi <!-- line 1
line 2 --><br/>'''
print(repr(h))
lex = html.ValidTokens(h)

tok_id, pos = next(lex)
self.assertEqual(12, pos)
self.assertEqual(Tok.RawData, tok_id)

tok_id, pos = next(lex)
log('tok %r', html.TokenName(tok_id))
self.assertEqual(50, pos)
self.assertEqual(Tok.Comment, tok_id)

tok_id, pos = next(lex)
self.assertEqual(55, pos)
self.assertEqual(Tok.StartEndTag, tok_id)

tok_id, pos = next(lex)
self.assertEqual(55, pos)
self.assertEqual(Tok.EndOfStream, tok_id)

def testValid(self):
Tok = html.Tok

Expand Down

0 comments on commit 10ec7ae

Please sign in to comment.