[lazylex/html] Options for lexing/parsing
- NO_SPECIAL_TAGS: this is XML mode, basically
- BALANCED_TAGS: makes the balanced-tags check optional, so we can skip it
  - e.g. for old HTML with bugs, which I fixed at HEAD
Andy C committed Jan 11, 2025
1 parent 708b785 commit eeecaf6
Showing 3 changed files with 126 additions and 69 deletions.
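In short, Validate() is now driven by bit flags, and the new htm8-tool actions compose them. A minimal sketch of the three modes, using names from the diff below (the import path is an assumption; Validate() raises LexError or ParseError on bad input):

    from lazylex import html  # assumed import path for lazylex/html.py

    contents = '<p>hello <b>world</b></p>'
    base = html.LEX_ATTRS | html.LEX_QUOTED_VALUES

    # lex-htm8: lex tags, attributes, and quoted values; no balance check
    html.Validate(contents, base, html.Counters())

    # parse-htm8: also check that tags are balanced
    html.Validate(contents, base | html.BALANCED_TAGS, html.Counters())

    # parse-xml: additionally treat <script> and <style> as ordinary elements
    html.Validate(contents,
                  base | html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
                  html.Counters())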
57 changes: 36 additions & 21 deletions data_lang/htm8-test.sh
@@ -4,42 +4,53 @@
# data_lang/htm8-test.sh
#
# TODO:
# - Refactor Validate(): take FLAGS, return stats optionally
# - add LEX_QUOTED_VALUES
# - and then re-run all the tests
# - Rename to data_lang/htm8.py
# - it has NO_SPECIAL_TAGS mode for XML
#
# - Soil
# - Validate all the HTML in the repo - well-formed check
# - this should go in the CI
# - Automate some more tests:
# - Validate()
# - add LEX_QUOTED_VALUES, along with counter for it
# - and then re-run all the tests - make sure they pass
# - site oils.pub, site oilshell.org
# - XML on my machine - turn that into a 'WILD' corpus for HTML/XML?
# - Rename to data_lang/htm8.py
# - it has NO_SPECIAL_TAGS mode for XML
# - put iterators at a higher level in doctools/ ?
#
# - statically type it
# - revive pyannotate
# - translate to C++
-# - what to do about all the regexes? Port to re2c directly?
+# - how to handle the regexes in the lexer? Port to re2c directly?
# - for find(), do we need a C++ primitive for it?
# - no allocation for TagName()
# - ASDL file for Tok.Foo?
# - refactor TagName() API - remove it from the TagLexer?
# - that is really the AttrLexer()
#
# - build a DOM with objects in YSH?
# - rewrite ul-table in that?
# Not working yet:
# - understanding all entities &zz;
# - there are over 2000 of them, not sure I want to build them all into the Oils binaries
# - capital letters <TR/> - I guess we can normalize the case
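#   (For scale, an aside not from this commit: Python's stdlib ships the
#   full HTML5 entity table, so one way to count the names is
#
#       python3 -c 'import html.entities; print(len(html.entities.html5))'
#
#   which prints 2231 - consistent with "over 2000" above.)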
#
# YSH API
# - Generating HTML/HTM8 is much more common than parsing it
# - although maybe we can do RemoveComments as a demo?
# - that is the lowest level "sed" model
# - For parsing, a minimum idea is:
# - lexer-based algorithms for query by tag, class name, and id
-#   - and then toTree()
+#   - and then toTree() - this is a DOM
# - .tag and .attrs?
# - .innerHTML() and .outerHTML() perhaps
# - and maybe you can mutate it directly
# - rewrite ul-table in that?
# - does that mean you mutate it, or construct text?
# - I think you can set the innerHTML probably
#
# - Testing of html.ysh aka htm8.ysh in the stdlib
#
# Cases:
# html 'hello <b>world</b>'
# html "hello <b>$name</b>"html
# html ["hello <b>$name</b>"] # hm this isn't bad, it's an unevaluated expression?
# commonmark 'hello **world**'
# md 'hello **world**'
# md ['hello **$escape**'] ? We don't have a good escaping algorithm


REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)

@@ -96,10 +107,14 @@ test-site() {
# - test that each quoted attribute lexes
# - test that tags are balanced

+  local dir
+  local action
   if test -n "$new_site"; then
     dir='../oils.pub__deploy'
+    action='parse-htm8'
   else
     dir='../../oilshell/oilshell.org__deploy'
+    action='lex-htm8'
   fi

pushd $dir
@@ -108,7 +123,7 @@ test-site() {
# site-files | xargs wc -l | grep total

# Not using xargs
-  time site-files | $REPO_ROOT/$0 htm8-tool validate
+  time site-files | $REPO_ROOT/$0 htm8-tool $action

popd
}
@@ -145,7 +160,7 @@ tree-wwz() {
test-wwz() {
pushd $WWZ_DIR

-  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool validate
+  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool parse-htm8

popd
}
@@ -157,21 +172,21 @@ find-xml() {
test-other-xml() {
# problem with &ent1;
# CDATA support! haha OK
-  time cat _tmp/xml-files.txt | $REPO_ROOT/$0 htm8-tool validate
+  time cat _tmp/xml-files.txt | $REPO_ROOT/$0 htm8-tool parse-xml
}

test-repo-xml() {
# OK these parse
time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
-    | $REPO_ROOT/$0 htm8-tool validate
+    | $REPO_ROOT/$0 htm8-tool parse-xml
}

test-repo-html() {
-  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool validate
+  time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool parse-htm8
}

test-docs() {
-  time find _release/VERSION -name '*.html' | $REPO_ROOT/$0 htm8-tool validate
+  time find _release/VERSION -name '*.html' | $REPO_ROOT/$0 htm8-tool parse-htm8
}
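
# The test functions above exercise the new htm8-tool actions end to end.
# Roughly, the intended invocations are (assuming htm8-tool forwards to
# lazylex/html.py's main(), as the $REPO_ROOT/$0 calls suggest; filenames
# are read from stdin):
#
#   echo foo.html | data_lang/htm8-test.sh htm8-tool lex-htm8    # lex only
#   echo foo.html | data_lang/htm8-test.sh htm8-tool parse-htm8  # + BALANCED_TAGS
#   echo foo.xml  | data_lang/htm8-test.sh htm8-tool parse-xml   # + NO_SPECIAL_TAGS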

soil-run() {
106 changes: 68 additions & 38 deletions lazylex/html.py
@@ -267,10 +267,12 @@ def MakeLexer(rules):

class Lexer(object):

-    def __init__(self, s, left_pos=0, right_pos=-1):
+    def __init__(self, s, left_pos=0, right_pos=-1, no_special_tags=False):
         self.s = s
         self.pos = left_pos
         self.right_pos = len(s) if right_pos == -1 else right_pos
+        self.no_special_tags = no_special_tags
+
         self.cache = {}  # string -> compiled regex pattern object

# either </script> or </style> - we search until we see that
@@ -292,7 +294,7 @@ def _Peek(self):

assert self.pos < self.right_pos, self.pos

-        if self.search_state is not None:
+        if self.search_state is not None and not self.no_special_tags:
pos = self.s.find(self.search_state, self.pos)
if pos == -1:
# unterminated <script> or <style>
@@ -353,13 +355,15 @@ def TagNameEquals(self, expected):
assert self.tag_pos_right != -1, self.tag_pos_right

# TODO: In C++, this does not need an allocation
+        # TODO: conditionally lower() case here (maybe not in XML mode)
return expected == self.s[self.tag_pos_left:self.tag_pos_right]

def TagName(self):
# type: () -> str
assert self.tag_pos_left != -1, self.tag_pos_left
assert self.tag_pos_right != -1, self.tag_pos_right

+        # TODO: conditionally lower() case here (maybe not in XML mode)
return self.s[self.tag_pos_left:self.tag_pos_right]

def Read(self):
@@ -409,6 +413,23 @@ def ValidTokens(s, left_pos=0, right_pos=-1):
pos = end_pos


+def ValidTokenList(s, no_special_tags=False):
+    """A wrapper that can be more easily translated to C++. Doesn't use iterators."""
+
+    start_pos = 0
+    tokens = []
+    lx = Lexer(s, no_special_tags=no_special_tags)
+    while True:
+        tok_id, end_pos = lx.Read()
+        tokens.append((tok_id, end_pos))
+        if tok_id == Tok.EndOfStream:
+            break
+        if tok_id == Tok.Invalid:
+            raise LexError(s, start_pos)
+        start_pos = end_pos
+    return tokens
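
An aside, not part of the diff: for an input like '<p>hi</p>', ValidTokenList() returns (tok_id, end_pos) pairs along the lines of

    [(Tok.StartTag, 3), (Tok.RawData, 5), (Tok.EndTag, 9), (Tok.EndOfStream, 9)]

where each end_pos is an offset into s, and LexError is raised on invalid input. The exact ids follow the token rules elsewhere in this file, so treat the values as illustrative.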


# Tag names:
# Match <a or </a
# Match <h2, but not <2h
@@ -695,16 +716,15 @@ def ToText(s, left_pos=0, right_pos=-1):
LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2 # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3 # <script> <style>, VOID tags, etc.
-CHECK_TAGS = 1 << 4  # balancing tags
+BALANCED_TAGS = 1 << 4  # are tags balanced?


-def Validate(contents, flags, counters=None):
-    # type: (str, int, Optional[Dict[str, int]]) -> None
-
-    action = 'well-formed'
+def Validate(contents, flags, counters):
+    # type: (str, int, Counters) -> None

tag_lexer = TagLexer(contents)
-    lx = Lexer(contents)
+    no_special_tags = bool(flags & NO_SPECIAL_TAGS)
+    lx = Lexer(contents, no_special_tags=no_special_tags)
tokens = []
start_pos = 0
tag_stack = []
@@ -720,39 +740,45 @@ def Validate(contents, flags, counters=None):

if tok_id == Tok.StartEndTag:
counters.num_start_end_tags += 1
-            if action in ('lex-attrs', 'lex-attr-values', 'well-formed'):
-                tag_lexer.Reset(start_pos, end_pos)
-                all_attrs = tag_lexer.AllAttrsRaw()
-                counters.num_attrs += len(all_attrs)
+
+            tag_lexer.Reset(start_pos, end_pos)
+            all_attrs = tag_lexer.AllAttrsRaw()
+            counters.num_attrs += len(all_attrs)

elif tok_id == Tok.StartTag:
counters.num_start_tags += 1
-            if action in ('lex-attrs', 'lex-attr-values', 'well-formed'):
-                tag_lexer.Reset(start_pos, end_pos)
-                all_attrs = tag_lexer.AllAttrsRaw()
-                counters.num_attrs += len(all_attrs)
+
+            tag_lexer.Reset(start_pos, end_pos)
+            all_attrs = tag_lexer.AllAttrsRaw()
+            counters.num_attrs += len(all_attrs)

-            tag_name = lx.TagName()
-            # Don't bother to check
-            if tag_name not in VOID_ELEMENTS:
-                tag_stack.append(tag_name)
-
-            counters.max_tag_stack = max(counters.max_tag_stack,
-                                         len(tag_stack))
+            if flags & BALANCED_TAGS:
+                tag_name = lx.TagName()
+                if flags & NO_SPECIAL_TAGS:
+                    tag_stack.append(tag_name)
+                else:
+                    # e.g. <meta> is considered self-closing, like <meta/>
+                    if tag_name not in VOID_ELEMENTS:
+                        tag_stack.append(tag_name)
+
+                counters.max_tag_stack = max(counters.max_tag_stack,
+                                             len(tag_stack))
elif tok_id == Tok.EndTag:
-            try:
-                expected = tag_stack.pop()
-            except IndexError:
-                raise ParseError('Tag stack empty',
-                                 s=contents,
-                                 start_pos=start_pos)
-
-            actual = lx.TagName()
-            if expected != actual:
-                raise ParseError(
-                    'Got unexpected closing tag %r; opening tag was %r' %
-                    (contents[start_pos:end_pos], expected),
-                    s=contents,
-                    start_pos=start_pos)
+            if flags & BALANCED_TAGS:
+                try:
+                    expected = tag_stack.pop()
+                except IndexError:
+                    raise ParseError('Tag stack empty',
+                                     s=contents,
+                                     start_pos=start_pos)
+
+                actual = lx.TagName()
+                if expected != actual:
+                    raise ParseError(
+                        'Got unexpected closing tag %r; opening tag was %r' %
+                        (contents[start_pos:end_pos], expected),
+                        s=contents,
+                        start_pos=start_pos)

start_pos = end_pos
counters.num_tokens += len(tokens)
@@ -789,21 +815,25 @@ def main(argv):

return 0

-    elif action == 'validate':
+    elif action in ('lex-htm8', 'parse-htm8', 'parse-xml'):

errors = []
counters = Counters()

+        flags = LEX_ATTRS | LEX_QUOTED_VALUES
+        if action.startswith('parse-'):
+            flags |= BALANCED_TAGS
+        if action == 'parse-xml':
+            flags |= NO_SPECIAL_TAGS

i = 0
for line in sys.stdin:
filename = line.strip()
with open(filename) as f:
contents = f.read()

-            # TODO: xml version with NO_SPECIAL_TAGS
             try:
-                Validate(contents, LEX_ATTRS | LEX_QUOTED_VALUES | CHECK_TAGS,
-                         counters)
+                Validate(contents, flags, counters)
except LexError as e:
log('Lex error in %r: %s', filename, e)
errors.append((filename, e))
32 changes: 22 additions & 10 deletions lazylex/html_test.py
@@ -105,10 +105,9 @@ def testAllAttrs(self):
self.assertEqual([('href', '?foo=1&amp;bar=2')], lex.AllAttrsRaw())


-def Lex(h):
+def Lex(h, no_special_tags=False):
     print(repr(h))
-    lex = html.ValidTokens(h)
-    tokens = list(lex)
+    tokens = html.ValidTokenList(h, no_special_tags=no_special_tags)
start_pos = 0
for tok_id, end_pos in tokens:
frag = h[start_pos:end_pos]
@@ -132,10 +131,7 @@ def testPstrip(self):

def testCommentParse(self):
n = len(TEST_HTML)
-        for tok_id, end_pos in html._Tokens(TEST_HTML, 0, n):
-            if tok_id == html.Invalid:
-                raise RuntimeError()
-            print(tok_id)
+        tokens = Lex(TEST_HTML)

def testCommentParse2(self):

@@ -187,6 +183,24 @@ def testScriptStyle(self):
],
tokens)

+    def testScriptStyleXml(self):
+        Tok = html.Tok
+        h = 'hi <script src=""> &lt; </script>'
+        # XML mode: no special raw-text treatment for <script>
+        tokens = Lex(h, no_special_tags=True)
+
+        self.assertEqual(
+            [
+                (Tok.RawData, 3),  # 'hi '
+                (Tok.StartTag, 18),  # <script src="">
+                (Tok.RawData, 19),  # space
+                (Tok.CharEntity, 23),  # &lt;
+                (Tok.RawData, 24),  # space
+                (Tok.EndTag, 33),  # </script>
+                (Tok.EndOfStream, 33),
+            ],
+            tokens)

def testCData(self):
Tok = html.Tok

@@ -279,10 +293,8 @@ def testInvalid(self):
]

for s in INVALID:
-            lex = html.ValidTokens(s)
             try:
-                for i in xrange(5):
-                    tok_id, pos = next(lex)
+                tokens = html.ValidTokenList(s)
except html.LexError as e:
print(e)
else:
