Skip to content

Commit

Permalink
[lazylex/html] Do more validation
Browse files Browse the repository at this point in the history
- We can parse attributes
- We still need another layer of validation on quoted attr values
- Started balanced tags check - many errors here in HTML, but not XML!
  • Loading branch information
Andy C committed Jan 10, 2025
1 parent adf80da commit 5ba4f20
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 5 deletions.
2 changes: 1 addition & 1 deletion data_lang/htm8-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ tree-wwz() {
test-wwz() {
pushd $WWZ_DIR

find . -name '*.html' | $REPO_ROOT/$0 ht8-tool well-formed
time find . -name '*.html' | $REPO_ROOT/$0 ht8-tool well-formed

popd
}
Expand Down
53 changes: 49 additions & 4 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,28 +594,73 @@ def ToText(s, left_pos=0, right_pos=-1):
def main(argv):
action = argv[1]

if action == 'well-formed':
if action in ('lex-tags', 'lex-attrs', 'lex-attr-values', 'well-formed'):
num_tokens = 0
num_start_tags = 0
num_start_end_tags = 0
num_attrs = 0

errors = []
i = 0
for line in sys.stdin:
name = line.strip()
with open(name) as f:
contents = f.read()

tag_lexer = TagLexer(contents)
lx = ValidTokens(contents)
tokens = []
start_pos = 0
tag_stack = []
try:
tokens = list(lx)
for tok_id, end_pos in lx:
tokens.append((tok_id, end_pos))
if tok_id == Tok.StartEndTag:
num_start_end_tags += 1
if action in ('lex-attrs', 'lex-attr-values',
'well-formed'):
tag_lexer.Reset(start_pos, end_pos)
all_attrs = tag_lexer.AllAttrsRaw()
num_attrs += len(all_attrs)
elif tok_id == Tok.StartTag:
num_start_tags += 1
if action in ('lex-attrs', 'lex-attr-values',
'well-formed'):
tag_lexer.Reset(start_pos, end_pos)
all_attrs = tag_lexer.AllAttrsRaw()

# TODO: we need to get the tag name here
tag_stack.append('TODO')
elif tok_id == Tok.EndTag:
try:
expected = tag_stack.pop()
except IndexError:
raise ParseError('Tag stack empty')

# TODO: we need to get the tag name here
actual = 'TODO'
if expected != actual:
raise ParseError(
'Expected closing tag %r, got %r' %
(expected, actual))

start_pos = end_pos
except LexError as e:
log('Error in %r: %s', name, e)
log('Lex error in %r: %s', name, e)
errors.append((name, e))
except ParseError as e:
log('Parse error in %r: %s', name, e)
errors.append((name, e))
else:
num_tokens += len(tokens)

#print('%d %s' % (len(tokens), name))
i += 1

log('')
log(' %d tokens in %d files', num_tokens, i)
log(
' %d tokens, %d start/end tags, %d start tags, %d attrs in %d files',
num_tokens, num_start_end_tags, num_start_tags, num_attrs, i)
log(' %d errors', len(errors))
if 0:
for name, e in errors:
Expand Down
2 changes: 2 additions & 0 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,8 @@ def testInvalid(self):
INVALID = [
# Should be &amp;
'<a>&',
'&amp', # not finished
'&#', # not finished
# Hm > is allowed?
#'a > b',
'a < b',
Expand Down

0 comments on commit 5ba4f20

Please sign in to comment.