Skip to content

Commit

Permalink
[lazylex/html] Validate our HTML in Soil
Browse files Browse the repository at this point in the history
All errors have been fixed!

Refactor into Validate() function.  Still needs more polish.
  • Loading branch information
Andy C committed Jan 11, 2025
1 parent 72d3784 commit 708b785
Show file tree
Hide file tree
Showing 4 changed files with 167 additions and 98 deletions.
72 changes: 53 additions & 19 deletions data_lang/htm8-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,42 @@
# data_lang/htm8-test.sh
#
# TODO:
# - Rename to DML8? Because it can handle XML
# - CDATA in XML, which is not a script
# - Refactor Validate(): take FLAGS, return stats optionally
# - add LEX_QUOTED_VALUES
# - and then re-run all the tests
# - Rename to data_lang/htm8.py
# - it has NO_SPECIAL_TAGS mode for XML
#
# Operations / Levels:
# - Soil
# - Validate all the HTML in the repo - well-formed check
# - this should go in the CI
# - Automate some more tests:
# - site oils.pub, site oilshell.org
# - XML on my machine - turn that into a 'WILD' corpus for HTML/XML?
#
# - Lexing
# - lex-tags
# - lex-attrs - validate all Start tags, all StartEnd tags
# - lex-quoted-values - unescaping, etc.
# - are there invalid entities?
# - Parsing
# - well-formed / tag balance check
# - Schema
# - not sure if we check the HTML schema or not - it might be too restrictive
# - statically type it
# - revive pyannotate
# - translate to C++
# - what to do about all the regexes? Port to re2c directly?
# - for find(), do we need a C++ primitive for it?
# - no allocation for TagName()
# - ASDL file for Tok.Foo?
# - refactor TagName() API - remove it from the TagLexer?
# - that is really the AttrLexer()
#
# - build a DOM with objects in YSH?
# - rewrite ul-table in that?
#
# YSH API
# - Generating HTML/HTM8 is much more common than parsing it
# - although maybe we can do RemoveComments as a demo?
# - that is the lowest level "sed" model
# - For parsing, a minimum idea is:
# - lexer-based algorithms for query by tag, class name, and id
# - and then toTree()
# - .tag and .attrs?
# - .innerHTML() and .outerHTML() perhaps
# - and maybe you can mutate it directly

REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)

Expand All @@ -42,7 +64,7 @@ site-files() {
# - can we change that with [.\n]*?
# - nongreedy match for --> and ?>

ht8-tool() {
htm8-tool() {
PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
$REPO_ROOT/lazylex/html.py "$@"
}
Expand All @@ -52,7 +74,7 @@ test-well-formed() {
unfinished <!--
hi && bye
EOF
echo '_tmp/bad.html' | ht8-tool well-formed
echo '_tmp/bad.html' | htm8-tool well-formed
}

# site errors
Expand Down Expand Up @@ -86,12 +108,12 @@ test-site() {
# site-files | xargs wc -l | grep total

# Not using xargs
time site-files | $REPO_ROOT/$0 ht8-tool well-formed
time site-files | $REPO_ROOT/$0 htm8-tool validate

popd
}

readonly SOIL_ID=8915
readonly SOIL_ID=8917
readonly WWZ_DIR=_tmp/$SOIL_ID

sync-wwz() {
Expand Down Expand Up @@ -123,7 +145,7 @@ tree-wwz() {
test-wwz() {
pushd $WWZ_DIR

time find . -name '*.html' | $REPO_ROOT/$0 ht8-tool well-formed
time find . -name '*.html' | $REPO_ROOT/$0 htm8-tool validate

popd
}
Expand All @@ -135,13 +157,25 @@ find-xml() {
test-other-xml() {
# problem with &ent1;
# CDATA support! haha OK
time cat _tmp/xml-files.txt | $REPO_ROOT/$0 ht8-tool well-formed
time cat _tmp/xml-files.txt | $REPO_ROOT/$0 htm8-tool validate
}

test-repo-xml() {
# OK these parse
time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
| $REPO_ROOT/$0 ht8-tool well-formed
| $REPO_ROOT/$0 htm8-tool validate
}

test-repo-html() {
  # List every *.html file below the current directory and pipe the paths to
  # the 'validate' action, which reads one file name per line from stdin.
  time find . -name '*.html' \
    | $REPO_ROOT/$0 htm8-tool validate
}

test-docs() {
  # Validate the *.html files found under _release/VERSION.  The paths are
  # piped to the 'validate' action, one file name per line.
  time find _release/VERSION -name '*.html' \
    | $REPO_ROOT/$0 htm8-tool validate
}

# Entry point named by the 'check-docs' task in soil/worker.sh.  It only runs
# test-docs, i.e. it validates the HTML under _release/VERSION.
soil-run() {
test-docs
}

# OK we have to skip the <script> tag! And <style>
Expand Down
190 changes: 112 additions & 78 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@
See lazylex/README.md for details.
TODO: This should be an Oils library eventually. It's a "lazily-parsed data
structure" like TSV8
Conflicts between HTML5 and XML:
- In XML, <source> is like any tag, and must be closed,
Expand All @@ -18,6 +15,12 @@
- The header is different - <!DOCTYPE html> vs. <?xml version= ... ?>
So do we have a mode for <script> <style> and void tags? Upgrade HX8 into HTM8?
TODO:
- Are there special rules for <svg> and <math>?
- Do we need to know about <textarea> <pre>? Those don't have the same
whitespace rules
"""
from __future__ import print_function

Expand All @@ -29,7 +32,7 @@
import sys

if sys.version_info.major == 2:
from typing import List, Tuple, Optional
from typing import List, Tuple, Optional, Dict


def log(msg, *args):
Expand Down Expand Up @@ -689,6 +692,81 @@ def ToText(s, left_pos=0, right_pos=-1):
'wbr',
]

# Bit flags for the `flags` argument of Validate(); combine them with |.
LEX_ATTRS = 1 << 1
LEX_QUOTED_VALUES = 1 << 2 # href="?x=42&amp;y=99"
NO_SPECIAL_TAGS = 1 << 3 # <script> <style>, VOID tags, etc.
CHECK_TAGS = 1 << 4 # balancing tags


def Validate(contents, flags, counters=None):
    # type: (str, int, Optional[Counters]) -> None
    """Lex a whole document and check that its tags are balanced.

    Args:
      contents: full text of the document to validate.
      flags: bitwise OR of LEX_ATTRS, LEX_QUOTED_VALUES, NO_SPECIAL_TAGS and
        CHECK_TAGS.  It is accepted but not consulted yet: attribute lexing
        and the tag balance check always run.
      counters: optional Counters instance to accumulate statistics into.
        Tag and attribute counts are updated while tokens are read, so they
        include the work done before an error is raised.  num_tokens is only
        updated when the whole document validates.  When omitted, the
        statistics are discarded.

    Raises:
      LexError: the lexer produced Tok.Invalid.
      ParseError: a closing tag appeared with no open tag, or it does not
        match the most recently opened tag.
    """
    if counters is None:
        # The parameter is documented as optional, so don't crash on the
        # default; count into a throwaway object instead.
        counters = Counters()

    tag_lexer = TagLexer(contents)
    lx = Lexer(contents)
    num_tokens = 0
    start_pos = 0
    tag_stack = []
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(contents, start_pos)
        if tok_id == Tok.EndOfStream:
            break

        num_tokens += 1

        if tok_id == Tok.StartEndTag:
            counters.num_start_end_tags += 1
            # Lex the attributes of the tag that spans [start_pos, end_pos)
            tag_lexer.Reset(start_pos, end_pos)
            counters.num_attrs += len(tag_lexer.AllAttrsRaw())

        elif tok_id == Tok.StartTag:
            counters.num_start_tags += 1
            tag_lexer.Reset(start_pos, end_pos)
            counters.num_attrs += len(tag_lexer.AllAttrsRaw())

            tag_name = lx.TagName()
            # Tags listed in VOID_ELEMENTS are not pushed, so no closing tag
            # is expected for them.
            if tag_name not in VOID_ELEMENTS:
                tag_stack.append(tag_name)

            counters.max_tag_stack = max(counters.max_tag_stack,
                                         len(tag_stack))

        elif tok_id == Tok.EndTag:
            try:
                expected = tag_stack.pop()
            except IndexError:
                raise ParseError('Tag stack empty',
                                 s=contents,
                                 start_pos=start_pos)

            actual = lx.TagName()
            if expected != actual:
                raise ParseError(
                    'Got unexpected closing tag %r; opening tag was %r' %
                    (contents[start_pos:end_pos], expected),
                    s=contents,
                    start_pos=start_pos)

        start_pos = end_pos

    # NOTE: tags that are still open at EndOfStream are not reported.

    # Only reached when no error was raised, so failed documents don't
    # contribute to the token total.
    counters.num_tokens += num_tokens


class Counters(object):
    """Statistics accumulated by Validate(); every field starts at zero."""

    def __init__(self):
        # Totals of tokens and attributes seen
        self.num_tokens = self.num_attrs = 0
        # Tag totals, split by token kind
        self.num_start_tags = self.num_start_end_tags = 0
        # Deepest tag stack seen so far
        self.max_tag_stack = 0


def main(argv):
action = argv[1]
Expand All @@ -711,98 +789,54 @@ def main(argv):

return 0

elif action in ('lex-tags', 'lex-attrs', 'lex-attr-values', 'well-formed'):
num_tokens = 0
num_start_tags = 0
num_start_end_tags = 0
num_attrs = 0
max_tag_stack = 0
elif action == 'validate':

errors = []
counters = Counters()

i = 0
for line in sys.stdin:
name = line.strip()
with open(name) as f:
filename = line.strip()
with open(filename) as f:
contents = f.read()

tag_lexer = TagLexer(contents)
lx = Lexer(contents)
tokens = []
start_pos = 0
tag_stack = []
# TODO: xml version with NO_SPECIAL_TAGS
try:
while True:
tok_id, end_pos = lx.Read()

if tok_id == Tok.Invalid:
raise LexError(contents, start_pos)
if tok_id == Tok.EndOfStream:
break

tokens.append((tok_id, end_pos))

if tok_id == Tok.StartEndTag:
num_start_end_tags += 1
if action in ('lex-attrs', 'lex-attr-values',
'well-formed'):
tag_lexer.Reset(start_pos, end_pos)
all_attrs = tag_lexer.AllAttrsRaw()
num_attrs += len(all_attrs)
elif tok_id == Tok.StartTag:
num_start_tags += 1
if action in ('lex-attrs', 'lex-attr-values',
'well-formed'):
tag_lexer.Reset(start_pos, end_pos)
all_attrs = tag_lexer.AllAttrsRaw()

tag_name = lx.TagName()
# Don't bother to check
if tag_name not in VOID_ELEMENTS:
tag_stack.append(tag_name)

max_tag_stack = max(max_tag_stack, len(tag_stack))
elif tok_id == Tok.EndTag:
try:
expected = tag_stack.pop()
except IndexError:
raise ParseError('Tag stack empty',
s=contents,
start_pos=start_pos)

actual = lx.TagName()
if expected != actual:
raise ParseError(
'Got unexpected closing tag %r; opening tag was %r'
% (contents[start_pos:end_pos], expected),
s=contents,
start_pos=start_pos)

start_pos = end_pos
Validate(contents, LEX_ATTRS | LEX_QUOTED_VALUES | CHECK_TAGS,
counters)
except LexError as e:
log('Lex error in %r: %s', name, e)
errors.append((name, e))
log('Lex error in %r: %s', filename, e)
errors.append((filename, e))
except ParseError as e:
log('Parse error in %r: %s', name, e)
errors.append((name, e))
else:
num_tokens += len(tokens)

#print('%d %s' % (len(tokens), name))
log('Parse error in %r: %s', filename, e)
errors.append((filename, e))
i += 1

log('')
log(
' %d tokens, %d start/end tags, %d start tags, %d attrs, %d max tag stack depth in %d files',
num_tokens, num_start_end_tags, num_start_tags, num_attrs,
max_tag_stack, i)
counters.num_tokens, counters.num_start_end_tags,
counters.num_start_tags, counters.num_attrs,
counters.max_tag_stack, i)
log(' %d errors', len(errors))
if 0:
for name, e in errors:
log('Error in %r: %s', name, e)
if len(errors):
return 1
return 0

elif action == 'todo':
# Other algorithms:
#
# - select first subtree with given ID
# - this requires understanding the void tags I suppose
# - select all subtrees that have a class
# - materialize DOM

# Safe-HTM8? This is a filter
return 0

else:
raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
main(sys.argv)
sys.exit(main(sys.argv))
2 changes: 1 addition & 1 deletion soil/web-init.sh
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ deploy-data() {
soil-web-manifest() {
PYTHONPATH=. /usr/bin/env python2 \
build/dynamic_deps.py py-manifest soil.web \
| grep oilshell/oil # only stuff in the repo
| grep oils-for-unix/oils # only stuff in the repo

# Add a shell script
echo $PWD/soil/web.sh soil/web.sh
Expand Down
1 change: 1 addition & 0 deletions soil/worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,7 @@ osh-usage test/osh-usage.sh soil-run -
tools-deps test/tools-deps.sh soil-run -
docs build/doc.sh soil-run _release/VERSION/index.html
doc-metrics echo no-op _release/VERSION/doc/metrics.txt
check-docs data_lang/htm8-test.sh soil-run -
EOF
# doc-metrics is a no-op, just for the link. Because soil-run just runs the
# release, which creates metrics.
Expand Down

0 comments on commit 708b785

Please sign in to comment.