Skip to content

Commit

Permalink
[data_lang] Add test for HTM8 (aka HT8)
Browse files Browse the repository at this point in the history
I might want to call it HT8 to reduce confusion.  HTM is a common file
extension.

I learned about rules for:

- <script> <style> <textarea>
- CDATA vs RCDATA

I think we may just skip those altogether.  That is, we can have special
lexing rules that treat their contents as opaque text: when we find one of
these tags, we just search for the matching closing </script> or </style>.
  • Loading branch information
Andy C committed Jan 6, 2025
1 parent 549fa1a commit 57b0b18
Show file tree
Hide file tree
Showing 2 changed files with 116 additions and 6 deletions.
86 changes: 80 additions & 6 deletions data_lang/htm8-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,20 @@
# Usage:
# data_lang/htm8-test.sh

# NOTE(review): this relative default is assigned first, so once it has run
# LIB_OSH is always set and the $REPO_ROOT-based default further down can
# never apply -- confirm which of the two defaults is intended.
: "${LIB_OSH=stdlib/osh}"

# Absolute path of the directory one level above this script's directory.
# "$0" is quoted so a path containing spaces or glob characters survives, and
# '&&' prevents a failed cd from silently yielding the caller's working
# directory instead.
REPO_ROOT=$(cd -- "$(dirname -- "$0")/.." && pwd) || exit 1

# Special case: we need $REPO_ROOT
#
# Default LIB_OSH to the stdlib/osh directory under $REPO_ROOT, but only when
# LIB_OSH is not already set.
: "${LIB_OSH=$REPO_ROOT/stdlib/osh}"

# Paths are quoted so a LIB_OSH value containing spaces is passed to 'source'
# as a single word.
source "$LIB_OSH/bash-strict.sh"
source "$LIB_OSH/task-five.sh"

# parse with lazylex/html.py, or data_lang/htm8.py

# Print the paths of the .html files to check, one per line, relative to the
# current directory.
site-files() {
# NOTE(review): this live find and its commented-out copy on the next line
# both appear here; as written, the find output is printed in addition to the
# git listing below -- confirm whether this find was meant to be removed.
find ../../oilshell/oilshell.org__deploy -name '*.html'
#find ../../oilshell/oilshell.org__deploy -name '*.html'

# omit all the _ files
# Only files tracked by git in the current repository are listed, filtered to
# names ending in .html.
git ls-files | grep '\.html$'
}

# Issues with lazylex/html.py
Expand All @@ -20,19 +26,87 @@ site-files() {
# - can we change that with [.\n]*?
# - nongreedy match for --> and ?>

# Run the lazylex/html.py command-line tool from this checkout.
#
# Globals:   REPO_ROOT (read)
# Arguments: forwarded unchanged to html.py (e.g. the action 'well-formed')
# Input:     stdin is inherited by the tool
#
# PYTHONPATH is set, for this one command only, to the repo root plus its
# vendor/ directory. The command path is quoted so that a REPO_ROOT containing
# spaces is not split into several words.
ht8-tool() {
  PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \
    "$REPO_ROOT/lazylex/html.py" "$@"
}

# Smoke test for the 'well-formed' action: write a tiny HTML fixture that
# contains a bare '&&' and pass its name to ht8-tool on stdin.
#
# Outputs: creates _tmp/bad.html relative to the current directory.
test-well-formed() {
  # The scratch directory may not exist yet (e.g. in a fresh checkout).
  mkdir -p _tmp

  # Quoted delimiter: the fixture is literal text, so no expansion is wanted.
  cat >_tmp/bad.html <<'EOF'
hi && bye
EOF
  echo '_tmp/bad.html' | ht8-tool well-formed
}

# site errors
#
# Error in 'release/0.7.pre5/doc/osh-quick-ref.html': (LexError '&& or ||</h4>\n<!-- 2')
# Error in 'src/data/symbol.html': (LexError "&& mangle[0]!='E' &&")
# 5833374 tokens in 4710 files
#
# The second is the "Woboq" browser, which has CDATA
# Ah I wonder if we need that.

# Takes ~13 seconds
# Run the 'well-formed' check over the HTML files of the deployed site
# checkout, which is expected at ../../oilshell/oilshell.org__deploy relative
# to the current directory.
test-site() {
# 1.5 M lines of HTML - takes 3 xargs invocations!
#
# TODO:
# - test that it lexes
# - test that the top level lexes
# - test that each tag lexes
# - test that each quoted attribute lexes
# - test that tags are balanced

# NOTE(review): this line count runs before the pushd below, i.e. from the
# caller's directory, and a commented-out variant of it appears further down
# -- confirm whether it is still wanted.
site-files | xargs wc -l
pushd ../../oilshell/oilshell.org__deploy

# Too many files
# site-files | xargs wc -l | grep total

# Not using xargs
# The file list is piped on stdin to a fresh invocation of this script, which
# is asked to run ht8-tool. $REPO_ROOT is prefixed because the working
# directory has changed; this assumes $0 is a path relative to the repo root
# -- TODO confirm.
time site-files | $REPO_ROOT/$0 ht8-tool well-formed

popd
}

# Placeholder for a future .wwz check: it only prints a reminder line.
test-wwz() {
  printf '%s\n' 'TODO: download .wwz from CI'
}

# Hand the command-line arguments to task-five (defined outside this file;
# presumably it runs the function named by the first argument, which is how
# test-site re-invokes this script with 'ht8-tool well-formed' -- confirm in
# task-five.sh).
task-five "$@"
# Stop here unconditionally: nothing below this line is ever executed.
exit


# Unreachable design notes on <script>/<style>/<textarea> and CDATA vs. RCDATA,
# kept as a string literal rather than as comments.
# NOTE(review): the lone single quote on the last line opens a string that is
# never closed; it sits after the exit above -- confirm whether it is
# intentional.
echo '
In HTML5, instead of
<script>
<![CDATA[
if (x < y) { ... }
]]>
</script>
You can write
<script>
if (x < y) { ... }
</script>
<script> <style> <textarea>
These have special escaping rules. I guess we just do NOT lex them at all?
We can totally SKIP them.
CDATA vs. RCDATA
<textarea>
&lt;p&gt; <!-- This will show as: <p> -->
&amp; <!-- This will show as: & -->
</textarea>
<script>
&lt;p&gt; <!-- This will show literally as: &lt;p&gt; -->
&amp; <!-- This will show literally as: &amp; -->
</script>
The main practical difference is that RCDATA processes HTML entities while
CDATA treats them as literal text. Both modes ignore HTML tags (treating them
as plain text) except for their own closing tag. '
'
36 changes: 36 additions & 0 deletions lazylex/html.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -515,3 +515,39 @@ def ToText(s, left_pos=0, right_pos=-1):

out.PrintTheRest()
return f.getvalue()


def main(argv):
    """Command-line entry point.

    argv[1] selects the action.  The only action is 'well-formed': read file
    names from stdin (one per line), lex each file's contents with
    ValidTokens, log every LexError together with the file name, and finally
    log the total number of tokens and files.

    Raises:
      RuntimeError: if the action is missing or not recognized.
    """
    if len(argv) < 2:
        # Without this check a missing action surfaced as a bare IndexError.
        raise RuntimeError('Action required')
    action = argv[1]

    if action == 'well-formed':
        num_tokens = 0
        num_files = 0
        errors = []
        for line in sys.stdin:
            name = line.strip()
            if not name:
                # A blank line is not a file name; open('') would abort the
                # whole run.
                continue

            with open(name) as f:
                contents = f.read()

            # Created outside the try block, as before, so only errors raised
            # while consuming the tokens are treated as lex errors.
            lx = ValidTokens(contents)
            try:
                tokens = list(lx)
            except LexError as e:
                log('Error in %r: %s', name, e)
                errors.append((name, e))
            else:
                num_tokens += len(tokens)
                #print('%d %s' % (len(tokens), name))
            num_files += 1

        log('%d tokens in %d files', num_tokens, num_files)
        if 0:  # Disabled: repeat every collected error after the totals.
            for name, e in errors:
                log('Error in %r: %s', name, e)

    else:
        raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
    main(sys.argv)

0 comments on commit 57b0b18

Please sign in to comment.