From 57b0b186a91f4ac6802ec45bd90be137cbf26375 Mon Sep 17 00:00:00 2001 From: Andy C Date: Mon, 6 Jan 2025 14:39:55 -0500 Subject: [PATCH] [data_lang] Add test for HTM8 (aka HT8 ) I might want to call it HT8 to reduce confusion. HTM is a common file extension. I learned about rules for: - or . --- data_lang/htm8-test.sh | 86 +++++++++++++++++++++++++++++++++++++++--- lazylex/html.py | 36 ++++++++++++++++++ 2 files changed, 116 insertions(+), 6 deletions(-) mode change 100644 => 100755 lazylex/html.py diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh index 16fee7de7..6c3c72367 100755 --- a/data_lang/htm8-test.sh +++ b/data_lang/htm8-test.sh @@ -3,14 +3,20 @@ # Usage: # data_lang/htm8-test.sh -: ${LIB_OSH=stdlib/osh} +REPO_ROOT=$(cd "$(dirname $0)/.."; pwd) + +# Special case: we need $REPO_ROOT +: ${LIB_OSH=$REPO_ROOT/stdlib/osh} source $LIB_OSH/bash-strict.sh source $LIB_OSH/task-five.sh # parse with lazylex/html.py, or data_lang/htm8.py site-files() { - find ../../oilshell/oilshell.org__deploy -name '*.html' + #find ../../oilshell/oilshell.org__deploy -name '*.html' + + # omit all the _ files + git ls-files | grep '\.html$' } # Issues with lazylex/html.py @@ -20,15 +26,44 @@ site-files() { # - can we change that with [.\n]*? # - nongreedy match for --> and ?> +ht8-tool() { + PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \ + $REPO_ROOT/lazylex/html.py "$@" +} + +test-well-formed() { + cat >_tmp/bad.html <\n + & + + + + +The main practical difference is that RCDATA processes HTML entities while +CDATA treats them as literal text. Both modes ignore HTML tags (treating them +as plain text) except for their own closing tag. 
' +' +
diff --git a/lazylex/html.py b/lazylex/html.py
old mode 100644
new mode 100755
index 03cab4355..f77311267
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -515,3 +515,53 @@ def ToText(s, left_pos=0, right_pos=-1):
     out.PrintTheRest()
 
     return f.getvalue()
+
+
+def main(argv):
+    """Command-line entry point; argv[1] selects the action.
+
+    The only action is 'well-formed': read file names from stdin, one per
+    line, lex each file with ValidTokens(), and log each LexError along with
+    a final token and file count.  Lex errors are only logged; they do not
+    affect the exit status.
+
+    Raises:
+      RuntimeError: if the action is not recognized.
+    """
+    action = argv[1]
+
+    if action == 'well-formed':
+        num_tokens = 0
+        errors = []
+        i = 0
+        for line in sys.stdin:
+            name = line.strip()
+            if not name:
+                # open('') would fail, so tolerate blank lines in the list
+                continue
+
+            with open(name) as f:
+                contents = f.read()
+
+            lx = ValidTokens(contents)
+            try:
+                tokens = list(lx)
+            except LexError as e:
+                log('Error in %r: %s', name, e)
+                errors.append((name, e))
+            else:
+                num_tokens += len(tokens)
+                #print('%d %s' % (len(tokens), name))
+            i += 1
+
+        log('%d tokens in %d files', num_tokens, i)
+        if 0:
+            for name, e in errors:
+                log('Error in %r: %s', name, e)
+
+    else:
+        raise RuntimeError('Invalid action %r' % action)
+
+
+if __name__ == '__main__':
+    main(sys.argv)