From 708b7850ffc3dad254c94cf91b33ba3e5f6a4f47 Mon Sep 17 00:00:00 2001 From: Andy C Date: Fri, 10 Jan 2025 19:30:43 -0500 Subject: [PATCH] [lazylex/html] Validate our HTML in Soil All errors have been fixed! Refactor into Validate() function. Still needs more polish. --- data_lang/htm8-test.sh | 72 +++++++++++----- lazylex/html.py | 190 ++++++++++++++++++++++++----------------- soil/web-init.sh | 2 +- soil/worker.sh | 1 + 4 files changed, 167 insertions(+), 98 deletions(-) diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh index 78fd0c409..076bdebb7 100755 --- a/data_lang/htm8-test.sh +++ b/data_lang/htm8-test.sh @@ -4,20 +4,42 @@ # data_lang/htm8-test.sh # # TODO: -# - Rename to DML8? Because it can handle XML -# - CDATA in XML, which is not a script +# - Refactor Validate(): take FLAGS, return stats optionally +# - add LEX_QUOTED_VALUES +# - and then re-run all the tests +# - Rename to data_lang/htm8.py +# - it has NO_SPECIAL_TAGS mode for XML # -# Operations / Levels: +# - Soil +# - Validate all the HTML in the repo - well-formed check +# - this should go in the CI +# - Automate some more tests: +# - site oils.pub, site oilshell.org +# - XML on my machine - turn that into a 'WILD' corpus for HTML/XML? # -# - Lexing -# - lex-tags -# - lex-attrs - validate all Start tags, all StartEnd tags -# - lex-quoted-values - unescaping, etc. -# - are there invalid entities? -# - Parsing -# - well-formed / tag balance check -# - Schema -# - not sure if we check the HTML schema or not - it might be too restrictive +# - statically type it +# - revive pyannotate +# - translate to C++ +# - what to do about all the regexes? Port to re2c directly? +# - for find(), do we need a C++ primitive for it? +# - no allocation for TagName() +# - ASDL file for Tok.Foo? +# - refactor TagName() API - remove it from the TagLexer? +# - that is really the AttrLexer() +# +# - build a DOM with objects in YSH? +# - rewrite ul-table in that? 
+# +# YSH API +# - Generating HTML/HTM8 is much more common than parsing it +# - although maybe we can do RemoveComments as a demo? +# - that is the lowest level "sed" model +# - For parsing, a minimum idea is: +# - lexer-based algorithms for query by tag, class name, and id +# - and then toTree() +# - .tag and .attrs? +# - .innerHTML() and .outerHTML() perhaps +# - and maybe you can mutate it directly REPO_ROOT=$(cd "$(dirname $0)/.."; pwd) @@ -42,7 +64,7 @@ site-files() { # - can we change that with [.\n]*? # - nongreedy match for --> and ?> -ht8-tool() { +htm8-tool() { PYTHONPATH="$REPO_ROOT:$REPO_ROOT/vendor" \ $REPO_ROOT/lazylex/html.py "$@" } @@ -52,7 +74,7 @@ test-well-formed() { unfinished