From 06efda8006e88f396193253570d7b307db5cd858 Mon Sep 17 00:00:00 2001 From: Andy C Date: Sat, 11 Jan 2025 11:55:39 -0500 Subject: [PATCH] [doctools/micro-syntax] Start HTM8 lexer. The motivation was to test some HTM8 rules in re2c. It works fine! We can distinguish StartEndTag with /> vs. StartTag with >. If the matches are equal length, then it picks the first one. Our rule in lazylex/html.py is different - the first match - but it also works in this case. --- doctools/micro-syntax.sh | 14 ++++++++++++ doctools/micro_syntax.cc | 19 +++++++++++++++- doctools/micro_syntax.re2c.h | 43 ++++++++++++++++++++++++++++++++++++ lazylex/html.py | 10 +++++---- 4 files changed, 81 insertions(+), 5 deletions(-) diff --git a/doctools/micro-syntax.sh b/doctools/micro-syntax.sh index f6af485738..7afc43b4e4 100755 --- a/doctools/micro-syntax.sh +++ b/doctools/micro-syntax.sh @@ -213,6 +213,13 @@ readonly -a R_TESTS=( 3')" ) +readonly -a HTML_TESTS=( + '

hi

' + 'hi
' + '' + 'link' +) + run-cases() { local lang=$1 shift @@ -246,6 +253,12 @@ test-R() { run-cases R "${R_TESTS[@]}" } +test-html() { + build + run-cases html "${HTML_TESTS[@]}" +} + + run-tests() { local bin=$BASE_DIR/micro_syntax @@ -255,6 +268,7 @@ run-tests() { run-cases cpp "${CPP_TESTS[@]}" run-cases py "${PY_TESTS[@]}" run-cases R "${R_TESTS[@]}" + run-cases html "${HTML_TESTS[@]}" # No language specified echo '==== No language' diff --git a/doctools/micro_syntax.cc b/doctools/micro_syntax.cc index e3cb64de58..814169cae0 100644 --- a/doctools/micro_syntax.cc +++ b/doctools/micro_syntax.cc @@ -49,6 +49,7 @@ enum class lang_e { Py, Shell, Ysh, // ''' etc. + Html, Asdl, R, // uses # comments @@ -328,6 +329,15 @@ class AnsiPrinter : public Printer { PrintColor(GREEN, p_start, num_bytes); break; + case Id::StartTag: + case Id::EndTag: + PrintColor(PURPLE, p_start, num_bytes); + break; + + case Id::StartEndTag: + PrintColor(RED2, p_start, num_bytes); + break; + case Id::Unknown: // Make errors red fputs(REVERSE, stdout); @@ -955,6 +965,10 @@ int ScanFiles(const Flags& flag, std::vector files, OutputStream* out, status = ScanOne(reader, out, hook); break; + case lang_e::Html: + status = ScanOne(reader, out, hook); + break; + default: assert(0); } @@ -1040,6 +1054,9 @@ int main(int argc, char** argv) { } else if (strcmp(optarg, "yaml") == 0) { flag.lang = lang_e::PlainText; + } else if (strcmp(optarg, "html") == 0) { + flag.lang = lang_e::Html; + } else if (strcmp(optarg, "txt") == 0) { flag.lang = lang_e::PlainText; @@ -1047,7 +1064,7 @@ int main(int argc, char** argv) { flag.lang = lang_e::PlainText; } else { - Log("Expected -l LANG to be cpp|py|shell|asdl|R|js|css|md|yaml|txt, " + Log("Expected -l LANG to be cpp|py|shell|asdl|R|js|css|md|yaml|html|txt, " "got %s", optarg); return 2; diff --git a/doctools/micro_syntax.re2c.h b/doctools/micro_syntax.re2c.h index 7f847c5d76..5a218f63aa 100644 --- a/doctools/micro_syntax.re2c.h +++ b/doctools/micro_syntax.re2c.h @@ -41,6 +41,11 @@ enum class Id { HereBegin, HereEnd, + // Html + StartTag, + EndTag, + StartEndTag, + // Zero-width token to detect #ifdef and Python INDENT/DEDENT // StartLine, @@ -669,6 +674,44 @@ bool Matcher::Match(Lexer* lexer, Token* tok) { return false; } +enum class html_mode_e { + Outer, +}; + +// Returns whether EOL was hit +template <> +bool Matcher::Match(Lexer* lexer, Token* tok) { + const char* p = lexer->p_current; // mutated by re2c + const char* YYMARKER = p; + + switch (lexer->line_mode) { + case html_mode_e::Outer: + while (true) { + /*!re2c + nul { return true; } + + // Like _NAME in HTM8 + name = [a-zA-Z][a-zA-Z0-9:_-]* ; + + '' { TOK(Id::EndTag); } + '<' name [^>\x00]* '/>' { TOK(Id::StartEndTag); } + '<' name [^>\x00]* '>' { TOK(Id::StartTag); } + + // TODO: Fill in the rest of the HTM8 lexer. + + * { TOK(Id::Other); } + + */ + } + break; + } + + tok->end_col = p - lexer->line_; + lexer->p_current = p; + return false; +} + + // TODO: // - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###" // - same as C++ raw string, I think diff --git a/lazylex/html.py b/lazylex/html.py index a80fd65e31..abdff063ce 100755 --- a/lazylex/html.py +++ b/lazylex/html.py @@ -218,7 +218,8 @@ def MakeLexer(rules): # That's done in the tag lexer. # - We don't allow leading whitespace (r'' % _NAME, Tok.EndTag), - # self-closing
comes before StarttTag + # self-closing
comes before StartTag + # could/should these be collapsed into one rule? (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag), # end (r'< (%s) [^>]* >' % _NAME, Tok.StartTag), # start @@ -441,7 +442,9 @@ def ValidTokenList(s, no_special_tags=False): # # Allow - for td-attrs -_ATTR_VALUE = r'[a-zA-Z0-9_\-]+' # allow hyphens +# allow underscore/hyphen. what about colons, like _NAME? +# what about href=$foo ? +_UNQUOTED_VALUE = r'[a-zA-Z0-9_\-]+' # TODO: we don't need to capture the tag name here? That's done at the top # level @@ -459,10 +462,9 @@ def ValidTokenList(s, no_special_tags=False): (?: " ([^>"]*) " # double quoted value | (%s) # Attribute value - # TODO: relax this? for href=$foo ) )? -''' % (_NAME, _ATTR_VALUE), re.VERBOSE) +''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE) TagName, AttrName, UnquotedValue, QuotedValue = range(4)