[doctools/micro-syntax] Start HTM8 lexer.
The motivation was to test some HTM8 rules in re2c.

It works fine!  We can distinguish StartEndTag with /> vs. StartTag with >.

If the matches are of equal length, then re2c picks the rule listed first.

Our rule in lazylex/html.py is different - the first rule that matches wins, regardless of length - but it also works in this case.
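
For illustration, here is a minimal Python sketch (not the project's code; the regexes are hand-translated from the rules added below, and the names _NAME, RULES, first_match, longest_match are just for this example) of the two disambiguation schemes: re2c takes the longest match and breaks equal-length ties by rule order, while lazylex/html.py takes the first rule that matches at all. Either way, StartEndTag has to be listed before StartTag.

    import re

    _NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'   # mirrors the 'name' definition in the lexer

    RULES = [
        (re.compile(r'</ (%s) >' % _NAME, re.VERBOSE), 'EndTag'),
        (re.compile(r'< (%s) [^>]* />' % _NAME, re.VERBOSE), 'StartEndTag'),
        (re.compile(r'< (%s) [^>]* >' % _NAME, re.VERBOSE), 'StartTag'),
    ]

    def first_match(s):
        # lazylex-style: the first rule that matches wins, regardless of length
        for pat, tok_id in RULES:
            m = pat.match(s)
            if m:
                return tok_id, m.end()
        return 'Other', 1

    def longest_match(s):
        # re2c-style: the longest match wins; equal lengths fall back to rule order
        best = ('Other', 1)
        for pat, tok_id in RULES:
            m = pat.match(s)
            if m and m.end() > best[1]:
                best = (tok_id, m.end())
        return best

    # Both StartEndTag and StartTag match all 16 bytes of '<img src="foo"/>',
    # so both schemes rely on StartEndTag being listed first.
    print(first_match('<img src="foo"/>'))    # ('StartEndTag', 16)
    print(longest_match('<img src="foo"/>'))  # ('StartEndTag', 16)
    print(first_match('<a href=foo>'))        # ('StartTag', 12)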
Andy C committed Jan 11, 2025
1 parent 71c791e commit 06efda8
Showing 4 changed files with 81 additions and 5 deletions.
14 changes: 14 additions & 0 deletions doctools/micro-syntax.sh
@@ -213,6 +213,13 @@ readonly -a R_TESTS=(
3')"
)

readonly -a HTML_TESTS=(
'<p>hi</p>'
'hi <br/>'
'<img src="foo"/>'
'<a href=foo>link</a>'
)

run-cases() {
local lang=$1
shift
@@ -246,6 +253,12 @@ test-R() {
run-cases R "${R_TESTS[@]}"
}

test-html() {
build
run-cases html "${HTML_TESTS[@]}"
}


run-tests() {
local bin=$BASE_DIR/micro_syntax

@@ -255,6 +268,7 @@ run-tests() {
run-cases cpp "${CPP_TESTS[@]}"
run-cases py "${PY_TESTS[@]}"
run-cases R "${R_TESTS[@]}"
run-cases html "${HTML_TESTS[@]}"

# No language specified
echo '==== No language'
19 changes: 18 additions & 1 deletion doctools/micro_syntax.cc
@@ -49,6 +49,7 @@ enum class lang_e {
Py,
Shell,
Ysh, // ''' etc.
Html,
Asdl,
R, // uses # comments

@@ -328,6 +329,15 @@ class AnsiPrinter : public Printer {
PrintColor(GREEN, p_start, num_bytes);
break;

case Id::StartTag:
case Id::EndTag:
PrintColor(PURPLE, p_start, num_bytes);
break;

case Id::StartEndTag:
PrintColor(RED2, p_start, num_bytes);
break;

case Id::Unknown:
// Make errors red
fputs(REVERSE, stdout);
@@ -955,6 +965,10 @@ int ScanFiles(const Flags& flag, std::vector<char*> files, OutputStream* out,
status = ScanOne<R_mode_e>(reader, out, hook);
break;

case lang_e::Html:
status = ScanOne<html_mode_e>(reader, out, hook);
break;

default:
assert(0);
}
@@ -1040,14 +1054,17 @@ int main(int argc, char** argv) {
} else if (strcmp(optarg, "yaml") == 0) {
flag.lang = lang_e::PlainText;

} else if (strcmp(optarg, "html") == 0) {
flag.lang = lang_e::Html;

} else if (strcmp(optarg, "txt") == 0) {
flag.lang = lang_e::PlainText;

} else if (strcmp(optarg, "other") == 0) {
flag.lang = lang_e::PlainText;

} else {
Log("Expected -l LANG to be cpp|py|shell|asdl|R|js|css|md|yaml|txt, "
Log("Expected -l LANG to be cpp|py|shell|asdl|R|js|css|md|yaml|html|txt, "
"got %s",
optarg);
return 2;
43 changes: 43 additions & 0 deletions doctools/micro_syntax.re2c.h
@@ -41,6 +41,11 @@ enum class Id {
HereBegin,
HereEnd,

// Html
StartTag,
EndTag,
StartEndTag,

// Zero-width token to detect #ifdef and Python INDENT/DEDENT
// StartLine,

@@ -669,6 +674,44 @@ bool Matcher<sh_mode_e>::Match(Lexer<sh_mode_e>* lexer, Token* tok) {
return false;
}

enum class html_mode_e {
Outer,
};

// Returns whether EOL was hit
template <>
bool Matcher<html_mode_e>::Match(Lexer<html_mode_e>* lexer, Token* tok) {
const char* p = lexer->p_current; // mutated by re2c
const char* YYMARKER = p;

switch (lexer->line_mode) {
case html_mode_e::Outer:
while (true) {
/*!re2c
nul { return true; }
// Like _NAME in HTM8
name = [a-zA-Z][a-zA-Z0-9:_-]* ;
'</' name '>' { TOK(Id::EndTag); }
'<' name [^>\x00]* '/>' { TOK(Id::StartEndTag); }
'<' name [^>\x00]* '>' { TOK(Id::StartTag); }
// TODO: Fill in the rest of the HTM8 lexer.
* { TOK(Id::Other); }
*/
}
break;
}

tok->end_col = p - lexer->line_;
lexer->p_current = p;
return false;
}


// TODO:
// - Lua / Rust-style multi-line strings, with matching delimiters e.g. r###"
// - same as C++ raw string, I think
10 changes: 6 additions & 4 deletions lazylex/html.py
@@ -218,7 +218,8 @@ def MakeLexer(rules):
# That's done in the tag lexer.
# - We don't allow leading whitespace
(r'</ (%s) >' % _NAME, Tok.EndTag),
- # self-closing <br/> comes before StarttTag
+ # self-closing <br/> comes before StartTag
+ # could/should these be collapsed into one rule?
(r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag), # end </a>
(r'< (%s) [^>]* >' % _NAME, Tok.StartTag), # start <a>
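
The new comment above asks whether the two rules could be collapsed. One hypothetical way (not what lazylex/html.py does, and the names COMBINED and classify are just for illustration) is a single pattern through the closing >, classifying afterwards by whether the match ends in /> - a sketch:

    import re

    _NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'   # assumed, mirroring _NAME / the re2c 'name' rule
    COMBINED = re.compile(r'< (%s) [^>]* >' % _NAME, re.VERBOSE)

    def classify(s):
        m = COMBINED.match(s)
        if not m:
            return None
        # One rule, two token ids: decide by how the match ends.
        return 'StartEndTag' if m.group(0).endswith('/>') else 'StartTag'

    print(classify('<img src="foo"/>'))  # StartEndTag
    print(classify('<br/>'))             # StartEndTag
    print(classify('<a href=foo>'))      # StartTag

Whether that is actually clearer than two ordered rules is debatable, which may be why the diff leaves it as a question.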

@@ -441,7 +442,9 @@ def ValidTokenList(s, no_special_tags=False):
#
# Allow - for td-attrs

- _ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens
+ # allow underscore/hyphen. what about colons, like _NAME?
+ # what about href=$foo ?
+ _UNQUOTED_VALUE = r'[a-zA-Z0-9_\-]+'

# TODO: we don't need to capture the tag name here? That's done at the top
# level
@@ -459,10 +462,9 @@ def ValidTokenList(s, no_special_tags=False):
(?:
" ([^>"]*) " # double quoted value
| (%s) # Attribute value
- # TODO: relax this? for href=$foo
)
)?
- ''' % (_NAME, _ATTR_VALUE), re.VERBOSE)
+ ''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue = range(4)
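
For context, a small self-contained sketch of the quoted vs. unquoted value alternation shown above; the surrounding pattern (ATTR_RE here) is hypothetical, since the leading part of the real attribute regex is outside this hunk, but the value branches mirror the fragment in the diff:

    import re

    _NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'   # assumed, like _NAME
    _UNQUOTED_VALUE = r'[a-zA-Z0-9_\-]+'  # as in the diff above

    # Hypothetical stand-in for the attribute regex; only the value
    # alternation is taken from the hunk above.
    ATTR_RE = re.compile(r'''
      \s* (%s)              # attribute name
      (?:
        \s* = \s*
        (?:
          " ([^>"]*) "      # double quoted value -> QuotedValue
        | (%s)              # unquoted value      -> UnquotedValue
        )
      )?
    ''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

    for s in (' href="foo"', ' href=foo', ' disabled'):
        m = ATTR_RE.match(s)
        print(m.group(1), m.group(2), m.group(3))
    # href foo None
    # href None foo
    # disabled None None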

