Skip to content

Commit

Permalink
[lazylex/html] Refine/relax the lexer
Browse files Browse the repository at this point in the history
- Allow single quoted attr values, like <a foo='bar'>
- Allow <a foo=>, because <a foo> is also allowed
- Relax unquoted attribute values A LOT, because I use this in my own
  HTML
  • Loading branch information
Andy C committed Jan 12, 2025
1 parent 8af7e9c commit 32af775
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 19 deletions.
3 changes: 1 addition & 2 deletions data_lang/htm8-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -175,8 +175,7 @@ find-xml() {
}

test-other-xml() {
# problem with &ent1;
# CDATA support! haha OK
# 6 errors, relating to value='<' in some Python testdata files, which seems invalid
time cat _tmp/xml-files.txt | $REPO_ROOT/$0 htm8-tool parse-xml
}

Expand Down
13 changes: 9 additions & 4 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -442,8 +442,9 @@ def ValidTokenList(s, no_special_tags=False):
#
# Allow - for td-attrs

# allow underscore/hyphen. what about colons, like _NAME?
_UNQUOTED_VALUE = r'[a-zA-Z0-9_\-]+'
# Be very lenient: anything except NUL, whitespace (space, tab, CR, LF), and
# the characters < > & " '.  The * allows an empty value, as in <a foo=>.
# TODO: I don't think this is more lenient than HTML5, but we should check.
_UNQUOTED_VALUE = r'''[^\x00 \t\r\n<>&"']*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
Expand All @@ -464,6 +465,7 @@ def ValidTokenList(s, no_special_tags=False):
\s* = \s*
(?:
" ([^>"]*) " # double quoted value
| ' ([^>']*) ' # single quoted value
| (%s) # Attribute value
)
)?
Expand Down Expand Up @@ -592,11 +594,14 @@ def Tokens(self):

yield AttrName, m.start(1), m.end(1)

# Quoted is group 2, unquoted is group 3.
if m.group(2) is not None:
# double quoted
yield QuotedValue, m.start(2), m.end(2)
elif m.group(3) is not None:
yield UnquotedValue, m.start(3), m.end(3)
# single quoted - TODO: could have different token types
yield QuotedValue, m.start(3), m.end(3)
elif m.group(4) is not None:
yield UnquotedValue, m.start(4), m.end(4)

# Skip past the whole attribute match, including any closing " or '
pos = m.end(0)
Expand Down
32 changes: 20 additions & 12 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,13 @@ def testAttrWithoutValue(self):
all_attrs = lex.AllAttrsRaw()
log('all %s', all_attrs)

return
lex = _MakeTagLexer('<a foo=bar !></a>')
all_attrs = lex.AllAttrsRaw()
log('all %s', all_attrs)
try:
lex = _MakeTagLexer('<a foo=bar !></a>')
all_attrs = lex.AllAttrsRaw()
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')


def Lex(h, no_special_tags=False):
Expand Down Expand Up @@ -310,10 +313,6 @@ def testInvalid(self):
'<!-- unfinished comment',
'<? unfinished processing',
'</div bad=attr> <a> <b>',

# TODO: should be escaped, invalid in XML
#'<a href="&"></a>',
#'<a href=">"></a>',
]

INVALID_PARSE = [
Expand All @@ -327,14 +326,19 @@ def testInvalid(self):
'<meta><a></a>',
# no attribute
'<button disabled></button>',
'<button disabled=></button>',
'<button disabled= ></button>',

# single quoted is pretty common
"<a href='single'></a>",

# Conceding to reality - I used these myself
'<a href=ble.sh></a>',
'<a href=foo.html></a>',

# TODO: capitalization should be allowed
#'<META><a></a>',

# TODO:
#'<a foo="&"></a>', # bad attr
#'<a foo=bar !></a>', # bad attr

# TODO: Test <svg> and <math> ?
]

Expand All @@ -344,6 +348,10 @@ def testInvalid(self):

INVALID_TAG_LEX = [
'<a foo=bar !></a>', # bad attr

# TODO: should be escaped, invalid in XML
#'<a href="&"></a>',
#'<a href=">"></a>',
]


Expand Down
6 changes: 5 additions & 1 deletion soil/web-worker.sh
Original file line number Diff line number Diff line change
Expand Up @@ -177,8 +177,12 @@ EOF
</li>
'
fi

echo '</ul>'

cat <<EOF
</body>
</html>
EOF
}

format-image-stats() {
Expand Down

0 comments on commit 32af775

Please sign in to comment.