[lazylex/html] Add lexer for attribute values
They can be double quoted, single quoted, or unquoted.

This caught many errors of the form "foo=42&bar=2", with a bare &
instead of &amp;.  I guess HTML5 probably accepts that ...

- Disallow \x00 to prepare for re2c - it's the sentinel
  - This caught some problems in my corpus

- Add tests around the asymmetry of < and >
  - Technically we could be smarter here too?

TODO: Do some tests around empty values:

    <p id="">
    <p id=''>
    <p id=>
    <p id>
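
As a rough usage sketch (not part of this diff), here is how the new lexer
behaves on these cases, driven through the html.AttrValueLexer / Reset() /
NumTokens() API added below; the count_value_tokens helper is hypothetical:

    from lazylex import html

    def count_value_tokens(value):
        # Hypothetical helper: lex a single attribute value, the same way
        # html_test.py and Validate() drive the new lexer.
        lex = html.AttrValueLexer(value)
        lex.Reset(0, len(value))
        return lex.NumTokens()

    count_value_tokens('foo')                  # 1 token:  RawData
    count_value_tokens('?foo=42&amp;bar=99')   # 3 tokens: RawData, CharEntity, RawData
    count_value_tokens('')                     # 0 tokens: empty value, as in <p id="">

    try:
        count_value_tokens('foo=42&bar=2')     # bare & is not a valid character reference
    except html.LexError:
        print('rejected bare & in attribute value')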
Andy C committed Jan 12, 2025
1 parent 32af775 commit bc413da
Showing 3 changed files with 173 additions and 46 deletions.
2 changes: 1 addition & 1 deletion data_lang/htm8-test.sh
@@ -133,7 +133,7 @@ test-site() {
popd
}

readonly SOIL_ID=8917
readonly SOIL_ID=8924
readonly WWZ_DIR=_tmp/$SOIL_ID

sync-wwz() {
159 changes: 128 additions & 31 deletions lazylex/html.py
@@ -190,7 +190,15 @@ def MakeLexer(rules):

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*' # must start with letter

LEXER = [
CHAR_LEX = [
# Characters
# https://www.w3.org/TR/xml/#sec-references
(r'&\# [0-9]+ ;', Tok.DecChar),
(r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
(r'& %s ;' % _NAME, Tok.CharEntity),
]

LEXER = CHAR_LEX + [
(r'<!--', Tok.CommentBegin),

# Processing instruction are used for the XML header:
@@ -210,7 +218,7 @@ def MakeLexer(rules):
# - these seem to be part of DTD
# - it's useful to skip these, and be able to parse the rest of the document
# - Note: < is allowed?
(r'<! [^>]+ >', Tok.Decl),
(r'<! [^>\x00]+ >', Tok.Decl),

# Tags
# Notes:
@@ -220,21 +228,15 @@ def MakeLexer(rules):
(r'</ (%s) >' % _NAME, Tok.EndTag),
# self-closing <br/> comes before StartTag
# could/should these be collapsed into one rule?
(r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag), # end </a>
(r'< (%s) [^>]* >' % _NAME, Tok.StartTag), # start <a>

# Characters
# https://www.w3.org/TR/xml/#sec-references
(r'&\# [0-9]+ ;', Tok.DecChar),
(r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
(r'& %s ;' % _NAME, Tok.CharEntity),
(r'< (%s) [^>\x00]* />' % _NAME, Tok.StartEndTag), # end </a>
(r'< (%s) [^>\x00]* >' % _NAME, Tok.StartTag), # start <a>

# HTML5 allows unescaped > in raw data, but < is not allowed.
# https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
#
# - My early blog has THREE errors when disallowing >
# - So do some .wwz files
(r'[^&<]+', Tok.RawData),
(r'[^&<\x00]+', Tok.RawData),
(r'.', Tok.Invalid), # error!
]

@@ -444,7 +446,7 @@ def ValidTokenList(s, no_special_tags=False):

# Be very lenient - just no whitespace or special HTML chars
# I don't think this is more lenient than HTML5, though we should check.
_UNQUOTED_VALUE = r'''[^\x00 \t\r\n<>&"']*'''
_UNQUOTED_VALUE = r'''[^ \t\r\n<>&"'\x00]*'''

# TODO: we don't need to capture the tag name here? That's done at the top
# level
@@ -464,8 +466,8 @@ def ValidTokenList(s, no_special_tags=False):
(?: # Optional attribute value
\s* = \s*
(?:
" ([^>"]*) " # double quoted value
| ' ([^>']*) ' # single quoted value
" ([^>"\x00]*) " # double quoted value
| ' ([^>'\x00]*) ' # single quoted value
| (%s) # Attribute value
)
)?
@@ -490,6 +492,9 @@ def __init__(self, s):

def Reset(self, start_pos, end_pos):
"""Reuse instances of this object."""
assert start_pos >= 0, start_pos
assert end_pos >= 0, end_pos

self.start_pos = start_pos
self.end_pos = end_pos

@@ -538,14 +543,11 @@ def GetAttrRaw(self, attr_name):
return None
return self.s[start:end]

def AllAttrsRaw(self):
def AllAttrsRawSlice(self):
"""
Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]
The quoted values may be escaped. We would need another lexer to
unescape them.
Get a list of pairs [('class', 3, 5), ('href', 9, 12)]
"""
pairs = []
slices = []
events = self.Tokens()
try:
while True:
@@ -559,11 +561,27 @@ def AllAttrsRaw(self):
# Note: quoted values may have &amp;
# We would need ANOTHER lexer to unescape them, but we
# don't need that for ul-table

val = self.s[start:end]
pairs.append((name, val))
slices.append((name, start, end))
else:
# TODO: no attribute? <button disabled>? Make it equivalent
# to the empty string? Or None?
pass
#slices.append((name, start, end))
except StopIteration:
pass
return slices

def AllAttrsRaw(self):
"""
Get a list of pairs [('class', 'foo'), ('href', '?foo=1&amp;bar=2')]
The quoted values may be escaped. We would need another lexer to
unescape them.
"""
slices = self.AllAttrsRawSlice()
pairs = []
for name, start, end in slices:
pairs.append((name, self.s[start:end]))
return pairs

def Tokens(self):
@@ -615,6 +633,72 @@ def Tokens(self):
raise LexError(self.s, pos)


# This is similar but not identical to
# " ([^>"\x00]*) " # double quoted value
# | ' ([^>'\x00]*) ' # single quoted value
#
# Note: for unquoted values, & isn't allowed, and thus &amp; and &#99; and
# &#x99; are not allowed. We could relax that?
ATTR_VALUE_LEXER = CHAR_LEX + [
(r'[^>&\x00]+', Tok.RawData),
(r'.', Tok.Invalid),
]

ATTR_VALUE_LEXER = MakeLexer(ATTR_VALUE_LEXER)


class AttrValueLexer(object):
"""
<a href="foo=99&amp;bar">
<a href='foo=99&amp;bar'>
<a href=unquoted>
"""

def __init__(self, s):
self.s = s
self.start_pos = -1 # Invalid
self.end_pos = -1

def Reset(self, start_pos, end_pos):
"""Reuse instances of this object."""
assert start_pos >= 0, start_pos
assert end_pos >= 0, end_pos

self.start_pos = start_pos
self.end_pos = end_pos

def NumTokens(self):
num_tokens = 0
pos = self.start_pos
for tok_id, end_pos in self.Tokens():
if tok_id == Tok.Invalid:
raise LexError(self.s, pos)
pos = end_pos
#log('pos %d', pos)
num_tokens += 1
return num_tokens

def Tokens(self):
pos = self.start_pos
while pos < self.end_pos:
# Find the first match, like above.
# Note: frontend/match.py uses _LongestMatch(), which is different!
# TODO: reconcile them. This lexer should be expressible in re2c.
for pat, tok_id in ATTR_VALUE_LEXER:
m = pat.match(self.s, pos)
if m:
if 0:
tok_str = m.group(0)
log('token = %r', tok_str)

end_pos = m.end(0)
yield tok_id, end_pos
pos = end_pos
break
else:
raise AssertionError('Tok.Invalid rule should have matched')


def ReadUntilStartTag(it, tag_lexer, tag_name):
"""Find the next <foo>, returning its (start, end) positions
@@ -739,13 +823,16 @@ def Validate(contents, flags, counters):
# type: (str, int, Counters) -> None

tag_lexer = TagLexer(contents)
val_lexer = AttrValueLexer(contents)

no_special_tags = bool(flags & NO_SPECIAL_TAGS)
lx = Lexer(contents, no_special_tags=no_special_tags)
tokens = []
start_pos = 0
tag_stack = []
while True:
tok_id, end_pos = lx.Read()
#log('TOP %s %r', TokenName(tok_id), contents[start_pos:end_pos])

if tok_id == Tok.Invalid:
raise LexError(contents, start_pos)
@@ -758,16 +845,24 @@ def Validate(contents, flags, counters):
counters.num_start_end_tags += 1

tag_lexer.Reset(start_pos, end_pos)
all_attrs = tag_lexer.AllAttrsRaw()
all_attrs = tag_lexer.AllAttrsRawSlice()
counters.num_attrs += len(all_attrs)
for name, val_start, val_end in all_attrs:
val_lexer.Reset(val_start, val_end)
counters.num_val_tokens += val_lexer.NumTokens()

counters.debug_attrs.extend(all_attrs)

elif tok_id == Tok.StartTag:
counters.num_start_tags += 1

tag_lexer.Reset(start_pos, end_pos)
all_attrs = tag_lexer.AllAttrsRaw()
all_attrs = tag_lexer.AllAttrsRawSlice()
counters.num_attrs += len(all_attrs)
for name, val_start, val_end in all_attrs:
val_lexer.Reset(val_start, val_end)
counters.num_val_tokens += val_lexer.NumTokens()

counters.debug_attrs.extend(all_attrs)

if flags & BALANCED_TAGS:
@@ -817,6 +912,7 @@ def __init__(self):
self.num_start_end_tags = 0
self.num_attrs = 0
self.max_tag_stack = 0
self.num_val_tokens = 0

self.debug_attrs = []

@@ -870,12 +966,13 @@ def main(argv):
i += 1

log('')
log(
' %d tokens, %d start/end tags, %d start tags, %d attrs, %d max tag stack depth in %d files',
counters.num_tokens, counters.num_start_end_tags,
counters.num_start_tags, counters.num_attrs,
counters.max_tag_stack, i)
log(' %d errors', len(errors))
log('%10d tokens', counters.num_tokens)
log('%10d start/end tags', counters.num_start_end_tags)
log('%10d start tags', counters.num_start_tags)
log('%10d attrs', counters.num_attrs)
log('%10d max tag stack depth', counters.max_tag_stack)
log('%10d attr val tokens', counters.num_val_tokens)
log('%10d errors', len(errors))
if len(errors):
return 1
return 0
58 changes: 44 additions & 14 deletions lazylex/html_test.py
@@ -11,19 +11,6 @@
TEST_HTML = f.read()


def _MakeTagLexer(s):
lex = html.TagLexer(s)
lex.Reset(0, len(s))
return lex


def _PrintTokens(lex):
log('')
log('tag = %r', lex.TagName())
for tok, start, end in lex.Tokens():
log('%s %r', tok, lex.s[start:end])


class RegexTest(unittest.TestCase):

def testDotAll(self):
@@ -55,6 +42,19 @@ def testFindLineNum(self):
print(line_num)


def _MakeTagLexer(s):
lex = html.TagLexer(s)
lex.Reset(0, len(s))
return lex


def _PrintTokens(lex):
log('')
log('tag = %r', lex.TagName())
for tok, start, end in lex.Tokens():
log('%s %r', tok, lex.s[start:end])


class TagLexerTest(unittest.TestCase):

def testTagLexer(self):
@@ -119,6 +119,20 @@ def testAttrWithoutValue(self):
self.fail('Expected LexError')


def _MakeAttrValueLexer(s):
lex = html.AttrValueLexer(s)
lex.Reset(0, len(s))
return lex


class AttrValueLexerTest(unittest.TestCase):

def testGood(self):
lex = _MakeAttrValueLexer('?foo=42&amp;bar=99')
n = lex.NumTokens()
self.assertEqual(3, n)


def Lex(h, no_special_tags=False):
print(repr(h))
tokens = html.ValidTokenList(h, no_special_tags=no_special_tags)
@@ -313,6 +327,9 @@ def testInvalid(self):
'<!-- unfinished comment',
'<? unfinished processing',
'</div bad=attr> <a> <b>',

# not allowed, but 3 > 4 is allowed
'<a> 3 < 4 </a>',
]

INVALID_PARSE = [
@@ -322,6 +339,17 @@ def testInvalid(self):
]

VALID_PARSE = [
'<!DOCTYPE html>\n',
'<!DOCTYPE>',

# empty strings
'<p x=""></p>',
"<p x=''></p>",

# allowed, but 3 < 4 is not allowed
'<a> 3 > 4 </a>',
# allowed, but 3 > 4 is not allowed
'<p x="3 < 4"></p>',
'<b><a href="foo">link</a></b>',
'<meta><a></a>',
# no attribute
Expand All @@ -347,9 +375,11 @@ def testInvalid(self):
]

INVALID_TAG_LEX = [
# not allowed, but 3 < 4 is allowed
'<p x="3 > 4"></p>',
'<a foo=bar !></a>', # bad attr

# TODO: should be escaped, invalid in XML
# should be escaped
#'<a href="&"></a>',
#'<a href=">"></a>',
]
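For context, a minimal sketch (not part of the commit) of how the slice-based
AllAttrsRawSlice() hands attribute value positions to the AttrValueLexer
without copying substrings, mirroring the Validate() loop in lazylex/html.py
above; the tag string and variable names here are made up:

    from lazylex import html

    s = '<a href="?foo=42&amp;bar=99" class=x>'
    tag_lexer = html.TagLexer(s)
    val_lexer = html.AttrValueLexer(s)  # both lexers share the same buffer

    tag_lexer.Reset(0, len(s))
    for name, val_start, val_end in tag_lexer.AllAttrsRawSlice():
        val_lexer.Reset(val_start, val_end)  # re-lex just the value in place
        print(name, val_lexer.NumTokens())   # href 3, class 1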
