Skip to content

Commit

Permalink
[lazylex] Fix lexing of <p empty= missing>
Browse files Browse the repository at this point in the history
That's two attributes in HTML5.  Not an attribute and a value.
  • Loading branch information
Andy C committed Jan 12, 2025
1 parent bc413da commit d5fccf5
Show file tree
Hide file tree
Showing 4 changed files with 64 additions and 29 deletions.
13 changes: 8 additions & 5 deletions data_lang/htm8-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,6 @@
# data_lang/htm8-test.sh
#
# TODO:
# - Validate()
# - add LEX_QUOTED_VALUES, along with counter for it
# - and then re-run all the tests - make sure they pass
# - site oils.pub, site oilshell.org
# - XML on my machine - turn that into a 'WILD' corpus for HTML/XML?
# - Rename to data_lang/htm8.py
# - it has NO_SPECIAL_TAGS mode for XML
# - put iterators at a higher level in doctools/ ?
Expand All @@ -28,6 +23,14 @@
# - there are over 2000 of them, not sure I want to build them all into the Oils binaries
# - capital letters <TR/> - I guess we can normalize the case
#
# Leniency:
# - foo=1&bar=2 is extremely common
# - well then does that mean you allow <p>a & b</p> too?
# - and then it's not far from that to <p id="value >"> - the quotes help
# - I guess you can have a rule for unescaped &, just like unescaped backslash
# - you can warn about it, but it doesn't cause much problem?
# We are already firmly in HTML territory, not in XML ...
#
# Features:
# - Are there special rules for <svg> and <math>?
# - Do we need to know about <textarea> <pre>? Those don't have the same
Expand Down
8 changes: 5 additions & 3 deletions doc/htm8.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,14 +127,16 @@ Just emit it! This always works, by design.

### What Doesn't This Cover?

- single-quoted attributes?
- We should probably add those, it shouldn't be hard?

- Encodings other than UTF-8. HTM8 is always UTF-8.
- Unicode Tag names and attribute names.
- This is allowed in HTML5 and XML.
- We leave those out for simpler lexing. Text and attribute values may be unicode.

- `<a href=">">` - no literal `>` inside quotes
- HTML5 handles it, but we want to easily scan the "top level" structure of the doc
- And it doesn't appear to be common in our testdata
- TODO: we will handle `<a href="&">`

There are 5 kinds of tags:

- Normal HTML tags
Expand Down
41 changes: 23 additions & 18 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,12 +459,18 @@ def ValidTokenList(s, no_special_tags=False):

# <button disabled> is standard usage

# NOTE: This used to allow whitespace around =
# <a foo = "bar"> makes sense in XML
# But then you also have
# <a foo= bar> - which is TWO attributes, in HTML5
# So the space is problematic

_ATTR_RE = re.compile(
r'''
\s+ # Leading whitespace is required
(%s) # Attribute name
(?: # Optional attribute value
\s* = \s*
=
(?:
" ([^>"\x00]*) " # double quoted value
| ' ([^>'\x00]*) ' # single quoted value
Expand All @@ -473,7 +479,7 @@ def ValidTokenList(s, no_special_tags=False):
)?
''' % (_NAME, _UNQUOTED_VALUE), re.VERBOSE)

TagName, AttrName, UnquotedValue, QuotedValue = range(4)
TagName, AttrName, UnquotedValue, QuotedValue, MissingValue = range(5)


class TagLexer(object):
Expand Down Expand Up @@ -523,12 +529,10 @@ def GetSpanForAttrValue(self, attr_name):
if name == attr_name:
# The value should come next
tok_id, start, end = next(events)
if tok_id in (QuotedValue, UnquotedValue):
# Note: quoted values may have &amp;
# We would need ANOTHER lexer to unescape them.
# Right now help_gen.py and oils_doc.py
val = start, end
break
assert tok_id in (QuotedValue, UnquotedValue,
MissingValue), TokenName(tok_id)
val = start, end
break

except StopIteration:
pass
Expand Down Expand Up @@ -557,16 +561,12 @@ def AllAttrsRawSlice(self):

# The value should come next
tok_id, start, end = next(events)
if tok_id in (QuotedValue, UnquotedValue):
# Note: quoted values may have &amp;
# We would need ANOTHER lexer to unescape them, but we
# don't need that for ul-table
slices.append((name, start, end))
else:
# TODO: no attribute? <button disabled>? Make it equivalent
# to the empty string? Or None?
pass
#slices.append((name, start, end))
assert tok_id in (QuotedValue, UnquotedValue,
MissingValue), TokenName(tok_id)
# Note: quoted values may have &amp;
# We would need ANOTHER lexer to unescape them, but we
# don't need that for ul-table
slices.append((name, start, end))
except StopIteration:
pass
return slices
Expand Down Expand Up @@ -612,6 +612,7 @@ def Tokens(self):

yield AttrName, m.start(1), m.end(1)

#log('m.groups() %r', m.groups())
if m.group(2) is not None:
# double quoted
yield QuotedValue, m.start(2), m.end(2)
Expand All @@ -620,6 +621,10 @@ def Tokens(self):
yield QuotedValue, m.start(3), m.end(3)
elif m.group(4) is not None:
yield UnquotedValue, m.start(4), m.end(4)
else:
# <button disabled>
end = m.end(0)
yield MissingValue, end, end

# Skip past the "
pos = m.end(0)
Expand Down
31 changes: 28 additions & 3 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ def testDotAll(self):
p4 = re.compile(r'[^>]')
print(p4.match('\n'))

def testAttrRe(self):
_ATTR_RE = html._ATTR_RE
m = _ATTR_RE.match(' empty= missing')
print(m.groups())


class FunctionsTest(unittest.TestCase):

Expand Down Expand Up @@ -69,9 +74,9 @@ def testTagLexer(self):
_PrintTokens(lex)

# Note: we could have a different HasAttr() method
# <a novalue> means lex.Get('novalue') == None
# <a novalue> means lex.Get('novalue') == ''
# https://developer.mozilla.org/en-US/docs/Web/API/Element/hasAttribute
self.assertEqual(None, lex.GetAttrRaw('novalue'))
self.assertEqual('', lex.GetAttrRaw('novalue'))

lex = _MakeTagLexer('<a href="double quoted">')
_PrintTokens(lex)
Expand Down Expand Up @@ -104,12 +109,32 @@ def testAllAttrs(self):
lex = _MakeTagLexer('<a href="?foo=1&amp;bar=2" />')
self.assertEqual([('href', '?foo=1&amp;bar=2')], lex.AllAttrsRaw())

def testAttrWithoutValue(self):
def testEmptyMissingValues(self):
# equivalent to <button disabled="">
lex = _MakeTagLexer('<button disabled>')
all_attrs = lex.AllAttrsRaw()
self.assertEqual([('disabled', '')], all_attrs)

slices = lex.AllAttrsRawSlice()
log('slices %s', slices)

lex = _MakeTagLexer(
'''<p double="" single='' empty= missing missing2>''')
all_attrs = lex.AllAttrsRaw()
self.assertEqual([
('double', ''),
('single', ''),
('empty', ''),
('missing', ''),
('missing2', ''),
], all_attrs)
# TODO: should have
log('all %s', all_attrs)

slices = lex.AllAttrsRawSlice()
log('slices %s', slices)

def testInvalidTag(self):
try:
lex = _MakeTagLexer('<a foo=bar !></a>')
all_attrs = lex.AllAttrsRaw()
Expand Down

0 comments on commit d5fccf5

Please sign in to comment.