Skip to content

Commit

Permalink
[lazylex/html] Debug conflict between HTML5 and XML
Browse files Browse the repository at this point in the history
In XML, <style> <script> and void tags like <source> are not special
  • Loading branch information
Andy C committed Jan 11, 2025
1 parent 91263bb commit 72d3784
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 4 deletions.
37 changes: 34 additions & 3 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,18 @@
TODO: This should be an Oils library eventually. It's a "lazily-parsed data
structure" like TSV8
Conflicts between HTML5 and XML:
- In XML, <source> is like any tag, and must be closed,
- In HTML, <source> is a VOID tag, and must NOT be closed
- In XML, <script> and <style> don't have special treatment
- In HTML, they do
- The header is different - <!DOCTYPE html> vs. <?xml version= ... ?>
So do we have a mode for <script> <style> and void tags? Upgrade HX8 into HTM8?
"""
from __future__ import print_function

Expand Down Expand Up @@ -681,7 +693,25 @@ def ToText(s, left_pos=0, right_pos=-1):
def main(argv):
action = argv[1]

if action in ('lex-tags', 'lex-attrs', 'lex-attr-values', 'well-formed'):
if action == 'tokens':
contents = sys.stdin.read()

lx = Lexer(contents)
start_pos = 0
while True:
tok_id, end_pos = lx.Read()
if tok_id == Tok.Invalid:
raise LexError(contents, start_pos)
if tok_id == Tok.EndOfStream:
break

frag = contents[start_pos:end_pos]
log('%d %s %r', end_pos, TokenName(tok_id), frag)
start_pos = end_pos

return 0

elif action in ('lex-tags', 'lex-attrs', 'lex-attr-values', 'well-formed'):
num_tokens = 0
num_start_tags = 0
num_start_end_tags = 0
Expand Down Expand Up @@ -729,6 +759,7 @@ def main(argv):
# Don't bother to check
if tag_name not in VOID_ELEMENTS:
tag_stack.append(tag_name)

max_tag_stack = max(max_tag_stack, len(tag_stack))
elif tok_id == Tok.EndTag:
try:
Expand All @@ -741,8 +772,8 @@ def main(argv):
actual = lx.TagName()
if expected != actual:
raise ParseError(
'Expected closing tag %r, got %r' %
(expected, actual),
'Got unexpected closing tag %r; opening tag was %r'
% (contents[start_pos:end_pos], expected),
s=contents,
start_pos=start_pos)

Expand Down
35 changes: 34 additions & 1 deletion lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,11 @@ def Lex(h):
print(repr(h))
lex = html.ValidTokens(h)
tokens = list(lex)
start_pos = 0
for tok_id, end_pos in tokens:
log('%d %s', end_pos, html.TokenName(tok_id))
frag = h[start_pos:end_pos]
log('%d %s %r', end_pos, html.TokenName(tok_id), frag)
start_pos = end_pos
return tokens


Expand Down Expand Up @@ -229,6 +232,36 @@ def testStartTag(self):
(Tok.EndOfStream, 9),
], tokens)

# Make sure we don't consume too much
h = '<a><source>1.7</source></a>'

tokens = Lex(h)

self.assertEqual([
(Tok.StartTag, 3),
(Tok.StartTag, 11),
(Tok.RawData, 14),
(Tok.EndTag, 23),
(Tok.EndTag, 27),
(Tok.EndOfStream, 27),
], tokens)

return

h = '''
<configuration>
<source>1.7</source>
</configuration>'''

tokens = Lex(h)

self.assertEqual([
(Tok.RawData, 9),
(Tok.StartTag, 24),
(Tok.RawData, 9),
(Tok.EndOfStream, 9),
], tokens)

def testInvalid(self):
Tok = html.Tok

Expand Down

0 comments on commit 72d3784

Please sign in to comment.