Skip to content

Commit

Permalink
[lazylex/html] Start conversion to XML
Browse files Browse the repository at this point in the history
Can correct a bare & to &amp;

This will help refine the TagLexer and AttrValueLexer APIs as well,
although it's looking like significant work too.
  • Loading branch information
Andy C committed Jan 13, 2025
1 parent 4b795d5 commit 03e5cd0
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 10 deletions.
56 changes: 53 additions & 3 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -919,18 +919,68 @@ def Validate(contents, flags, counters):
counters.num_tokens += len(tokens)


def ToXml(htm8_str):
    # type: (str) -> str
    """Convert an HTM8 string to XML text (work in progress).

    The input is lexed token by token and copied to an in-memory buffer.
    The only rewrite performed so far is that a bare '&' (Tok.BadAmpersand)
    is emitted as '&amp;'.  Start tags have their attributes walked, but
    nothing about them is rewritten yet.

    Args:
      htm8_str: the document to convert.

    Returns:
      The converted document as a string.

    Raises:
      LexError: if the lexer reports Tok.Invalid.
    """
    # Remaining TODO:
    # 1. Lex it
    # 2. < & > must be escaped
    #    a. in raw data
    #    b. in quoted strings
    # 3. <script> turned into CDATA
    # 4. void tags turned into self-closing tags
    # 5. case-sensitive tag matching - not sure about this

    tag_lexer = TagLexer(htm8_str)
    val_lexer = AttrValueLexer(htm8_str)

    f = StringIO()
    out = Output(htm8_str, f)

    lx = Lexer(htm8_str)

    # Start offset of the token currently being processed.
    pos = 0
    while True:
        tok_id, end_pos = lx.Read()

        if tok_id == Tok.Invalid:
            raise LexError(htm8_str, pos)
        if tok_id == Tok.EndOfStream:
            break

        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
            out.PrintUntil(end_pos)

        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
            # Nothing is written for the tag here; its text is left for a
            # later PrintUntil() / PrintTheRest() call.
            tag_lexer.Reset(pos, end_pos)
            # TODO: reduce allocations here
            all_attrs = tag_lexer.AllAttrsRawSlice()
            for name, val_start, val_end in all_attrs:
                val_lexer.Reset(val_start, val_end)
                # TODO: get the kind of string
                #
                # Quoted:   we need to replace & with &amp; and < with &lt;
                #           note > is not allowed
                # Unquoted: right now, we can just surround with double
                #           quotes because we don't allow any bad chars
                # Empty   : add "", so empty= becomes =""
                # Missing : add ="", so missing becomes missing=""

            tag_name = lx.CanonicalTagName()
            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
                # TODO: instead of closing >, print />
                pass

        elif tok_id == Tok.BadAmpersand:
            # Flush the source text that precedes the '&' before writing the
            # replacement.  Tag tokens above emit nothing themselves, so a
            # '&' directly after a tag would otherwise be written ahead of
            # that tag, and SkipTo() would then drop the tag.
            # NOTE(review): assumes Output keeps a cursor into htm8_str, as
            # the PrintUntil/SkipTo names suggest -- confirm.
            out.PrintUntil(pos)
            out.Print('&amp;')
            out.SkipTo(end_pos)

        else:
            out.PrintUntil(end_pos)

        pos = end_pos

    out.PrintTheRest()
    return f.getvalue()


class Counters(object):
Expand Down
16 changes: 9 additions & 7 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,14 +370,18 @@ def testValid(self):
'<STYLEz><</STYLEz>',
]

# Markers used in the "expected XML" column of the test tables.
# NOTE(review): how the test driver interprets them is not visible here;
# presumably UNCHANGED means "output equals input" -- confirm.
SKIP = 0
UNCHANGED = 1

# (input, expected XML) pairs.  Each input appears exactly once so that a
# case never carries two conflicting expectations.
VALID_LEX = [
    # TODO: convert these to XML
    ('<foo></foo>', UNCHANGED),
    ('<foo x=y></foo>', ''),
    # Not converted yet; the intended expectation is:
    #('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
    ('<foo x="&"></foo>', ''),

    # Allowed with BadAmpersand
    ('<p> x & y </p>', '<p> x &amp; y </p>'),
]

INVALID_PARSE = [
Expand All @@ -386,20 +390,16 @@ def testValid(self):
'<meta></meta>', # this is a self-closing tag
]

SKIP = 0
UNCHANGED = 1

VALID_PARSE = [
('<!DOCTYPE html>\n', ''),
('<!DOCTYPE>', ''),

# empty strings
('<p x=""></p>', UNCHANGED),
("<p x=''></p>", UNCHANGED),

('<self-closing a="b" />', UNCHANGED),

# We could also normalize CDATA?
# We could also normalize CDATA?
# Note that CDATA has an escaping problem: you need to handle it ]]> with
# concatenation. It just "pushes the problem around".
# So I think it's better to use ONE kind of escaping, which is &lt;
Expand All @@ -410,7 +410,9 @@ def testValid(self):
# allowed, but 3 > 4 is not allowed
('<p x="3 < 4"></p>', ''),
('<b><a href="foo">link</a></b>', ''),
#('<meta><a></a>', '<meta/><a></a>'),
('<meta><a></a>', ''),

# no attribute
('<button disabled></button>', ''),
('<button disabled=></button>', ''),
Expand Down

0 comments on commit 03e5cd0

Please sign in to comment.