Skip to content

Commit

Permalink
[lazylex/html] Lex > separately, and convert it to &gt; in XML.
Browse files Browse the repository at this point in the history
Added tests.
  • Loading branch information
Andy C committed Jan 13, 2025
1 parent 03e5cd0 commit b450511
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 6 deletions.
16 changes: 12 additions & 4 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def Print(self, s):

# HTML Tokens
# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand Invalid EndOfStream'.split(
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split(
)


Expand Down Expand Up @@ -226,8 +226,10 @@ def MakeLexer(rules):
#
# - My early blog has THREE errors when disallowing >
# - So do some .wwz files
(r'[^&<\x00]+', Tok.RawData),
(r'.', Tok.Invalid), # error!
(r'[^&<>\x00]+', Tok.RawData),
(r'>', Tok.BadGreaterThan),
# < is an error
(r'.', Tok.Invalid),
]

# Old notes:
Expand Down Expand Up @@ -781,7 +783,8 @@ def ToText(s, left_pos=0, right_pos=-1):

pos = left_pos
for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
if tok_id in (Tok.RawData, Tok.BadAmpersand):
if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan,
Tok.BadLessThan):
out.SkipTo(pos)
out.PrintUntil(end_pos)

Expand Down Expand Up @@ -974,6 +977,11 @@ def ToXml(htm8_str):
#out.SkipTo(pos)
out.Print('&amp;')
out.SkipTo(end_pos)

elif tok_id == Tok.BadGreaterThan:
#out.SkipTo(pos)
out.Print('&gt;')
out.SkipTo(end_pos)
else:
out.PrintUntil(end_pos)

Expand Down
25 changes: 23 additions & 2 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,25 @@ def testStartTag(self):
(Tok.EndOfStream, 9),
], tokens)

def testBad(self):
Tok = html.Tok

h = '&'
tokens = Lex(h)

self.assertEqual([
(Tok.BadAmpersand, 1),
(Tok.EndOfStream, 1),
], tokens)

h = '>'
tokens = Lex(h)

self.assertEqual([
(Tok.BadGreaterThan, 1),
(Tok.EndOfStream, 1),
], tokens)

def testInvalid(self):
Tok = html.Tok

Expand Down Expand Up @@ -406,10 +425,12 @@ def testValid(self):
('<script><![CDATA[ <wtf> >< ]]></script>', UNCHANGED),

# allowed, but 3 < 4 is not allowed
('<a> 3 > 4 </a>', ''),
('<a> 3 > 4 </a>', '<a> 3 &gt; 4 </a>'),
# allowed, but 3 > 4 is not allowed
('<p x="3 < 4"></p>', ''),
('<b><a href="foo">link</a></b>', ''),
('<b><a href="foo">link</a></b>', UNCHANGED),

# TODO: should be self-closing
#('<meta><a></a>', '<meta/><a></a>'),
('<meta><a></a>', ''),

Expand Down

0 comments on commit b450511

Please sign in to comment.