From b450511768bc0651fe3c7de325ebbc24975761f3 Mon Sep 17 00:00:00 2001 From: Andy C Date: Mon, 13 Jan 2025 14:36:58 -0500 Subject: [PATCH] [lazylex/html] Lex > separately, and convert it to &gt; in XML. Added tests. --- lazylex/html.py | 16 ++++++++++++---- lazylex/html_test.py | 25 +++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/lazylex/html.py b/lazylex/html.py index 12e7ed597..7e1fa23c6 100755 --- a/lazylex/html.py +++ b/lazylex/html.py @@ -113,7 +113,7 @@ def Print(self, s): # HTML Tokens # CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible -TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand Invalid EndOfStream'.split( +TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData BadAmpersand BadGreaterThan BadLessThan Invalid EndOfStream'.split( ) @@ -226,8 +226,10 @@ def MakeLexer(rules): # # - My early blog has THREE errors when disallowing > # - So do some .wwz files - (r'[^&<\x00]+', Tok.RawData), - (r'.', Tok.Invalid), # error! 
+ (r'[^&<>\x00]+', Tok.RawData), + (r'>', Tok.BadGreaterThan), + # < is an error + (r'.', Tok.Invalid), ] # Old notes: @@ -781,7 +783,8 @@ def ToText(s, left_pos=0, right_pos=-1): pos = left_pos for tok_id, end_pos in ValidTokens(s, left_pos, right_pos): - if tok_id in (Tok.RawData, Tok.BadAmpersand): + if tok_id in (Tok.RawData, Tok.BadAmpersand, Tok.BadGreaterThan, + Tok.BadLessThan): out.SkipTo(pos) out.PrintUntil(end_pos) @@ -974,6 +977,11 @@ def ToXml(htm8_str): #out.SkipTo(pos) out.Print('&') out.SkipTo(end_pos) + + elif tok_id == Tok.BadGreaterThan: + #out.SkipTo(pos) + out.Print('>') + out.SkipTo(end_pos) else: out.PrintUntil(end_pos) diff --git a/lazylex/html_test.py b/lazylex/html_test.py index 5e40ff710..e02e1c4df 100755 --- a/lazylex/html_test.py +++ b/lazylex/html_test.py @@ -336,6 +336,25 @@ def testStartTag(self): (Tok.EndOfStream, 9), ], tokens) + def testBad(self): + Tok = html.Tok + + h = '&' + tokens = Lex(h) + + self.assertEqual([ + (Tok.BadAmpersand, 1), + (Tok.EndOfStream, 1), + ], tokens) + + h = '>' + tokens = Lex(h) + + self.assertEqual([ + (Tok.BadGreaterThan, 1), + (Tok.EndOfStream, 1), + ], tokens) + def testInvalid(self): Tok = html.Tok @@ -406,10 +425,12 @@ def testValid(self): ('', UNCHANGED), # allowed, but 3 < 4 is not allowed - (' 3 > 4 ', ''), + (' 3 > 4 ', ' 3 > 4 '), # allowed, but 3 > 4 is not allowed ('

', ''), - ('link', ''), + ('link', UNCHANGED), + + # TODO: should be self-closing #('', ''), ('', ''),