From 43084cf3839f26c283232014bda14a16247a7a54 Mon Sep 17 00:00:00 2001
From: Andy C
Date: Fri, 10 Jan 2025 11:29:09 -0500
Subject: [PATCH] [lazylex/html] Support <![CDATA[

It's found in XML files.

And refactor the unit tests.
---
 lazylex/html.py      |  19 ++++--
 lazylex/html_test.py | 152 +++++++++++++++++--------------------------
 2 files changed, 74 insertions(+), 97 deletions(-)

diff --git a/lazylex/html.py b/lazylex/html.py
index 5a6433432..7dadcfb21 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -80,8 +80,8 @@ def Print(self, s):
 
 # HTML Tokens
 
-# CommentBegin and ProcessingBegin are "pseudo-tokens", not visible
-TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData Invalid EndOfStream'.split(
+# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
+TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split(
 )
@@ -169,6 +169,7 @@ def MakeLexer(rules):
     # They are used for the XML comment:
     #   <?xml version= ... ?>
     (r'<\?', Tok.ProcessingBegin),
+    (r'<!\[CDATA\[', Tok.CDataBegin),
 
     (r'<! [^>]+ >', Tok.Decl),  # <!DOCTYPE html>
@@ -229,14 +230,12 @@ def _Peek(self):
                 raise LexError(self.s, self.pos)
             self.search_state = None
             # beginning
-            return Tok.CData, pos
+            return Tok.HtmlCData, pos
 
         # Find the first match.
         # Note: frontend/match.py uses _LongestMatch(), which is different!
         # TODO: reconcile them.  This lexer should be expressible in re2c.
-        # TODO: Get rid of non-greedy match
-
         for pat, tok_id in LEXER:
             m = pat.match(self.s, self.pos)
             if m:
@@ -254,6 +253,13 @@ def _Peek(self):
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>
 
+                if tok_id == Tok.CDataBegin:
+                    pos = self.s.find(']]>', self.pos)
+                    if pos == -1:
+                        # unterminated <![CDATA[
+                        raise LexError(self.s, self.pos)
+                    return Tok.CData, pos + 3  # ]]>
+
                 if tok_id == Tok.StartTag:
                     tag_name = m.group(1)  # captured
                     if tag_name == 'script':
@@ -331,7 +337,8 @@ def ValidTokens(s, left_pos=0, right_pos=-1):
 
 _ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens
 
-# TODO: capture tag name above?
+# TODO: we don't need to capture the tag name here? That's done at the top
+# level
 _TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)
 
 # To match href="foo"
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index f0eb372e8..de4a1c635 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -96,6 +96,15 @@ def testAllAttrs(self):
         self.assertEqual([('href', '?foo=1&bar=2')], lex.AllAttrsRaw())
 
 
+def Lex(h):
+    print(repr(h))
+    lex = html.ValidTokens(h)
+    tokens = list(lex)
+    for tok_id, end_pos in tokens:
+        log('%d %s', end_pos, html.TokenName(tok_id))
+    return tokens
+
+
 class LexerTest(unittest.TestCase):
 
     # IndexLinker in devtools/make_help.py
@@ -122,116 +131,77 @@ def testCommentParse2(self):
         h = '''
         hi <!-- line 1
                 line 2 --><br/>'''
-        print(repr(h))
-        lex = html.ValidTokens(h)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(12, pos)
-        self.assertEqual(Tok.RawData, tok_id)
-
-        tok_id, pos = next(lex)
-        log('tok %r', html.TokenName(tok_id))
-        self.assertEqual(50, pos)
-        self.assertEqual(Tok.Comment, tok_id)
+        tokens = Lex(h)
 
-        tok_id, pos = next(lex)
-        self.assertEqual(55, pos)
-        self.assertEqual(Tok.StartEndTag, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(55, pos)
-        self.assertEqual(Tok.EndOfStream, tok_id)
+        self.assertEqual(
+            [
+                (Tok.RawData, 12),
+                (Tok.Comment, 50),  # <!-- ... -->
+                (Tok.StartEndTag, 55),
+                (Tok.EndOfStream, 55),
+            ],
+            tokens)
 
     def testProcessingInstruction(self):
-        # The TOP level should understand the <? ?> syntax, because otherwise
-        # it will be a start tag
-
+        # <?xml ?> header
         Tok = html.Tok
         h = 'hi <? err ?>'
-        print(repr(h))
-        lex = html.ValidTokens(h)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(3, pos)
-        self.assertEqual(Tok.RawData, tok_id)
+        tokens = Lex(h)
 
-        tok_id, pos = next(lex)
-        self.assertEqual(12, pos)
-        log('tok %r', html.TokenName(tok_id))
-        self.assertEqual(Tok.Processing, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(12, pos)
-        log('tok %r', html.TokenName(tok_id))
-        self.assertEqual(Tok.EndOfStream, tok_id)
+        self.assertEqual(
+            [
+                (Tok.RawData, 3),
+                (Tok.Processing, 12),  # <? err ?>
+                (Tok.EndOfStream, 12),
+            ],
+            tokens)
 
     def testScriptStyle(self):
-
         Tok = html.Tok
         h = '''
         hi <script src=""> ... </script>
         '''
-        print(repr(h))
-        lex = html.ValidTokens(h)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(12, pos)
-        self.assertEqual(Tok.RawData, tok_id)
-
-        #
-        tok_id, pos = next(lex)
-        log('tok %r', html.TokenName(tok_id))
-        self.assertEqual(87, pos)
-        self.assertEqual(Tok.EndTag, tok_id)
-
-    def testValid(self):
+        tokens = Lex(h)
+
+        self.assertEqual(
+            [
+                (Tok.RawData, 12),
+                (Tok.StartTag, 27),  # <script src="">
+                (Tok.HtmlCData, 78),
+                (Tok.EndTag, 87),  # </script>
+                (Tok.RawData, 96),  # \n
+                (Tok.EndOfStream, 96),  # \n
+            ],
+            tokens)
+
+    def testCData(self):
         Tok = html.Tok
-        lex = html.ValidTokens('<a>hi</a>')
-
-        tok_id, pos = next(lex)
-        self.assertEqual(3, pos)
-        self.assertEqual(Tok.StartTag, tok_id)
+        # from
+        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
+        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
+        tokens = Lex(h)
 
-        tok_id, pos = next(lex)
-        self.assertEqual(5, pos)
-        self.assertEqual(Tok.RawData, tok_id)
+        self.assertEqual([
+            (Tok.StartTag, 9),
+            (Tok.CData, 61),
+            (Tok.EndTag, 71),
+            (Tok.EndOfStream, 71),
+        ], tokens)
 
-        tok_id, pos = next(lex)
-        self.assertEqual(9, pos)
-        self.assertEqual(Tok.EndTag, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(9, pos)
-        self.assertEqual(Tok.EndOfStream, tok_id)
-
-        lex = html.Lexer('<a>hi</a>')
-        while True:
-            tok_id, pos = lex.Read()
-            print('%d %s' % (pos, html.TokenName(tok_id)))
-            if tok_id == Tok.EndOfStream:
-                break
+    def testStartTag(self):
+        Tok = html.Tok
 
-        return
-        tok_id, pos = next(lex)
-        self.assertEqual(9, pos)
-        self.assertEqual(Tok.EndOfStream, tok_id)
+        h = '<a>hi</a>'
+        tokens = Lex(h)
 
-        while True:
-            tok_id, pos = next(lex)
-            print('%d %s' % (pos, html.TokenName(tok_id)))
+        self.assertEqual([
+            (Tok.StartTag, 3),
+            (Tok.RawData, 5),
+            (Tok.EndTag, 9),
+            (Tok.EndOfStream, 9),
+        ], tokens)
 
     def testInvalid(self):
         Tok = html.Tok
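
Quick usage sketch (not part of the commit itself): with this change, a <![CDATA[ ... ]]> section comes back from the lexer as a single CData token whose end position is just past the closing ]]>. The snippet below only assumes the lazylex.html API visible in the diff above (ValidTokens(), TokenName(), and the Tok ids); the ShowTokens name and the input string are made up for illustration.

    """Sketch: print the token stream for a small XML fragment with CDATA."""
    from lazylex import html  # run from the repo root, as html_test.py does


    def ShowTokens(s):
        # ValidTokens() yields (tok_id, end_pos) pairs, like Lex() in html_test.py
        for tok_id, end_pos in html.ValidTokens(s):
            print('%3d %s' % (end_pos, html.TokenName(tok_id)))


    # Illustrative input, not taken from the test suite.  Expected output:
    # StartTag, CData, EndTag, EndOfStream -- the < > && inside the CDATA
    # section never reach the tag-level rules.
    ShowTokens('<x><![CDATA[ 1 < 2 && 3 > 2 ]]></x>')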