[lazylex/html] Support <![CDATA[ ... ]]>
It's found in XML files.

And refactor the unit tests.
Andy C committed Jan 10, 2025
1 parent 089f900 commit 43084cf
Showing 2 changed files with 74 additions and 97 deletions.
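For readers unfamiliar with the construct: a CDATA section lets an XML document carry text containing < and & without escaping. The commit recognizes the opener with a regex, then scans for the literal terminator. A minimal standalone sketch of that approach (the names here are illustrative, not the module's):

import re

CDATA_BEGIN = re.compile(r'<!\[CDATA\[')

def read_cdata(s, pos):
    # Match the opener, then scan for the ']]>' terminator.
    m = CDATA_BEGIN.match(s, pos)
    if not m:
        raise ValueError('expected <![CDATA[ at %d' % pos)
    end = s.find(']]>', m.end())
    if end == -1:
        raise ValueError('unterminated <![CDATA[')
    return s[m.end():end], end + 3  # (contents, position past ]]>)

print(read_cdata('<![CDATA[a < b && c]]>', 0))  # ('a < b && c', 22)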
19 changes: 13 additions & 6 deletions lazylex/html.py
@@ -80,8 +80,8 @@ def Print(self, s):


# HTML Tokens
-# CommentBegin and ProcessingBegin are "pseudo-tokens", not visible
-TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData Invalid EndOfStream'.split(
+# CommentBegin, ProcessingBegin, CDataBegin are "pseudo-tokens", not visible
+TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin CData CDataBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData HtmlCData Invalid EndOfStream'.split(
)
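The TOKENS ... .split() idiom suggests the names become integer constants on a Tok namespace, with a reverse lookup for debugging output like html.TokenName() in the tests below. A sketch of that pattern (an assumption about the module's mechanism, which this hunk doesn't show):

TOKEN_NAMES = 'Decl Comment Processing CData HtmlCData RawData EndOfStream'.split()

class Tok(object):
    pass

for i, name in enumerate(TOKEN_NAMES):
    setattr(Tok, name, i)

def TokenName(tok_id):
    return TOKEN_NAMES[tok_id]

print(Tok.CData, TokenName(Tok.CData))  # 3 CData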


@@ -169,6 +169,7 @@ def MakeLexer(rules):
    # They are used for the XML declaration:
    # <?xml version="1.0" encoding="UTF-8"?>
    (r'<\?', Tok.ProcessingBegin),
+    (r'<!\[CDATA\[', Tok.CDataBegin),  # <![CDATA[

    # NOTE: < is allowed in these?
    (r'<! [^>]+ >', Tok.Decl),  # <!DOCTYPE html>
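Rule order matters here: per the comment in _Peek below, this lexer takes the first matching rule rather than the longest match, and the new CDataBegin rule sits above the Decl rule because `<! [^>]+ >` would otherwise swallow an opener like `<![CDATA[value>` as a declaration. A toy first-match loop under those assumptions (token names as strings for brevity):

import re

# Illustrative subset of the rule table above.
RULES = [
    (r'<\?', 'ProcessingBegin'),
    (r'<!\[CDATA\[', 'CDataBegin'),  # must come before the Decl rule
    (r'<! [^>]+ >', 'Decl'),
    (r'[^&<]+', 'RawData'),
]
LEXER = [(re.compile(pat, re.VERBOSE), tok) for pat, tok in RULES]

def first_match(s, pos):
    # First match wins, unlike frontend/match.py's longest-match strategy.
    for pat, tok in LEXER:
        m = pat.match(s, pos)
        if m:
            return tok, m.end()
    raise ValueError('no rule matched at %d' % pos)

print(first_match('<![CDATA[x]]>', 0))  # ('CDataBegin', 9)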
@@ -229,14 +230,12 @@ def _Peek(self):
                raise LexError(self.s, self.pos)
            self.search_state = None
            # beginning
-            return Tok.CData, pos
+            return Tok.HtmlCData, pos

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.

-        # TODO: Get rid of non-greedy match
-
        for pat, tok_id in LEXER:
            m = pat.match(self.s, self.pos)
            if m:
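The search_state dance above is how <script> and <style> bodies become a single HtmlCData token: matching the start tag records a closing-tag needle, and the next _Peek call returns everything up to it. A condensed sketch of the second step (assuming the stored needle is something like '</script'; LexError stands in as ValueError):

def read_script_body(s, pos):
    # Find the closing tag and return everything before it as one token.
    # The '<' of '</script' is intentionally *not* consumed, so the next
    # read lexes the end tag itself.
    end = s.find('</script', pos)
    if end == -1:
        raise ValueError('unterminated <script>')
    return 'HtmlCData', end

print(read_script_body('<script>if (a < b) f()</script>', 8))
# ('HtmlCData', 22)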
@@ -254,6 +253,13 @@ def _Peek(self):
                        raise LexError(self.s, self.pos)
                    return Tok.Processing, pos + 2  # ?>

+                if tok_id == Tok.CDataBegin:
+                    pos = self.s.find(']]>', self.pos)
+                    if pos == -1:
+                        # unterminated <![CDATA[
+                        raise LexError(self.s, self.pos)
+                    return Tok.CData, pos + 3  # ]]>
+
                if tok_id == Tok.StartTag:
                    tag_name = m.group(1)  # captured
                    if tag_name == 'script':
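All of the pseudo-tokens follow the same shape: match a short opener, str.find() the terminator, and return the full token so callers never observe the *Begin ids. The shared pattern, extracted for illustration (not a function in the module):

def resolve_pseudo(s, pos, terminator, tok_id):
    # Scan from the opener to the terminator; the returned end position
    # includes the terminator itself, as in _Peek above.
    end = s.find(terminator, pos)
    if end == -1:
        raise ValueError('unterminated token at %d' % pos)  # LexError in the module
    return tok_id, end + len(terminator)

print(resolve_pseudo('<?xml version="1.0"?> rest', 0, '?>', 'Processing'))
# ('Processing', 21)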
@@ -331,7 +337,8 @@ def ValidTokens(s, left_pos=0, right_pos=-1):

_ATTR_VALUE = r'[a-zA-Z0-9_\-]+'  # allow hyphens

-# TODO: capture tag name above?
+# TODO: we don't need to capture the tag name here? That's done at the top
+# level
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# To match href="foo"
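_TAG_RE is applied to the text between the angle brackets, so the optional leading / lets one regex capture the name of both start and end tags. A quick demonstration, assuming a _NAME pattern along the lines of the one below (the real _NAME is defined elsewhere in the file):

import re

_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # assumed definition
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

print(_TAG_RE.match('a href="x"').group(1))  # 'a'  (start tag <a href="x">)
print(_TAG_RE.match('/a').group(1))          # 'a'  (end tag </a>)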
152 changes: 61 additions & 91 deletions lazylex/html_test.py
@@ -96,6 +96,15 @@ def testAllAttrs(self):
        self.assertEqual([('href', '?foo=1&amp;bar=2')], lex.AllAttrsRaw())


+def Lex(h):
+    print(repr(h))
+    lex = html.ValidTokens(h)
+    tokens = list(lex)
+    for tok_id, end_pos in tokens:
+        log('%d %s', end_pos, html.TokenName(tok_id))
+    return tokens
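This helper is what lets the tests below shrink: each one builds the complete token list up front and asserts a single expected list of (token id, end position) pairs, instead of stepping the generator and checking every token by hand.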


class LexerTest(unittest.TestCase):

    # IndexLinker in devtools/make_help.py
@@ -122,116 +131,77 @@ def testCommentParse2(self):
h = '''
hi <!-- line 1
line 2 --><br/>'''
-        print(repr(h))
-        lex = html.ValidTokens(h)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(12, pos)
-        self.assertEqual(Tok.RawData, tok_id)
-
-        tok_id, pos = next(lex)
-        log('tok %r', html.TokenName(tok_id))
-        self.assertEqual(50, pos)
-        self.assertEqual(Tok.Comment, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(55, pos)
-        self.assertEqual(Tok.StartEndTag, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(55, pos)
-        self.assertEqual(Tok.EndOfStream, tok_id)
+        tokens = Lex(h)
+
+        self.assertEqual(
+            [
+                (Tok.RawData, 12),
+                (Tok.Comment, 50),  # <!-- ... -->
+                (Tok.StartEndTag, 55),
+                (Tok.EndOfStream, 55),
+            ],
+            tokens)

    def testProcessingInstruction(self):
        # The TOP level should understand the <? ?> syntax, because otherwise
        # it would be treated as a start tag

        # <?xml ?> header
        Tok = html.Tok
        h = 'hi <? err ?>'
-        print(repr(h))
-        lex = html.ValidTokens(h)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(3, pos)
-        self.assertEqual(Tok.RawData, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(12, pos)
-        log('tok %r', html.TokenName(tok_id))
-        self.assertEqual(Tok.Processing, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(12, pos)
-        log('tok %r', html.TokenName(tok_id))
-        self.assertEqual(Tok.EndOfStream, tok_id)
+        tokens = Lex(h)
+
+        self.assertEqual(
+            [
+                (Tok.RawData, 3),
+                (Tok.Processing, 12),  # <? err ?>
+                (Tok.EndOfStream, 12),
+            ],
+            tokens)

    def testScriptStyle(self):

        Tok = html.Tok
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
-        print(repr(h))
-        lex = html.ValidTokens(h)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(12, pos)
-        self.assertEqual(Tok.RawData, tok_id)
-
-        # <script>
-        tok_id, pos = next(lex)
-        self.assertEqual(27, pos)
-        self.assertEqual(Tok.StartTag, tok_id)
-
-        # JavaScript code is CData
-        tok_id, pos = next(lex)
-        log('tok %r', html.TokenName(tok_id))
-        self.assertEqual(78, pos)
-        self.assertEqual(Tok.CData, tok_id)
-
-        # </script>
-        tok_id, pos = next(lex)
-        log('tok %r', html.TokenName(tok_id))
-        self.assertEqual(87, pos)
-        self.assertEqual(Tok.EndTag, tok_id)
+        tokens = Lex(h)
+
+        self.assertEqual(
+            [
+                (Tok.RawData, 12),
+                (Tok.StartTag, 27),  # <script>
+                (Tok.HtmlCData, 78),  # JavaScript code is HTML CData
+                (Tok.EndTag, 87),  # </script>
+                (Tok.RawData, 96),  # \n
+                (Tok.EndOfStream, 96),
+            ],
+            tokens)

-    def testValid(self):
+    def testCData(self):
        Tok = html.Tok

-        lex = html.ValidTokens('<a>hi</a>')
-
-        tok_id, pos = next(lex)
-        self.assertEqual(3, pos)
-        self.assertEqual(Tok.StartTag, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(5, pos)
-        self.assertEqual(Tok.RawData, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(9, pos)
-        self.assertEqual(Tok.EndTag, tok_id)
-
-        tok_id, pos = next(lex)
-        self.assertEqual(9, pos)
-        self.assertEqual(Tok.EndOfStream, tok_id)
-
-        lex = html.Lexer('<a>hi</a>')
-        while True:
-            tok_id, pos = lex.Read()
-            print('%d %s' % (pos, html.TokenName(tok_id)))
-            if tok_id == Tok.EndOfStream:
-                break
-
-        return
-        tok_id, pos = next(lex)
-        self.assertEqual(9, pos)
-        self.assertEqual(Tok.EndOfStream, tok_id)
-
-        while True:
-            tok_id, pos = next(lex)
-            print('%d %s' % (pos, html.TokenName(tok_id)))
+        # from
+        # /home/andy/src/languages/Python-3.11.5/Lib/test/xmltestdata/c14n-20/inC14N4.xml
+        h = '<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'
+        tokens = Lex(h)
+
+        self.assertEqual([
+            (Tok.StartTag, 9),
+            (Tok.CData, 61),
+            (Tok.EndTag, 71),
+            (Tok.EndOfStream, 71),
+        ], tokens)
+
+    def testStartTag(self):
+        Tok = html.Tok
+
+        h = '<a>hi</a>'
+        tokens = Lex(h)
+
+        self.assertEqual([
+            (Tok.StartTag, 3),
+            (Tok.RawData, 5),
+            (Tok.EndTag, 9),
+            (Tok.EndOfStream, 9),
+        ], tokens)

    def testInvalid(self):
        Tok = html.Tok
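Taken together, the tests pin down the naming this commit introduces: Tok.CData is an explicit XML <![CDATA[ ... ]]> section, while Tok.HtmlCData is the implicit raw text inside <script> or <style>. A usage sketch of the generator API exercised above (the printed positions are worked out by hand from the implementation in this diff, so treat them as an assumption):

from lazylex import html  # assuming this repo's package layout

for tok_id, end_pos in html.ValidTokens('<x><![CDATA[a < b]]></x>'):
    print(html.TokenName(tok_id), end_pos)
# StartTag 3
# CData 20
# EndTag 24
# EndOfStream 24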
