Skip to content

Commit

Permalink
[lazylex] Working on <script> <style>
Browse files Browse the repository at this point in the history
The data inside them is CDATA.  This is actually a lexer mode ...

We just skip everything until the matching </script> or </style>.

There can't be any other tags nested inside them.  We also allow unescaped
< > & in the raw contents, although we still have to respect the escaped
form &lt;script> too.

I noticed that > is allowed in raw data, but not <.  Hm.
  • Loading branch information
Andy C committed Jan 7, 2025
1 parent 39ea432 commit 09e8d9a
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 34 deletions.
16 changes: 14 additions & 2 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ def Print(self, s):

# HTML Tokens
# CommentBegin and ProcessingBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData Invalid EndOfStream'.split(
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData CDataStartTag CDataEndTag Invalid EndOfStream'.split(
)


Expand Down Expand Up @@ -168,14 +168,17 @@ def MakeLexer(rules):

# NOTE: < is allowed in these.
(r'<! [^>]+ >', Tok.Decl), # <!DOCTYPE html>
(r'<(?:script|style) [^>]+>', Tok.CDataStartTag), # start <a>
(r'</ [^>]+ >', Tok.EndTag), # self-closing <br/> comes FIRST
(r'< [^>]+ />', Tok.StartEndTag), # end </a>
(r'< [^>]+ >', Tok.StartTag), # start <a>
(r'&\# [0-9]+ ;', Tok.DecChar),
(r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
(r'& [a-zA-Z]+ ;', Tok.CharEntity),

# Note: > is allowed in raw data.
# HTML5 allows > in raw data - should we? It's apparently not allowed in
# XML.
# But < is not allowed.
# https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
(r'[^&<]+', Tok.RawData),
(r'.', Tok.Invalid), # error!
Expand Down Expand Up @@ -225,6 +228,15 @@ def _Peek(self):
raise LexError(self.s, self.pos)
return Tok.Processing, pos + 2 # ?>

# TODO: we need to enter state so the NEXT call can be CData
# And then the one after that must be CDataEndTag.
if tok_id == Tok.CDataStartTag:
end_tag = '</script>'
pos = self.s.find(end_tag, self.pos)
if pos == -1:
# unterminated </script>
raise LexError(self.s, self.pos)

return tok_id, m.end()
else:
raise AssertionError('Tok.Invalid rule should have matched')
Expand Down
84 changes: 52 additions & 32 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,39 @@ def testProcessingInstruction(self):
log('tok %r', html.TokenName(tok_id))
self.assertEqual(Tok.EndOfStream, tok_id)

    def testScriptStyle(self):
        """Lex <script> contents: CDataStartTag, then (eventually) CData / CDataEndTag.

        Only the tokens up to CDataStartTag are checked today; the lexer
        doesn't yet emit CData or CDataEndTag (see the early return below).
        """

        Tok = html.Tok
        # NOTE(review): indentation inside this triple-quoted string is
        # significant — the asserted positions (12, 27) imply 8 leading
        # spaces before 'hi'.  Reconstructed from a diff view; confirm
        # against the original file.
        h = '''
        hi <script src=""> if (x < 1 && y > 2 ) { console.log(""); }
        </script>
        '''
        print(repr(h))
        lex = html.ValidTokens(h)

        # Leading raw data: the newline + 8 spaces + 'hi ' before '<script'.
        tok_id, pos = next(lex)
        self.assertEqual(12, pos)
        self.assertEqual(Tok.RawData, tok_id)

        # <script> start tag lexes as the new CDataStartTag token.
        tok_id, pos = next(lex)
        self.assertEqual(27, pos)
        self.assertEqual(Tok.CDataStartTag, tok_id)

        # Early exit: CData / CDataEndTag lexing isn't implemented yet,
        # so the assertions below are aspirational dead code for now.
        return

        # JavaScript code is CData
        tok_id, pos = next(lex)
        self.assertEqual(34, pos)
        log('tok %r', html.TokenName(tok_id))
        self.assertEqual(Tok.CData, tok_id)

        # </script>
        tok_id, pos = next(lex)
        self.assertEqual(27, pos)
        log('tok %r', html.TokenName(tok_id))
        self.assertEqual(Tok.CDataEndTag, tok_id)

def testValid(self):
Tok = html.Tok

Expand Down Expand Up @@ -205,38 +238,25 @@ def testValid(self):
def testInvalid(self):
Tok = html.Tok

lex = html.ValidTokens('<a>&')

tok_id, pos = next(lex)
self.assertEqual(3, pos)
self.assertEqual(Tok.StartTag, tok_id)

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')

# Comment
lex = html.ValidTokens('<!-- unfinished comment')

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')

# Processing
lex = html.ValidTokens('<? unfinished processing')

try:
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')
INVALID = [
# Should be &amp;
'<a>&',
# Hm > is allowed?
#'a > b',
'a < b',
'<!-- unfinished comment',
'<? unfinished processing',
]

for s in INVALID:
lex = html.ValidTokens(s)
try:
for i in xrange(10):
tok_id, pos = next(lex)
except html.LexError as e:
print(e)
else:
self.fail('Expected LexError')


if __name__ == '__main__':
Expand Down

0 comments on commit 09e8d9a

Please sign in to comment.