[lazylex/html] Special lexing rules for <script> and <style>
This works well on our corpus!

Also started testing XML.

Add a couple TODOs:

- tighten up end tags
- disallow unescaped <
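
The idea in miniature, as a standalone sketch (not the real lazylex/html.py
API): after emitting a StartTag for <script> or <style>, the lexer switches to
a "search" state and scans with str.find() for the matching end tag, yielding
everything in between as one CData token.

    import re

    # Simplified tag pattern; the real lexer validates tag names separately
    # and disallows leading whitespace.
    TAG_RE = re.compile(r'<\s*(/)?\s*([a-zA-Z][a-zA-Z0-9]*)[^>]*>')

    def tokens(s):
        pos = 0
        search = None  # '</script>' or '</style>' while inside those elements
        while pos < len(s):
            if search is not None:
                end = s.find(search, pos)
                if end == -1:
                    raise ValueError('unterminated %r' % search)
                yield 'CData', s[pos:end]
                pos = end  # leave the end tag for the normal rule below
                search = None
                continue
            m = TAG_RE.match(s, pos)
            if m:
                kind = 'EndTag' if m.group(1) else 'StartTag'
                if kind == 'StartTag' and m.group(2) in ('script', 'style'):
                    search = '</%s>' % m.group(2)
                yield kind, m.group(0)
                pos = m.end()
            else:
                # Everything up to the next < is raw data (entities omitted
                # here; the real lexer also handles &...; and errors).
                nxt = s.find('<', pos + 1)
                end = len(s) if nxt == -1 else nxt
                yield 'RawData', s[pos:end]
                pos = end

    # >>> list(tokens('<script>if (a < b) { go(); }</script>'))
    # [('StartTag', '<script>'), ('CData', 'if (a < b) { go(); }'),
    #  ('EndTag', '</script>')]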
Andy C committed Jan 10, 2025
1 parent bafbe3b commit 5ec0ce7
Showing 3 changed files with 48 additions and 29 deletions.
16 changes: 16 additions & 0 deletions data_lang/htm8-test.sh
@@ -103,6 +103,22 @@ test-wwz() {
  popd
}

find-xml() {
  time find ~ -iname '*.xml' | tee _tmp/xml-files.txt
}

test-other-xml() {
  # problem with &ent1;
  # CDATA support! haha OK
  time cat _tmp/xml-files.txt | $REPO_ROOT/$0 ht8-tool well-formed
}

test-repo-xml() {
  # OK these parse
  time find . -name '_chroot' -a -prune -o -name '*.xml' -a -print \
    | $REPO_ROOT/$0 ht8-tool well-formed
}
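
# Usage sketch (assumed workflow, on the usual run-a-function-by-name
# pattern): build the file list once, then validate it:
#
#   $ data_lang/htm8-test.sh find-xml        # writes _tmp/xml-files.txt
#   $ data_lang/htm8-test.sh test-other-xml  # checks well-formedness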

# OK we have to skip the <script> tag! And <style>
#
# document.location = '#' + params.join('&');
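#
# Why that line breaks without a special case: RawData stops at & and <, and
# the "&'" sequence matches none of the entity rules (&#...; &#x...; &name;),
# so the lexer would emit Tok.Invalid right at the &.  A bare < in JavaScript,
# e.g. "a < b", fails the same way.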
49 changes: 28 additions & 21 deletions lazylex/html.py
@@ -17,7 +17,7 @@
import sys

if sys.version_info.major == 2:
    from typing import List, Tuple
    from typing import List, Tuple, Optional


def log(msg, *args):
@@ -81,7 +81,7 @@ def Print(self, s):

# HTML Tokens
# CommentBegin and ProcessingBegin are "pseudo-tokens", not visible
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData CDataStartTag CDataEndTag Invalid EndOfStream'.split(
TOKENS = 'Decl Comment CommentBegin Processing ProcessingBegin StartTag StartEndTag EndTag DecChar HexChar CharEntity RawData CData Invalid EndOfStream'.split(
)


@@ -174,31 +174,28 @@ def MakeLexer(rules):
    #(r'<(?:script|style) [^>]+>', Tok.CDataStartTag),  # start <a>

    # Notes:
    # - We look for a valid tag name, but we don't validate attributes.
    #   That's done in the tag lexer.
    # - We don't allow leading whitespace
    #
    # TODO: do something different for <script> and <style>. And maybe have a
    # mode to also understand the difference between <pre> <textarea> and say
    # <div>.
    (r'</ (%s) [^>]* >' % _NAME, Tok.EndTag),  # self-closing <br/> comes FIRST
    (r'</ (%s) [^>]* >' % _NAME, Tok.EndTag),
    # self-closing <br/> comes before StartTag
    (r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # self-closing <br/>
    (r'< (%s) [^>]* >' % _NAME, Tok.StartTag),  # start <a>

    (r'&\# [0-9]+ ;', Tok.DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
    (r'& [a-zA-Z]+ ;', Tok.CharEntity),

    # HTML5 allows > in raw data - should we? But < is not allowed.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    #
    # TODO: I think we should disallow it, like XML does. There should be "one
    # way to do it". The distinction between <script> <style> and other tags
    # is stronger this way.
    (r'[^&<]+', Tok.RawData),
    (r'.', Tok.Invalid),  # error!
]

# TODO:
# - I think we should disallow unescaped <, like XML does. There should be
#   "one way to do it", and it should catch bugs.
# - End tags shouldn't allow any other data; it has to be </foo>, not </foo x=y>.

LEXER = MakeLexer(LEXER)
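
# Illustration (hypothetical inputs) of how the rules above tokenize,
# assuming the patterns are compiled in verbose mode as their spacing suggests:
#
#   '&#64;'   -> Tok.DecChar
#   '&#x40;'  -> Tok.HexChar
#   '&amp;'   -> Tok.CharEntity
#   'a > b'   -> Tok.RawData  (bare > is allowed; bare < is not)
#   '<br/>'   -> Tok.StartEndTag  (matched before the StartTag rule)
#   '<b>'     -> Tok.StartTag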


@@ -210,6 +207,9 @@ def __init__(self, s, left_pos=0, right_pos=-1):
        self.right_pos = len(s) if right_pos == -1 else right_pos
        self.cache = {}  # string -> compiled regex pattern object

        # either </script> or </style> - we search until we see that
        self.search_state = None  # type: Optional[str]

    def _Peek(self):
        # type: () -> Tuple[int, int]
        """
@@ -220,6 +220,15 @@ def _Peek(self):

        assert self.pos < self.right_pos, self.pos

        if self.search_state is not None:
            pos = self.s.find(self.search_state, self.pos)
            if pos == -1:
                # unterminated <script> or <style>
                raise LexError(self.s, self.pos)
            self.search_state = None
            # pos is the beginning of the end tag, e.g. </script>
            return Tok.CData, pos
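        # The end tag itself is left in the input, so the next _Peek() call
        # tokenizes it with the ordinary EndTag rule.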

        # Find the first match.
        # Note: frontend/match.py uses _LongestMatch(), which is different!
        # TODO: reconcile them. This lexer should be expressible in re2c.
@@ -243,14 +252,12 @@
                    raise LexError(self.s, self.pos)
                return Tok.Processing, pos + 2  # ?>

            # TODO: we need to enter state so the NEXT call can be CData
            # And then the one after that must be CDataEndTag.
            if tok_id == Tok.CDataStartTag:
                end_tag = '</script>'
                pos = self.s.find(end_tag, self.pos)
                if pos == -1:
                    # unterminated </script>
                    raise LexError(self.s, self.pos)
            if tok_id == Tok.StartTag:
                tag_name = m.group(1)  # captured
                if tag_name == 'script':
                    self.search_state = '</script>'
                elif tag_name == 'style':
                    self.search_state = '</style>'
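                # Only search_state changes here; the *next* _Peek() call
                # takes the CData branch above and scans for the end tag.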

            return tok_id, m.end()
        else:
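The net effect, end to end. A sketch: ValidTokens stands in for whatever
iterator the test below constructs with next(lex), and positions are end
offsets as returned by _Peek():

    h = '<p>hi</p><script>var x = 1 < 2;</script>'
    lex = html.ValidTokens(h)  # assumed name for the (tok_id, end_pos) iterator
    # Expected stream:
    #   StartTag  @ 3    <p>
    #   RawData   @ 5    hi
    #   EndTag    @ 9    </p>
    #   StartTag  @ 17   <script>
    #   CData     @ 31   var x = 1 < 2;   (bare < is fine in CData)
    #   EndTag    @ 40   </script>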
12 changes: 4 additions & 8 deletions lazylex/html_test.py
@@ -179,26 +179,22 @@ def testScriptStyle(self):
        self.assertEqual(12, pos)
        self.assertEqual(Tok.RawData, tok_id)

        return

        # <script>
        tok_id, pos = next(lex)
        self.assertEqual(27, pos)
        self.assertEqual(Tok.CDataStartTag, tok_id)

        return
        self.assertEqual(Tok.StartTag, tok_id)

        # JavaScript code is CData
        tok_id, pos = next(lex)
        self.assertEqual(34, pos)
        log('tok %r', html.TokenName(tok_id))
        self.assertEqual(78, pos)
        self.assertEqual(Tok.CData, tok_id)

        # </script>
        tok_id, pos = next(lex)
        self.assertEqual(27, pos)
        log('tok %r', html.TokenName(tok_id))
        self.assertEqual(Tok.CDataEndTag, tok_id)
        self.assertEqual(87, pos)
        self.assertEqual(Tok.EndTag, tok_id)
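        # Note on the numbers: the lexer yields *end* positions.  _Peek()
        # returns m.end() for regex tokens, and for CData it returns the start
        # of the end tag, so 87 == 78 + len('</script>').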

    def testValid(self):
        Tok = html.Tok
