Skip to content

Commit

Permalink
[lazylex/html] Tweak lexing rules to recognize the tag
Browse files Browse the repository at this point in the history
We have to treat <script> <style> differently.  That's still TODO.
  • Loading branch information
Andy C committed Jan 10, 2025
1 parent 57fd999 commit bafbe3b
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 11 deletions.
2 changes: 1 addition & 1 deletion doc/table-object-doc.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ This is part of **maximal** YSH!
- you can escape everything, so you can put another HTM8 doc inside
- and you can put JSON/JSON8 or TSV/TSV8
- although are there whitespace rules?
- all nodes can be liek `<pre>` nodes, preserving whitespace, until
- all nodes can be like `<pre>` nodes, preserving whitespace, until
- you apply another function to it

### HTML5 whitespace rules
Expand Down
33 changes: 23 additions & 10 deletions lazylex/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,9 @@ def MakeLexer(rules):
#
# EntityRef = / '&' dot{* N} ';' /

# Tag name, or attribute name
_NAME = r'[a-zA-Z][a-zA-Z0-9_\-]*' # must start with letter

LEXER = [
# Note non-greedy matches are regular and can be matched in linear time
# with RE2.
Expand Down Expand Up @@ -168,18 +171,30 @@ def MakeLexer(rules):

# NOTE: < is allowed in these.
(r'<! [^>]+ >', Tok.Decl), # <!DOCTYPE html>
(r'<(?:script|style) [^>]+>', Tok.CDataStartTag), # start <a>
(r'</ [^>]+ >', Tok.EndTag), # self-closing <br/> comes FIRST
(r'< [^>]+ />', Tok.StartEndTag), # end </a>
(r'< [^>]+ >', Tok.StartTag), # start <a>
#(r'<(?:script|style) [^>]+>', Tok.CDataStartTag), # start <a>

# Notes:
# - We look for a valid tag name, but we don't validate attributes.
# That's done in the tag lexer.
# - We don't allow leading whitespace
#
# TODO: do something different for <script> and <style>. And maybe have a
# mode to also understand the difference between <pre> <textarea> and say
# <div>.
(r'</ (%s) [^>]* >' % _NAME, Tok.EndTag), # end </a>
(r'< (%s) [^>]* />' % _NAME, Tok.StartEndTag), # self-closing <br/> comes FIRST, before StartTag
(r'< (%s) [^>]* >' % _NAME, Tok.StartTag), # start <a>

(r'&\# [0-9]+ ;', Tok.DecChar),
(r'&\# x[0-9a-fA-F]+ ;', Tok.HexChar),
(r'& [a-zA-Z]+ ;', Tok.CharEntity),

# HTML5 allows > in raw data - should we? It's apparently not allowed in
# XML.
# But < is not allowed.
# HTML5 allows > in raw data - should we? But < is not allowed.
# https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
#
# TODO: I think we should disallow it, like XML does. There should be "one
# way to do it". There is a stronger distinction between <script> <style>
# this way.
(r'[^&<]+', Tok.RawData),
(r'.', Tok.Invalid), # error!
]
Expand Down Expand Up @@ -305,11 +320,9 @@ def ValidTokens(s, left_pos=0, right_pos=-1):
#
# Allow - for td-attrs

# Tag name, or attribue name
_NAME = r'[a-zA-Z][a-zA-Z0-9_\-]*' # must start with letter

_ATTR_VALUE = r'[a-zA-Z0-9_\-]+' # allow hyphens

# TODO: capture tag name above?
_TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)

# To match href="foo"
Expand Down
2 changes: 2 additions & 0 deletions lazylex/html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,8 @@ def testScriptStyle(self):
self.assertEqual(12, pos)
self.assertEqual(Tok.RawData, tok_id)

return

# <script>
tok_id, pos = next(lex)
self.assertEqual(27, pos)
Expand Down

0 comments on commit bafbe3b

Please sign in to comment.