[lazylex/html] Disallow any text after the end tag name.

Running all the tests again. This means we need : in tag names, for XML. Also test out the rule of disallowing > in regular data. That unfortunately showed 3 errors in my early blog posts, and also in the .wwz files. Not sure if I will disallow this.
oils-for-unix · Jan 10, 2025 · 089f900 · 089f900
1 parent 5ec0ce7
commit 089f900
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 10 deletions.
diff --git a/data_lang/htm8-test.sh b/data_lang/htm8-test.sh
@@ -2,6 +2,22 @@
 #
 # Usage:
 #   data_lang/htm8-test.sh
+#
+# TODO:
+# - Rename to DML8?  Because it can handle XML
+# - CDATA in XML, which is not a script
+#
+# Operations / Levels:
+#
+# - Lexing
+#   - lex-tags
+#   - lex-attrs - validate all Start tags, all StartEnd tags
+#   - lex-quoted-values - unescaping, etc.
+#     - are there invalid entities?
+# - Parsing
+#   - well-formed / tag balance check
+# - Schema
+#   - not sure if we check the HTML schema or not - it might be too restrictive
 
 REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
 

diff --git a/lazylex/html.py b/lazylex/html.py
@@ -139,7 +139,8 @@ def MakeLexer(rules):
 # EntityRef = / '&' dot{* N} ';' /
 
 # Tag name, or attribute name
-_NAME = r'[a-zA-Z][a-zA-Z0-9_\-]*'  # must start with letter
+# colon is used in XML
+_NAME = r'[a-zA-Z][a-zA-Z0-9:_\-]*'  # must start with letter
 
 LEXER = [
     # Note non-greedy matches are regular and can be matched in linear time
@@ -165,19 +166,18 @@ def MakeLexer(rules):
     #
     #   https://developer.mozilla.org/en-US/docs/Web/API/ProcessingInstruction
     #
-    # We don't want to confuse them with start tags, so we recognize them at
-    # the top level.
+    # They are used for the XML declaration:
+    # <?xml version="1.0" encoding="UTF-8"?>
     (r'<\?', Tok.ProcessingBegin),
 
-    # NOTE: < is allowed in these.
+    # NOTE: < is allowed in these?
     (r'<! [^>]+ >', Tok.Decl),  # <!DOCTYPE html>
-    #(r'<(?:script|style) [^>]+>', Tok.CDataStartTag),  # start <a>
 
     # Notes:
     # - We look for a valid tag name, but we don't validate attributes.
     #   That's done in the tag lexer.
     # - We don't allow leading whitespace
-    (r'</ (%s) [^>]* >' % _NAME, Tok.EndTag),
+    (r'</ (%s) >' % _NAME, Tok.EndTag),
     # self-closing <br/>  comes before StartTag
     (r'<  (%s) [^>]* />' % _NAME, Tok.StartEndTag),  # end </a>
     (r'<  (%s) [^>]* >' % _NAME, Tok.StartTag),  # start <a>
@@ -187,14 +187,16 @@ def MakeLexer(rules):
 
     # HTML5 allows > in raw data - should we?  But < is not allowed.
     # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
+    #
+    # - My early blog has THREE errors when disallowing >
+    # - So do some .wwz files
     (r'[^&<]+', Tok.RawData),
     (r'.', Tok.Invalid),  # error!
 ]
 
 # TODO:
-# - I think we should unescaped <, like XML does.  There should be "one way to
-#   do it", and it should catch bugs
-# - end tags shouldn't allow any other data, it has to be </foo>, not </foo x=y>
+# - should we disallow unescaped >, like XML does?  There should be "one way to
+#   do it", and it could catch escaping bugs
 
 LEXER = MakeLexer(LEXER)
 

diff --git a/lazylex/html_test.py b/lazylex/html_test.py
@@ -244,12 +244,13 @@ def testInvalid(self):
             'a < b',
             '<!-- unfinished comment',
             '<? unfinished processing',
+            '</div bad=attr> <a> <b>',
         ]
 
         for s in INVALID:
             lex = html.ValidTokens(s)
             try:
-                for i in xrange(10):
+                for i in xrange(5):
                     tok_id, pos = next(lex)
             except html.LexError as e:
                 print(e)