[lazylex/html] Add test cases

For XML mode too. [doc] Start HTM8 definition
oils-for-unix · Jan 11, 2025 · 71c791e · 71c791e
1 parent 7f7bd39
commit 71c791e
Show file tree

Hide file tree

Showing 5 changed files with 260 additions and 30 deletions.
diff --git a/build/doc.sh b/build/doc.sh
@@ -101,6 +101,7 @@ readonly MARKDOWN_DOCS=(
   qsn
   qtt
   j8-notation
+  htm8
   # Protocol
   pretty-printing
   stream-table-process

diff --git a/doc/htm8.md b/doc/htm8.md
@@ -0,0 +1,164 @@
+---
+in_progress: yes
+default_highlighter: oils-sh
+---
+
+HTM8 - Efficient HTML with Errors
+=================================
+
+- Syntax Errors: It's a Subset
+- Efficient
+  - Easy to Remember
+  - Easy to Implement
+  - Runs Efficiently - you don't have to materialize a big DOM tree, which
+    causes many allocations
+
+<div id="toc">
+</div> 
+
+## Basic Structure
+
+### Text Content
+
+Anything except `&` and `<`.
+
+These must be `&amp;` and `&lt;`.
+
+`>` is allowed, or you can escape it with `&gt;`.
+
+### 3 Kinds of Character Code
+
+1. `&amp;` - named
+1. `&#999;` - decimal
+1. `&#xff;` - hex
+
+### 3 Kinds of Tag
+
+1. Start
+1. End
+1. StartEnd
+
+### 2 Kinds of Attribute
+
+1. Unquoted
+1. Quoted
+
+### 2 Kinds of Comment
+
+1. `<!-- -->`
+1. `<? ?>` (XML processing instruction)
+
+
+## Special Rules, From HTML
+
+### 2 Tags Cause Special Lexing
+
+- `<script> <style>`
+
+Note: we still have CDATA for compatibility.
+
+
+### 16 VOID Tags Change Parsing
+
+- `<source> ...`
+
+### Bonus: XML Mode
+
+- Get rid of the 2 special lexing tags, and 16 VOID tags
+
+Then you can query HTML
+
+
+## Under the Hood
+
+### 3 Layers of Lexing
+
+1. Tag
+1. Attributes within a Tag
+1. Quoted Value for Attributes
+
+## What Do You Use This for?
+
+- Stripping comments
+- Adding TOC
+- Syntax highlighting code
+- Adding links shortcuts
+- ul-table
+
+TODO:
+
+- DOM API on top of it
+  - node.elementsByTag('p')
+  - node.elementsByClassName('left')
+  - node.elementByID('foo')
+  - innerHTML() outerHTML()
+  - tag attrs
+  - low level:
+    - outerLeft, outerRight, innerLeft, innerRight
+- CSS Selectors - `querySelectorAll()`
+- sed-like model
+
+## Algorithms
+
+### Emitting HTM8 as HTML5
+
+Just emit it!  This always works, by design.
+
+### Parsing XML
+
+- Set `NO_SPECIAL_TAGS`
+
+### Converting to XML?
+
+- Always quote all attributes
+- Always quote `>` - are we allowing this in HTM8?
+- Do something with `<script>` and `<style>`
+  - I guess turn them into normal tags, with escaping?
+  - Or maybe just disallow them?
+- Maybe validate any other declarations, like `<!DOCTYPE foo>`
+- Add XML header `<?xml version=>`, remove `<!DOCTYPE html>`
+
+## Related
+
+- [ysh-doc-processing.html](ysh-doc-processing.html)
+- [table-object-doc.html](table-object-doc.html)
+
+## FAQ
+
+### What Doesn't This Cover?
+
+- single-quoted attributes?
+  - We should probably add those, it shouldn't be hard?
+
+- Encodings other than UTF-8.  HTM8 is always UTF-8.
+- Unicode Tag names and attribute names.
+  - This is allowed in HTML5 and XML.
+  - We leave those out for simpler lexing.  Text and attribute values may be unicode.
+
+There are 5 kinds of tags:
+
+- Normal HTML tags
+- RCDATA for `<title> <textarea>`
+- RAWTEXT `<style> <xmp> <iframe>` ?
+
+and we have
+
+- CDATA `<script>`
+  - TODO: we need a test case for `</script>` in a string literal?
+- Foreign `<math> <svg>` - XML rules
+
+## TODO
+
+- `<svg>` and `<math>` are foreign XML content?  Doh
+  - So I can just switch to XML mode in that case
+  - TODO: we need a test corpus for this!
+  - maybe look for wikipedia content
+- can we also just disallow these?  Can you make these into external XML files?
+
+This is one way:
+
+    <object data="math.xml" type="application/mathml+xml"></object>
+    <object data="drawing.xml" type="image/svg+xml"></object>
+
+Then we don't need special parsing?
+
diff --git a/doc/ysh-doc-processing.md b/doc/ysh-doc-processing.md
@@ -134,22 +134,7 @@ Safe HTML subset
 
 If you want to take user HTML, then you first use an HTML5 -> HT8 converter.
 
-## Algorithms
-
-### Emitting HX8 as HTML5
-
-Just emit it!  This always work.
-
-### Converting HX8 to XML
-
-- Always quote all attributes
-- Always quote `>` - are we alloxing this in HX8?
-- Do something with `<script>` and `<style>`
-  - I guess turn them into normal tags, with escaping?
-  - Or maybe just disallow them?
-- Maybe validate any other declarations, like `<!DOCTYPE foo>`
-- Add XML header `<?xml version=>`, remove `<!DOCTYPE html>`
-
 ## Related
 
 - [table-object-doc.html](table-object-doc.html)
+- [htm8.html](htm8.html)
diff --git a/lazylex/html.py b/lazylex/html.py
@@ -448,6 +448,7 @@ def ValidTokenList(s, no_special_tags=False):
 _TAG_RE = re.compile(r'/? \s* (%s)' % _NAME, re.VERBOSE)
 
 # To match href="foo"
+# Note: in HTML5 and XML, single-quoted attributes are also valid
 
 _ATTR_RE = re.compile(
     r'''
@@ -739,13 +740,15 @@ def Validate(contents, flags, counters):
             tag_lexer.Reset(start_pos, end_pos)
             all_attrs = tag_lexer.AllAttrsRaw()
             counters.num_attrs += len(all_attrs)
+            counters.debug_attrs.extend(all_attrs)
 
         elif tok_id == Tok.StartTag:
             counters.num_start_tags += 1
 
             tag_lexer.Reset(start_pos, end_pos)
             all_attrs = tag_lexer.AllAttrsRaw()
             counters.num_attrs += len(all_attrs)
+            counters.debug_attrs.extend(all_attrs)
 
             if flags & BALANCED_TAGS:
                 tag_name = lx.TagName()
@@ -776,6 +779,13 @@ def Validate(contents, flags, counters):
                         start_pos=start_pos)
 
         start_pos = end_pos
+
+    if len(tag_stack) != 0:
+        raise ParseError('Missing closing tags at end of doc: %s' %
+                         ' '.join(tag_stack),
+                         s=contents,
+                         start_pos=start_pos)
+
     counters.num_tokens += len(tokens)
 
 
@@ -788,6 +798,8 @@ def __init__(self):
         self.num_attrs = 0
         self.max_tag_stack = 0
 
+        self.debug_attrs = []
+
 
 def main(argv):
     action = argv[1]

diff --git a/lazylex/html_test.py b/lazylex/html_test.py
@@ -279,20 +279,7 @@ def testStartTag(self):
     def testInvalid(self):
         Tok = html.Tok
 
-        INVALID = [
-            # Should be &amp;
-            '<a>&',
-            '&amp',  # not finished
-            '&#',  # not finished
-            # Hm > is allowed?
-            #'a > b',
-            'a < b',
-            '<!-- unfinished comment',
-            '<? unfinished processing',
-            '</div bad=attr> <a> <b>',
-        ]
-
-        for s in INVALID:
+        for s in INVALID_LEX:
             try:
                 tokens = html.ValidTokenList(s)
             except html.LexError as e:
@@ -301,5 +288,86 @@ def testInvalid(self):
                 self.fail('Expected LexError')
 
 
+INVALID_LEX = [
+    # Should be &amp;
+    '<a>&',
+    '&amp',  # not finished
+    '&#',  # not finished
+    # Hm > is allowed?
+    #'a > b',
+    'a < b',
+    '<!-- unfinished comment',
+    '<? unfinished processing',
+    '</div bad=attr> <a> <b>',
+    # TODO: we should match up to > or />
+    #'<a foo=bar !></a>',  # bad attr
+    #'<a zz></a>',  # this is not invalid?
+
+    # TODO: should be escaped, invalid in XML
+    #'<a href="&"></a>',
+    #'<a href=">"></a>',
+]
+
+INVALID_PARSE = [
+    '<a></b>',
+    '<a>',  # missing closing tag
+    '<meta></meta>',  # <meta> is a void tag, so a closing </meta> is an error
+]
+
+VALID_PARSE = [
+    '<b><a href="foo">link</a></b>',
+    '<meta><a></a>',
+    # TODO: capitalization should be allowed
+    #'<META><a></a>',
+
+    # TODO:
+    #'<a foo="&"></a>',  # bad attr
+    #'<a foo=bar !></a>',  # bad attr
+    #'<a zz></a>',  # bad attr
+
+    # TODO: Test <svg> and <math> ?
+]
+
+VALID_XML = [
+    '<meta></meta>',
+]
+
+
+class ValidateTest(unittest.TestCase):
+
+    def testInvalid(self):
+        counters = html.Counters()
+        for s in INVALID_LEX:
+            try:
+                html.Validate(s, html.BALANCED_TAGS, counters)
+            except html.LexError as e:
+                print(e)
+            else:
+                self.fail('Expected LexError %r' % s)
+
+        for s in INVALID_PARSE:
+            try:
+                html.Validate(s, html.BALANCED_TAGS, counters)
+            except html.ParseError as e:
+                print(e)
+            else:
+                self.fail('Expected ParseError')
+
+    def testValid(self):
+        counters = html.Counters()
+        for s in VALID_PARSE:
+            html.Validate(s, html.BALANCED_TAGS, counters)
+            print('HTML5 %r' % s)
+            print('HTML5 attrs %r' % counters.debug_attrs)
+
+    def testValidXml(self):
+        counters = html.Counters()
+        for s in VALID_XML:
+            html.Validate(s, html.BALANCED_TAGS | html.NO_SPECIAL_TAGS,
+                          counters)
+            print('XML %r' % s)
+            print('XML attrs %r' % counters.debug_attrs)
+
+
 if __name__ == '__main__':
     unittest.main()