[lazylex/html] Start conversion to XML

Can correct & to &amp;. This will help refine the TagLexer and AttrValueLexer APIs as well, although it's looking like significant work too.
oils-for-unix · Jan 13, 2025 · 03e5cd0 · 03e5cd0
1 parent 4b795d5
commit 03e5cd0
Show file tree

Hide file tree

Showing 2 changed files with 62 additions and 10 deletions.
diff --git a/lazylex/html.py b/lazylex/html.py
@@ -919,18 +919,68 @@ def Validate(contents, flags, counters):
     counters.num_tokens += len(tokens)
 
 
-def ToXml(h):
+def ToXml(htm8_str):
     # type: (str) -> str
 
-    # TODO: 
+    # TODO:
     # 1. Lex it
     # 2. < & > must be escaped
     #    a. in raw data
     #    b. in quoted strings
     # 3. <script> turned into CDATA
     # 4. void tags turned into self-closing tags
     # 5. case-sensitive tag matching - not sure about this
-    return h
+
+    tag_lexer = TagLexer(htm8_str)
+    val_lexer = AttrValueLexer(htm8_str)
+
+    f = StringIO()
+    out = Output(htm8_str, f)
+
+    lx = Lexer(htm8_str)
+
+    pos = 0
+    while True:
+        tok_id, end_pos = lx.Read()
+
+        if tok_id == Tok.Invalid:
+            raise LexError(htm8_str, pos)
+        if tok_id == Tok.EndOfStream:
+            break
+
+        if tok_id in (Tok.RawData, Tok.CharEntity, Tok.HexChar, Tok.DecChar):
+            out.PrintUntil(end_pos)
+        elif tok_id in (Tok.StartTag, Tok.StartEndTag):
+            tag_lexer.Reset(pos, end_pos)
+            # TODO: reduce allocations here
+            all_attrs = tag_lexer.AllAttrsRawSlice()
+            for name, val_start, val_end in all_attrs:
+                val_lexer.Reset(val_start, val_end)
+                # TODO: get the kind of string
+                #
+                # Quoted:   we need to replace & with &amp; and < with &lt;
+                #           note > is not allowed
+                # Unquoted: right now, we can just surround with double quotes
+                #           because we don't allow any bad chars
+                # Empty   : add "", so empty= becomes =""
+                # Missing : add ="", so missing becomes missing=""
+
+            tag_name = lx.CanonicalTagName()
+            if tok_id == Tok.StartTag and tag_name in VOID_ELEMENTS:
+                # TODO: instead of closing >, print />
+                pass
+
+        elif tok_id == Tok.BadAmpersand:
+            #out.SkipTo(pos)
+            out.Print('&amp;')
+            out.SkipTo(end_pos)
+        else:
+            out.PrintUntil(end_pos)
+
+        pos = end_pos
+
+    out.PrintTheRest()
+    return f.getvalue()
 
 
 class Counters(object):

diff --git a/lazylex/html_test.py b/lazylex/html_test.py
@@ -370,14 +370,18 @@ def testValid(self):
     '<STYLEz><</STYLEz>',
 ]
 
+SKIP = 0
+UNCHANGED = 1
+
 VALID_LEX = [
     # TODO: convert these to XML
-    ('<foo></foo>', ''),
+    ('<foo></foo>', UNCHANGED),
     ('<foo x=y></foo>', ''),
+    #('<foo x="&"></foo>', '<foo x="&amp;"></foo>'),
     ('<foo x="&"></foo>', ''),
 
     # Allowed with BadAmpersand
-    ('<p> x & y </p>', ''),
+    ('<p> x & y </p>', '<p> x &amp; y </p>'),
 ]
 
 INVALID_PARSE = [
@@ -386,20 +390,16 @@ def testValid(self):
     '<meta></meta>',  # this is a self-closing tag
 ]
 
-SKIP = 0
-UNCHANGED = 1
-
 VALID_PARSE = [
     ('<!DOCTYPE html>\n', ''),
     ('<!DOCTYPE>', ''),
 
     # empty strings
     ('<p x=""></p>', UNCHANGED),
     ("<p x=''></p>", UNCHANGED),
-
     ('<self-closing a="b" />', UNCHANGED),
 
-    # We could also normalize CDATA? 
+    # We could also normalize CDATA?
     # Note that CDATA has an escaping problem: you need to handle it ]]> with
     # concatenation.  It just "pushes the problem around".
     # So I think it's better to use ONE kind of escaping, which is &lt;
@@ -410,7 +410,9 @@ def testValid(self):
     # allowed, but 3 > 4 is not allowed
     ('<p x="3 < 4"></p>', ''),
     ('<b><a href="foo">link</a></b>', ''),
+    #('<meta><a></a>', '<meta/><a></a>'),
     ('<meta><a></a>', ''),
+
     # no attribute
     ('<button disabled></button>', ''),
     ('<button disabled=></button>', ''),