diff --git a/doc/htm8.md b/doc/htm8.md
index a21290104..cd8dd6b5e 100644
--- a/doc/htm8.md
+++ b/doc/htm8.md
@@ -3,15 +3,32 @@ in_progress: yes
 default_highlighter: oils-sh
 ---
 
-HTM8 - Efficient HTML with Errors
+HTM8 - An Easy Subset of HTML5, With Errors
 =================================
 
 - Syntax Errors: It's a Subset
-- Efficient
+- Easy
   - Easy to Remember
   - Easy to Implement
   - Runs Efficiently - you don't have to materialize a big DOM tree, which
     causes many allocations
+- Convertible to XML?
+  - without allocations, with a `sed`-like transformation!
+  - low level lexing and matching
+
+<!--
+
+TODO: 99.9% of HTML documents from CommonCrawl should be convertible to XML,
+and then validated by an XML parser
+
+- lxml - this is supposed to be high quality
+
+- Python stdlib uses expat - https://libexpat.github.io/
+
+- Gah it's this huge thing, 8K lines: https://github.com/libexpat/libexpat/blob/master/expat/lib/xmlparse.c
+  - do they have the billion laughs bug?
+
+-->
 
 <div id="toc">
 </div> 
@@ -125,11 +142,18 @@ Conflicts between HTML5 and XML:
 
 ### Converting to XML?
 
-- Always quote all attributes
-- Always quote `>` - are we alloxing this in HX8?
-- Do something with `<script>` and `<style>`
-  - I guess turn them into normal tags, with escaping?
-  - Or maybe just disallow them?
+- Add quotes to unquoted attributes
+  - single and double quotes stay the same?
+- Quote special chars
+  - & BadAmpersand -> `&amp;`
+  - < BadLessThan -> `&lt;`
+  - > BadGreaterThan -> `&gt;`
+- `<script>` and `<style>`
+  - either add `<![CDATA[`
+  - or simply escape their values with `&amp; &lt;`
+- what to do about case-insensitive tags?
+  - maybe you can just normalize them
+  - because we do strict matching
 - Maybe validate any other declarations, like `<!DOCTYPE foo>`
 - Add XML header `<?xml version=>`, remove `<!DOCTYPE html>`
 
@@ -157,6 +181,11 @@ This makes lexing the top-level structure easier.
 
 ### What Doesn't This Cover?
 
+- HTM8 tags must be balanced to convert them to XML
+
+- NUL bytes aren't allowed - currently due to re2c sentinel
+  - Although I think we could have a preprocessing pass convert them to the
+    Unicode replacement char?  I think HTML might mandate that
 - Encodings other than UTF-8.  HTM8 is always UTF-8.
 - Unicode Tag names and attribute names.
   - This is allowed in HTML5 and XML.
diff --git a/lazylex/html.py b/lazylex/html.py
index 76fcd3db9..4014d417e 100755
--- a/lazylex/html.py
+++ b/lazylex/html.py
@@ -283,6 +283,10 @@ def _Peek(self):
 
         if self.search_state is not None and not self.no_special_tags:
             # TODO: case-insensitive search for </SCRIPT> <SCRipt> ?
+            #
+            # Another strategy: enter a mode where we find ONLY the end tag
+            # regex, and any data that's not <, and then check the canonical
+            # tag name for 'script' or 'style'.
             pos = self.s.find(self.search_state, self.pos)
             if pos == -1:
                 # unterminated <script> or <style>
@@ -328,10 +332,11 @@ def _Peek(self):
                     return Tok.CData, pos + 3  # ]]>
 
                 if tok_id == Tok.StartTag:
-                    if self.TagNameEquals('script'):
-                        self.search_state = '</script>'
-                    elif self.TagNameEquals('style'):
-                        self.search_state = '</style>'
+                    # TODO: reduce allocations
+                    if (self.TagNameEquals('script') or
+                            self.TagNameEquals('style')):
+                        # <SCRipt a=b>  -> </SCRipt>
+                        self.search_state = '</' + self._LiteralTagName() + '>'
 
                 return tok_id, m.end()
         else:
@@ -346,12 +351,16 @@ def TagNameEquals(self, expected):
         # directly?
         return expected == self.CanonicalTagName()
 
-    def CanonicalTagName(self):
+    def _LiteralTagName(self):
         # type: () -> None
         assert self.tag_pos_left != -1, self.tag_pos_left
         assert self.tag_pos_right != -1, self.tag_pos_right
 
-        tag_name = self.s[self.tag_pos_left:self.tag_pos_right]
+        return self.s[self.tag_pos_left:self.tag_pos_right]
+
+    def CanonicalTagName(self):
+        # type: () -> None
+        tag_name = self._LiteralTagName()
         # Most tags are already lower case, so avoid allocation with this conditional
         # TODO: this could go in the mycpp runtime?
         if tag_name.islower():
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index 281890b75..b29cb7216 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -229,16 +229,19 @@ def testScriptStyle(self):
         '''
         tokens = Lex(h)
 
-        self.assertEqual(
-            [
-                (Tok.RawData, 12),
-                (Tok.StartTag, 27),  # <script>
-                (Tok.HtmlCData, 78),  # JavaScript code is HTML CData
-                (Tok.EndTag, 87),  # </script>
-                (Tok.RawData, 96),  # \n
-                (Tok.EndOfStream, 96),  # \n
-            ],
-            tokens)
+        expected = [
+            (Tok.RawData, 12),
+            (Tok.StartTag, 27),  # <script>
+            (Tok.HtmlCData, 78),  # JavaScript code is HTML CData
+            (Tok.EndTag, 87),  # </script>
+            (Tok.RawData, 96),  # \n
+            (Tok.EndOfStream, 96),  # \n
+        ]
+        self.assertEqual(expected, tokens)
+
+        # Test case matching
+        tokens = Lex(h.replace('script', 'scrIPT'))
+        self.assertEqual(expected, tokens)
 
     def testScriptStyleXml(self):
         Tok = html.Tok
@@ -372,6 +375,8 @@ def testValid(self):
 
     # not allowed, but 3 > 4 is allowed
     '<a> 3 < 4 </a>',
+    # Not a CDATA tag
+    '<STYLEz><</STYLEz>',
 ]
 
 INVALID_PARSE = [
@@ -413,9 +418,12 @@ def testValid(self):
 
     # capital VOID tag
     '<META><a></a>',
-
     '<script><</script>',
-    '<SCRipt><</script>',
+    # matching
+    '<SCRipt><</SCRipt>',
+    '<SCRIPT><</SCRIPT>',
+    '<STYLE><</STYLE>',
+    #'<SCRipt><</script>',
 
     # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
     # flag to handle this!  Gah I want something faster.