diff --git a/doc/htm8.md b/doc/htm8.md
index a21290104..cd8dd6b5e 100644
--- a/doc/htm8.md
+++ b/doc/htm8.md
@@ -3,15 +3,32 @@ in_progress: yes
default_highlighter: oils-sh
---
-HTM8 - Efficient HTML with Errors
+HTM8 - An Easy Subset of HTML5, With Errors
=================================
- Syntax Errors: It's a Subset
-- Efficient
+- Easy
- Easy to Remember
- Easy to Implement
- Runs Efficiently - you don't have to materialize a big DOM tree, which
causes many allocations
+- Convertible to XML?
+ - without allocations, with a `sed`-like transformation!
+ - low level lexing and matching
+
+
@@ -125,11 +142,18 @@ Conflicts between HTML5 and XML:
### Converting to XML?
-- Always quote all attributes
-- Always quote `>` - are we alloxing this in HX8?
-- Do something with ` '
- elif self.TagNameEquals('style'):
- self.search_state = ''
+ # TODO: reduce allocations
+ if (self.TagNameEquals('script') or
+ self.TagNameEquals('style')):
+ #
+ self.search_state = '' + self._LiteralTagName() + '>'
return tok_id, m.end()
else:
@@ -346,12 +351,16 @@ def TagNameEquals(self, expected):
# directly?
return expected == self.CanonicalTagName()
- def CanonicalTagName(self):
+ def _LiteralTagName(self):
# type: () -> None
assert self.tag_pos_left != -1, self.tag_pos_left
assert self.tag_pos_right != -1, self.tag_pos_right
- tag_name = self.s[self.tag_pos_left:self.tag_pos_right]
+ return self.s[self.tag_pos_left:self.tag_pos_right]
+
+ def CanonicalTagName(self):
+ # type: () -> str
+ tag_name = self._LiteralTagName()
# Most tags are already lower case, so avoid allocation with this conditional
# TODO: this could go in the mycpp runtime?
if tag_name.islower():
diff --git a/lazylex/html_test.py b/lazylex/html_test.py
index 281890b75..b29cb7216 100755
--- a/lazylex/html_test.py
+++ b/lazylex/html_test.py
@@ -229,16 +229,19 @@ def testScriptStyle(self):
'''
tokens = Lex(h)
- self.assertEqual(
- [
- (Tok.RawData, 12),
- (Tok.StartTag, 27), #
- (Tok.RawData, 96), # \n
- (Tok.EndOfStream, 96), # \n
- ],
- tokens)
+ expected = [
+ (Tok.RawData, 12),
+ (Tok.StartTag, 27), #
+ (Tok.RawData, 96), # \n
+ (Tok.EndOfStream, 96), # \n
+ ]
+ self.assertEqual(expected, tokens)
+
+ # Test case-insensitive tag name matching
+ tokens = Lex(h.replace('script', 'scrIPT'))
+ self.assertEqual(expected, tokens)
def testScriptStyleXml(self):
Tok = html.Tok
@@ -372,6 +375,8 @@ def testValid(self):
# not allowed, but 3 > 4 is allowed
' 3 < 4 ',
+ # Not a CDATA tag
+ '<',
]
INVALID_PARSE = [
@@ -413,9 +418,12 @@ def testValid(self):
# capital VOID tag
'',
-
'',
- '',
+ # matching
+ '',
+ '',
+ '',
+ #'',
# Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I
# flag to handle this! Gah I want something faster.