diff --git a/doc/htm8.md b/doc/htm8.md index a21290104..cd8dd6b5e 100644 --- a/doc/htm8.md +++ b/doc/htm8.md @@ -3,15 +3,32 @@ in_progress: yes default_highlighter: oils-sh --- -HTM8 - Efficient HTML with Errors +HTM8 - An Easy Subset of HTML5, With Errors ================================= - Syntax Errors: It's a Subset -- Efficient +- Easy - Easy to Remember - Easy to Implement - Runs Efficiently - you don't have to materialize a big DOM tree, which causes many allocations +- Convertible to XML? + - without allocations, with a `sed`-like transformation! + - low-level lexing and matching + +
@@ -125,11 +142,18 @@ Conflicts between HTML5 and XML: ### Converting to XML? -- Always quote all attributes -- Always quote `>` - are we alloxing this in HX8? -- Do something with ` ' - elif self.TagNameEquals('style'): - self.search_state = '' + # TODO: reduce allocations + if (self.TagNameEquals('script') or + self.TagNameEquals('style')): + # + self.search_state = '' return tok_id, m.end() else: @@ -346,12 +351,16 @@ def TagNameEquals(self, expected): # directly? return expected == self.CanonicalTagName() - def CanonicalTagName(self): + def _LiteralTagName(self): # type: () -> None assert self.tag_pos_left != -1, self.tag_pos_left assert self.tag_pos_right != -1, self.tag_pos_right - tag_name = self.s[self.tag_pos_left:self.tag_pos_right] + return self.s[self.tag_pos_left:self.tag_pos_right] + + def CanonicalTagName(self): + # type: () -> None + tag_name = self._LiteralTagName() # Most tags are already lower case, so avoid allocation with this conditional # TODO: this could go in the mycpp runtime? 
if tag_name.islower(): diff --git a/lazylex/html_test.py b/lazylex/html_test.py index 281890b75..b29cb7216 100755 --- a/lazylex/html_test.py +++ b/lazylex/html_test.py @@ -229,16 +229,19 @@ def testScriptStyle(self): ''' tokens = Lex(h) - self.assertEqual( - [ - (Tok.RawData, 12), - (Tok.StartTag, 27), # - (Tok.RawData, 96), # \n - (Tok.EndOfStream, 96), # \n - ], - tokens) + expected = [ + (Tok.RawData, 12), + (Tok.StartTag, 27), # + (Tok.RawData, 96), # \n + (Tok.EndOfStream, 96), # \n + ] + self.assertEqual(expected, tokens) + + # Test case matching + tokens = Lex(h.replace('script', 'scrIPT')) + self.assertEqual(expected, tokens) def testScriptStyleXml(self): Tok = html.Tok @@ -372,6 +375,8 @@ def testValid(self): # not allowed, but 3 > 4 is allowed ' 3 < 4 ', + # Not a CDATA tag + '<', ] INVALID_PARSE = [ @@ -413,9 +418,12 @@ def testValid(self): # capital VOID tag '', - '', - '', + # matching + '', + '', + '', + #'', # Note: Python HTMLParser.py does DYNAMIC compilation of regex with re.I # flag to handle this! Gah I want something faster.