From 45b53948d32c2aaa315f4b36a38bd7f41f0baa7b Mon Sep 17 00:00:00 2001 From: Joshua Thijssen Date: Mon, 14 Aug 2023 22:33:21 +0200 Subject: [PATCH] up --- src/html5_parser/consume_char_refs.rs | 58 +++++++++++++++------------ 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/src/html5_parser/consume_char_refs.rs b/src/html5_parser/consume_char_refs.rs index c13ffa6a9..571c405a0 100644 --- a/src/html5_parser/consume_char_refs.rs +++ b/src/html5_parser/consume_char_refs.rs @@ -209,45 +209,56 @@ impl<'a> Tokenizer<'a> { fn consume_anything_else(&mut self) -> Result { -/* -"©" -> "(c)" // case 1: simple entity terminated with ; -"©right;" -> "(c)" // case 2: another known entity that takes precedence over the earlier "copy" entity (but happens to be the same returning character) -"©not;" -> "(c)not" // case 3: unknown entity, but © is something, so return (c) plus the remainder until ; -"© " -> "(c)" // case 4: Terminated by the space, so it's ok -"©a" -> "©a" // case 5: Not terminated by a ; (end-of-stream) so "as-is" -"©a " -> "©a " // case 6: Terminated by a space, but not an entity (even though © is there), so "as-is" -"©" -> "©" // case 7: Not terminated by anything (end-of-stream), so "as-is" -*/ + /* + "©" -> "(c)" // case 1: simple entity terminated with ; + "©right;" -> "(c)" // case 2: another known entity that takes precedence over the earlier "copy" entity (but happens to be the same returning character) + "©not;" -> "(c)not" // case 3: unknown entity, but © is something, so return (c) plus the remainder until ; + "© " -> "(c)" // case 4: Terminated by the space, so it's ok + "©a" -> "©a" // case 5: Not terminated by a ; (end-of-stream) so "as-is" + "©a " -> "©a " // case 6: Terminated by a space, but not an entity (even though © is there), so "as-is" + "©" -> "©" // case 7: Not terminated by anything (end-of-stream), so "as-is" + */ - let mut s = String::new(); let mut current_match: Option = None; - let mut captured: String::new(); = None; - let t = String::new(); + let mut captured: String::new(); None; + let mut t = String::new(); + let mut s = String::new(); loop { - c = self.stream.read_char(); + let c = self.stream.read_char(); if c == None { - // End of stream - break; + // End of stream. Consume as-is (case 5 and 7) + self.consume_string(captured); + return Ok(string::new()); } captured.push(c.unwrap()); if [' ', '&', '<'].contains(c.unwrap()) { - // Same as above, but we don't consume the character - break; - } - - // If we find a ;, we also terminate, but we - if c.unwrap() == ';' { if current_match.is_some() { // Replace our entity with the correct char(acters) and add the "rest" (; or anything before) let value = TOKEN_NAMED_CHARS[current_match.unwrap().as_str()].to_string() + s.as_str(); self.consume_string(value); + self.consume(c.unwrap()); return Ok(String::new()); } } + if TOKEN_NAMED_CHARS.contains_key(&captured) { + current_match = Some(captured.clone()); + } + + // // If we find a ;, we also terminate, but we + // if c.unwrap() == ';' { + // if current_match.is_some() { + // // Replace our entity with the correct char(acters) and add the "rest" (; or anything before) + // let value = TOKEN_NAMED_CHARS[current_match.unwrap().as_str()].to_string() + s.as_str(); + // self.consume_string(value); + // // don't consume the ; + // return Ok(String::new()); + // } + // } + if let Some(c) = self.stream.read_char() { // When we encounter a terminating item (such as ;, but others might too), we return if [';', ' ', '&', '<'].contains(&c) { @@ -336,7 +347,6 @@ mod tests { } token_tests! { - /* // Numbers token_0: (" ", "str[\n]") token_1: ("�", "str[�]") @@ -355,7 +365,7 @@ mod tests { token_14: (" ", "str[\t]") token_15: ("", "str[]") // reserved codepoint token_16: ("﷐", "str[]") // reserved codepoint -*/ + // Entities token_100: ("©", "str[©]") token_101: ("©Thing;", "str[©Thing;]") @@ -373,7 +383,6 @@ mod tests { token_113: ("©", "str[©]") token_114: ("©&", "str[©&]") - /* // ChatGPT generated tests token_200: ("©", "str[©]") token_201: ("© ", "str[©]") @@ -434,6 +443,5 @@ mod tests { token_256: ("&unknownchar;", "str[&unknownchar;]") token_257: ("�", "str[�]") token_259: (" ", "str[ ]") - */ } }