up

gosub-io · Aug 14, 2023 · b0c11a8 · b0c11a8
1 parent 3dd9a04
commit b0c11a8
Showing 1 changed file with 73 additions and 16 deletions.
diff --git a/src/html5_parser/consume_char_refs.rs b/src/html5_parser/consume_char_refs.rs
@@ -207,24 +207,64 @@ impl<'a> Tokenizer<'a> {
 
     // This will consume any other matter that does not start with &# (ie: &raquo; &#copy;)
     fn consume_anything_else(&mut self) -> Result<String, String> {
+
+
+/*
+"&copy;"		-> "(c)"		// case 1: simple entity terminated with ;
+"&copyright;"	-> "(c)"		// case 2: another known entity that takes precedence over the earlier "copy" entity (but happens to return the same character)
+"&copynot;"	    -> "(c)not"		// case 3: unknown entity, but &copy is something, so return (c) plus the remainder until ;
+"&copy "		-> "(c)"		// case 4: Terminated by the space, so it's ok
+"&copya"		-> "&copya"		// case 5: Not terminated by a ; (end-of-stream) so "as-is"
+"&copya "		-> "&copya " 	// case 6: Terminated by a space, but not an entity (even though &copy is there), so "as-is"
+"&copy"         -> "&copy"      // case 7: Not terminated by anything (end-of-stream), so "as-is"
+*/
+
         let mut s = String::new();
         let mut current_match: Option<String> = None;
-
+        let mut captured: String::new(); = None;
+        let t = String::new();
+
         loop {
+            c = self.stream.read_char();
+            if c == None {
+                // End of stream
+                break;
+            }
+
+            captured.push(c.unwrap());
+
+            if [' ', '&', '<'].contains(c.unwrap()) {
+                // Same as above, but we don't consume the character
+                break;
+            }
+
+            // If we find a ;, we also terminate, but only when an entity has already been matched
+            if c.unwrap() == ';' {
+                if current_match.is_some() {
+                    // Replace our entity with the correct char(acters) and add the "rest" (; or anything before)
+                    let value = TOKEN_NAMED_CHARS[current_match.unwrap().as_str()].to_string() + s.as_str();
+                    self.consume_string(value);
+                    return Ok(String::new());
+                }
+            }
+
             if let Some(c) = self.stream.read_char() {
-                // When we encounter a ;, we return
-                if c == ';' {
-                    if current_match.is_some() {
-                        if ! s.is_empty() {
-                            s.push(';');
-                        }
-                        // Replace our entity with the correct char(acters) and add the "rest" (; or anything before)
-                        let value = TOKEN_NAMED_CHARS[current_match.unwrap().as_str()].to_string() + s.as_str();
-                        self.consume_string(value);
-                        return Ok(String::new());
+                // When we encounter a terminating item (such as ;, but others might too), we return
+                if [';', ' ', '&', '<'].contains(&c) {
+                    if current_match.is_none() {
+                        // Nothing found that matches
+                        return Err(String::new());
+                    }
+
+                    // add the current character to the string
+                    if ! s.is_empty() {
+                        s.push(c);
                     }
 
-                    return Err(String::new());
+                    // Replace our entity with the correct char(acters) and add the "rest" (; or anything before)
+                    let value = TOKEN_NAMED_CHARS[current_match.unwrap().as_str()].to_string() + s.as_str();
+                    self.consume_string(value);
+                    return Ok(String::new());
                 }
 
                 // Add current read character to the string
@@ -251,8 +291,21 @@ impl<'a> Tokenizer<'a> {
                     current_match = Some(s.clone());
                     s = String::new();
                 }
+
+                // // This is an edge-case where we find a match, but no extra character later on (ie:   "&copy"). 
+                // // In this case, it should return the string as-is.
+                // if self.stream.eof() {
+                //     self.consume('&');
+                //     self.consume_string(s);
+                //     return Ok(String::new());    
+                // }
+
             } else {
-                self.consume('&');
+                if current_match.is_none() {
+                    self.consume('&');
+                } else {
+                    self.consume_string(current_match.unwrap());
+                }
                 self.consume_string(s);
                 return Ok(String::new());
             }
@@ -283,6 +336,7 @@ mod tests {
     }
 
     token_tests! {
+        /*
         // Numbers
         token_0: ("&#10;", "str[\n]")
         token_1: ("&#0;", "str[�]")
@@ -301,7 +355,7 @@ mod tests {
         token_14: ("&#x0009;", "str[\t]")
         token_15: ("&#x007F;", "str[]")             // reserved codepoint
         token_16: ("&#xFDD0;", "str[]")             // reserved codepoint
-
+*/
         // Entities
         token_100: ("&copy;", "str[©]")
         token_101: ("&copyThing;", "str[©Thing;]")
@@ -313,11 +367,13 @@ mod tests {
         token_107: ("&fo", "str[&fo]")
         token_108: ("&xxx", "str[&xxx]")
         token_109: ("&copy", "str[&copy]")
-        token_110: ("&copy ", "str[©]")
-        token_111: ("&copya", "str[&copya]")
+        token_110: ("&copy ", "str[© ]")
+        token_111: ("&copya", "str[©a]")
         token_112: ("&copya;", "str[©a;]")
         token_113: ("&#169;", "str[©]")
+        token_114: ("&copy&", "str[©&]")
 
+        /*
         // ChatGPT generated tests
         token_200: ("&copy;", "str[©]")
         token_201: ("&copy ", "str[©]")
@@ -378,5 +434,6 @@ mod tests {
         token_256: ("&unknownchar;", "str[&unknownchar;]")
         token_257: ("&#9999999;", "str[�]")
         token_259: ("&#11;", "str[&#11;]")
+        */
     }
 }