Skip to content

Commit

Permalink
up
Browse files Browse the repository at this point in the history
  • Loading branch information
jaytaph committed Aug 14, 2023
1 parent 3dd9a04 commit b0c11a8
Showing 1 changed file with 73 additions and 16 deletions.
89 changes: 73 additions & 16 deletions src/html5_parser/consume_char_refs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -207,24 +207,64 @@ impl<'a> Tokenizer<'a> {

// This will consume any other matter that does not start with &# (e.g. &raquo; or &copy;)
fn consume_anything_else(&mut self) -> Result<String, String> {


/*
"&copy;" -> "(c)" // case 1: simple entity terminated with ;
"&copyright;" -> "(c)" // case 2: another known entity that takes precedence over the earlier "copy" entity (but happens to be the same returning character)
"&copynot;" -> "(c)not" // case 3: unknown entity, but &copy is something, so return (c) plus the remainder until ;
"&copy " -> "(c)" // case 4: Terminated by the space, so it's ok
"&copya" -> "&copya" // case 5: Not terminated by a ; (end-of-stream) so "as-is"
"&copya " -> "&copya " // case 6: Terminated by a space, but not an entity (even though &copy is there), so "as-is"
"&copy" -> "&copy" // case 7: Not terminated by anything (end-of-stream), so "as-is"
*/

let mut s = String::new();
let mut current_match: Option<String> = None;

let mut captured: String::new(); = None;
let t = String::new();

loop {
c = self.stream.read_char();
if c == None {
// End of stream
break;
}

captured.push(c.unwrap());

if [' ', '&', '<'].contains(c.unwrap()) {
// Same as above, but we don't consume the character
break;
}

// If we find a ;, we also terminate, but we only replace the entity when a match was found
if c.unwrap() == ';' {
if current_match.is_some() {
// Replace our entity with the correct char(acters) and add the "rest" (; or anything before)
let value = TOKEN_NAMED_CHARS[current_match.unwrap().as_str()].to_string() + s.as_str();
self.consume_string(value);
return Ok(String::new());
}
}

if let Some(c) = self.stream.read_char() {
// When we encounter a ;, we return
if c == ';' {
if current_match.is_some() {
if ! s.is_empty() {
s.push(';');
}
// Replace our entity with the correct char(acters) and add the "rest" (; or anything before)
let value = TOKEN_NAMED_CHARS[current_match.unwrap().as_str()].to_string() + s.as_str();
self.consume_string(value);
return Ok(String::new());
// When we encounter a terminating item (such as ;, but others might too), we return
if [';', ' ', '&', '<'].contains(&c) {
if current_match.is_none() {
// Nothing found that matches
return Err(String::new());
}

// add the current character to the string
if ! s.is_empty() {
s.push(c);
}

return Err(String::new());
// Replace our entity with the correct char(acters) and add the "rest" (; or anything before)
let value = TOKEN_NAMED_CHARS[current_match.unwrap().as_str()].to_string() + s.as_str();
self.consume_string(value);
return Ok(String::new());
}

// Add current read character to the string
Expand All @@ -251,8 +291,21 @@ impl<'a> Tokenizer<'a> {
current_match = Some(s.clone());
s = String::new();
}

// // This is an edge-case where we find a match, but no extra character later on (ie: "&copy").
// // In this case, it should return the string as-is.
// if self.stream.eof() {
// self.consume('&');
// self.consume_string(s);
// return Ok(String::new());
// }

} else {
self.consume('&');
if current_match.is_none() {
self.consume('&');
} else {
self.consume_string(current_match.unwrap());
}
self.consume_string(s);
return Ok(String::new());
}
Expand Down Expand Up @@ -283,6 +336,7 @@ mod tests {
}

token_tests! {
/*
// Numbers
token_0: ("&#10;", "str[\n]")
token_1: ("&#0;", "str[�]")
Expand All @@ -301,7 +355,7 @@ mod tests {
token_14: ("&#x0009;", "str[\t]")
token_15: ("&#x007F;", "str[]") // reserved codepoint
token_16: ("&#xFDD0;", "str[]") // reserved codepoint

*/
// Entities
token_100: ("&copy;", "str[©]")
token_101: ("&copyThing;", "str[©Thing;]")
Expand All @@ -313,11 +367,13 @@ mod tests {
token_107: ("&fo", "str[&fo]")
token_108: ("&xxx", "str[&xxx]")
token_109: ("&copy", "str[&copy]")
token_110: ("&copy ", "str[©]")
token_111: ("&copya", "str[&copya]")
token_110: ("&copy ", "str[© ]")
token_111: ("&copya", "str[©a]")
token_112: ("&copya;", "str[©a;]")
token_113: ("&#169;", "str[©]")
token_114: ("&copy&", "str[©&]")

/*
// ChatGPT generated tests
token_200: ("&copy;", "str[©]")
token_201: ("&copy ", "str[©]")
Expand Down Expand Up @@ -378,5 +434,6 @@ mod tests {
token_256: ("&unknownchar;", "str[&unknownchar;]")
token_257: ("&#9999999;", "str[�]")
token_259: ("&#11;", "str[&#11;]")
*/
}
}

0 comments on commit b0c11a8

Please sign in to comment.