diff --git a/src/html5/parser.rs b/src/html5/parser.rs index ddc835f2f..511e3f198 100644 --- a/src/html5/parser.rs +++ b/src/html5/parser.rs @@ -2116,6 +2116,10 @@ impl<'chars> Html5Parser<'chars> { /// Handle insertion mode "in_body" fn handle_in_body(&mut self) { match &self.current_token.clone() { + Token::Text(value) if self.current_token.is_mixed_null() => { + let tokens = self.split_mixed_token_null(value); + self.tokenizer.insert_tokens_at_queue_start(tokens); + } Token::Text(..) if self.current_token.is_null() => { self.parse_error("null character not allowed in in body insertion mode"); // ignore token @@ -4021,6 +4025,33 @@ impl<'chars> Html5Parser<'chars> { tokens } + + /// This will split tokens into \0 groups and non-\0 groups. + /// @todo: refactor this into split_mixed_token as well, but add a collection of groups callables + fn split_mixed_token_null(&self, text: &str) -> Vec { + let mut tokens = vec![]; + let mut last_group = 'x'; + + let mut found = String::new(); + + for ch in text.chars() { + let group = if ch == '\0' { '0' } else { 'r' }; + + if last_group != group && !found.is_empty() { + tokens.push(Token::Text(found.clone())); + found.clear(); + } + + found.push(ch); + last_group = group; + } + + if !found.is_empty() { + tokens.push(Token::Text(found.clone())); + } + + tokens + } } #[cfg(test)] diff --git a/src/html5/tokenizer/token.rs b/src/html5/tokenizer/token.rs index 377551d75..f286ceb1b 100644 --- a/src/html5/tokenizer/token.rs +++ b/src/html5/tokenizer/token.rs @@ -31,6 +31,7 @@ pub enum Token { } impl Token { + /// Returns true when there is a mixture of white and non-white and \0 characters in the token pub(crate) fn is_mixed(&self) -> bool { // Check if there are white characters AND non-white characters in the token if let Token::Text(value) = self { @@ -53,6 +54,16 @@ impl Token { false } } + + /// Returns true when there is a mixture of \0 and non-\0 characters in the token + pub(crate) fn is_mixed_null(&self) -> bool { + // Check if there are white characters AND non-white characters in the token + if let Token::Text(value) = self { + value.chars().any(|ch| ch == '\0') && value.chars().any(|ch| ch != '\0') + } else { + false + } + } } impl Token {