Skip to content

Commit

Permalink
Added extra splitter for \0 only
Browse files Browse the repository at this point in the history
  • Loading branch information
jaytaph committed Nov 8, 2023
1 parent 9e19554 commit d90da26
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 0 deletions.
31 changes: 31 additions & 0 deletions src/html5/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2116,6 +2116,10 @@ impl<'chars> Html5Parser<'chars> {
/// Handle insertion mode "in_body"
fn handle_in_body(&mut self) {
match &self.current_token.clone() {
Token::Text(value) if self.current_token.is_mixed_null() => {
let tokens = self.split_mixed_token_null(value);
self.tokenizer.insert_tokens_at_queue_start(tokens);
}
Token::Text(..) if self.current_token.is_null() => {
self.parse_error("null character not allowed in in body insertion mode");
// ignore token
Expand Down Expand Up @@ -4021,6 +4025,33 @@ impl<'chars> Html5Parser<'chars> {

tokens
}

/// Splits `text` into consecutive runs of `\0` characters and runs of non-`\0` characters.
///
/// Each maximal run becomes its own `Token::Text`, in the order it appears in `text`.
/// An empty `text` yields an empty vector.
///
/// @todo: refactor this into split_mixed_token as well, but add a collection of groups callables
fn split_mixed_token_null(&self, text: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    let mut run = String::new();
    // Whether the characters currently collected in `run` are `\0`. The initial value is
    // irrelevant: it is only consulted once `run` is non-empty.
    let mut run_is_null = false;

    for ch in text.chars() {
        let is_null = ch == '\0';

        // The kind of character changed: flush the finished run. `mem::take` moves the
        // buffer out and leaves an empty one behind, avoiding a clone per run.
        if is_null != run_is_null && !run.is_empty() {
            tokens.push(Token::Text(std::mem::take(&mut run)));
        }

        run.push(ch);
        run_is_null = is_null;
    }

    // Flush the trailing run; the buffer is no longer needed, so move it.
    if !run.is_empty() {
        tokens.push(Token::Text(run));
    }

    tokens
}
}

#[cfg(test)]
Expand Down
11 changes: 11 additions & 0 deletions src/html5/tokenizer/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ pub enum Token {
}

impl Token {
/// Returns true when there is a mixture of white and non-white and \0 characters in the token
pub(crate) fn is_mixed(&self) -> bool {
// Check if there are white characters AND non-white characters in the token
if let Token::Text(value) = self {
Expand All @@ -53,6 +54,16 @@ impl Token {
false
}
}

/// Returns true when this is a text token holding at least one `\0` character
/// together with at least one character that is not `\0`.
///
/// Any token that is not `Token::Text` yields false.
pub(crate) fn is_mixed_null(&self) -> bool {
    match self {
        // Both kinds of character must be present for the text to count as mixed
        Token::Text(value) => value.contains('\0') && value.chars().any(|ch| ch != '\0'),
        _ => false,
    }
}
}

impl Token {
Expand Down

0 comments on commit d90da26

Please sign in to comment.