diff --git a/src/bin/parser-test.rs b/src/bin/parser-test.rs
index 45ccb7f8e..9818e146a 100755
--- a/src/bin/parser-test.rs
+++ b/src/bin/parser-test.rs
@@ -29,7 +29,7 @@ fn main() {
         tests_failed: Vec::new(),
     };

-    let filenames = Some(&["html5test-com.dat"][..]);
+    let filenames = Some(&["tests15.dat"][..]);
     let fixtures = read_fixtures(filenames).expect("fixtures");

     for fixture_file in fixtures {
@@ -41,7 +41,7 @@ fn main() {

         let mut test_idx = 1;
         for test in fixture_file.tests {
-            if test_idx == 14 {
+            if test_idx == 10 {
                 run_test(test_idx, test, &mut results);
             }
             test_idx += 1;
diff --git a/src/bytes.rs b/src/bytes.rs
index e63737def..3e9359ec4 100644
--- a/src/bytes.rs
+++ b/src/bytes.rs
@@ -114,15 +114,6 @@ pub struct CharIterator {
     pub has_read_eof: bool, // True when we just read an EOF
 }

-pub enum SeekMode {
-    /// Seek from the start of the stream
-    SeekSet,
-    /// Seek from the current stream position
-    SeekCur,
-    /// Seek (backwards) from the end of the stream
-    SeekEnd,
-}
-
 impl Default for CharIterator {
     fn default() -> Self {
         Self::new()
@@ -170,76 +161,30 @@ impl CharIterator {
         self.position.col = 1;
     }

-    /// Seek explicit offset in the stream (based on chars)
-    pub fn seek(&mut self, mode: SeekMode, offset: isize) {
-        let abs_offset = match mode {
-            SeekMode::SeekSet => {
-                if offset.is_negative() {
-                    0
-                } else {
-                    offset as usize
-                }
-            }
-            SeekMode::SeekCur => {
-                if offset.is_negative() {
-                    self.position.offset - offset.unsigned_abs()
-                } else {
-                    self.position.offset + offset as usize
-                }
-            }
-            SeekMode::SeekEnd => {
-                // Both -5 and 5 on seek-end do the same thing
-                if offset.abs() > self.length as isize {
-                    0
-                } else {
-                    self.length - offset.unsigned_abs()
-                }
-            }
-        };
+    /// Skip offset characters in the stream (based on chars)
+    pub fn skip(&mut self, offset: usize) {
+        let mut skip_len = offset;
+        if self.position.offset + offset > self.length {
+            skip_len = self.length - self.position.offset;
+        }

-        self.position = self.generate_position(abs_offset);
+        for _ in 0..skip_len {
+            self.read_char();
+        }
     }

     /// Returns the previous position based on the current position
     pub fn get_previous_position(&mut self) -> Position {
-        // if we are at the begining or the end of the stream, we just return the current position
+        // if we are at the beginning or the end of the stream, we just return the current position
         if self.position.offset == 0 || self.has_read_eof {
             return self.position;
         }

-        self.generate_position(self.position.offset - 1)
-    }
+        self.unread();
+        let pos = self.position;
+        self.skip(1);

-    /// Generate a new position structure for given offset
-    fn generate_position(&mut self, abs_offset: usize) -> Position {
-        let mut abs_offset = abs_offset;
-
-        // Cap to length if we read past the end of the stream
-        if abs_offset > self.length + 1 {
-            abs_offset = self.length;
-            self.has_read_eof = true;
-        }
-
-        // Detect lines (if needed)
-        self.read_line_endings_until(abs_offset);
-
-        let mut last_line: usize = 0;
-        let mut last_offset = self.line_offsets[last_line];
-        for i in 0..self.line_offsets.len() {
-            if self.line_offsets[i] > abs_offset {
-                break;
-            }
-
-            last_line = i;
-            last_offset = self.line_offsets[last_line];
-        }
-
-        // Set position values
-        Position {
-            offset: abs_offset,
-            line: last_line + 1,
-            col: abs_offset - last_offset + 1,
-        }
+        pos
     }

     /// Returns the current offset in the stream
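Note: `seek` only had forward-moving callers left, so it is replaced by `skip`, which clamps at the end of the stream instead of computing absolute offsets. A minimal usage sketch, assuming the `CharIterator`/`Encoding` names used by the crate's own tests (the `gosub_engine::bytes` module path is illustrative):

```rust
use gosub_engine::bytes::{CharIterator, Encoding};

fn main() {
    let mut chars = CharIterator::new();
    chars.read_from_str("ab\ncd", Some(Encoding::UTF8));

    // skip() consumes characters through read_char(), so the line/col
    // bookkeeping stays correct: after 'a', 'b' and '\n' we sit at
    // line 2, col 1.
    chars.skip(3);
    assert_eq!((chars.position.line, chars.position.col), (2, 1));

    // Skipping past the end of the stream is clamped to its length.
    chars.skip(100);
    assert_eq!(chars.position.offset, 5);
}
```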
@@ -361,15 +306,26 @@ impl CharIterator {
         // If we still can move forward in the stream, move forwards
         if self.position.offset < self.length {
             let c = self.buffer[self.position.offset];
-            self.seek(SeekMode::SeekCur, 1);
+            if c == Ch('\n') {
+                // Store line offset for the given line
+                if self.line_offsets.len() > self.position.line {
+                    self.line_offsets[self.position.line] = self.position.offset;
+                } else {
+                    self.line_offsets.push(self.position.offset);
+                }
+                // And continue position on the next line
+                self.position.line += 1;
+                self.position.col = 1;
+            } else {
+                self.position.col += 1;
+            }
+            self.position.offset += 1;
             return c;
         }

         // otherwise, we have reached the end of the stream
         self.has_read_eof = true;
-        self.seek(SeekMode::SeekEnd, 0);
-
         Eof
     }

@@ -392,7 +348,14 @@ impl CharIterator {

         // If we can track back from the offset, we can do so
         if self.position.offset > 0 {
-            self.seek(SeekMode::SeekCur, -1);
+            self.position.offset -= 1;
+
+            if self.position.col == 1 {
+                self.position.line -= 1;
+                self.position.col = self.line_offsets[self.position.line];
+            } else {
+                self.position.col -= 1;
+            }
         }
     }

@@ -414,26 +377,6 @@ impl CharIterator {

         self.buffer[self.position.offset + offset]
     }
-
-    /// Populates the line endings by reading the stream until the given length.
-    fn read_line_endings_until(&mut self, abs_offset: usize) {
-        let mut last_offset = *self.line_offsets.last().unwrap();
-
-        while last_offset <= abs_offset {
-            if last_offset >= self.length {
-                self.line_offsets.push(last_offset + 1);
-                break;
-            }
-
-            // Check the next char to see if it's a '\n'
-            let c = self.buffer[last_offset];
-            if c == Ch('\n') {
-                self.line_offsets.push(last_offset + 1);
-            }
-
-            last_offset += 1;
-        }
-    }
 }

 #[cfg(test)]
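With `generate_position` gone, `get_previous_position` becomes an `unread`/`skip(1)` round trip: step back one char, capture the position, step forward again. A sketch of the observable behaviour, under the same illustrative import assumption as above:

```rust
use gosub_engine::bytes::{CharIterator, Encoding, Position};

fn main() {
    let mut chars = CharIterator::new();
    chars.read_from_str("abc", Some(Encoding::UTF8));
    chars.read_char(); // 'a'
    chars.read_char(); // 'b'

    // Position of the previously read char ('b'); the cursor is unchanged.
    let prev = chars.get_previous_position();
    assert_eq!(prev, Position { offset: 1, line: 1, col: 2 });
    assert_eq!(chars.position.offset, 2);
}
```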
@@ -515,238 +458,6 @@ mod test {
         assert!(!chars.is_certain_encoding());
     }

-    #[test]
-    fn test_offsets() {
-        let mut chars = CharIterator::new();
-        chars.read_from_str("abc", Some(Encoding::UTF8));
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 0,
-                line: 1,
-                col: 1
-            }
-        );
-        assert_eq!('a', chars.read_char().into());
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 1,
-                line: 1,
-                col: 2
-            }
-        );
-        assert_eq!('b', chars.read_char().into());
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 2,
-                line: 1,
-                col: 3
-            }
-        );
-        assert_eq!('c', chars.read_char().into());
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 3,
-                line: 1,
-                col: 4
-            }
-        );
-        assert!(matches!(chars.read_char(), Eof));
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 3,
-                line: 1,
-                col: 4
-            }
-        );
-        assert!(matches!(chars.read_char(), Eof));
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 3,
-                line: 1,
-                col: 4
-            }
-        );
-
-        let mut chars = CharIterator::new();
-        chars.read_from_str(
-            "abc\ndefg\n\nhi\njk\nlmno\n\n\npqrst\nu\nv\nw\n\nxy\nz",
-            Some(Encoding::UTF8),
-        );
-        assert_eq!(chars.length, 40);
-
-        chars.seek(SeekMode::SeekSet, 0);
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 0,
-                line: 1,
-                col: 1
-            }
-        );
-        let c = chars.read_char();
-        assert_eq!(c, Ch('a'));
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 1,
-                line: 1,
-                col: 2
-            }
-        );
-
-        chars.seek(SeekMode::SeekSet, 7);
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 7,
-                line: 2,
-                col: 4
-            }
-        );
-        assert_eq!(chars.chars_left(), 33);
-
-        let c = chars.read_char();
-        assert_eq!(c, Ch('g'));
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 8,
-                line: 2,
-                col: 5
-            }
-        );
-
-        let c = chars.read_char();
-        assert_eq!(c, Ch('\n'));
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 9,
-                line: 3,
-                col: 1
-            }
-        );
-
-        let c = chars.read_char();
-        assert_eq!(c, Ch('\n'));
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 10,
-                line: 4,
-                col: 1
-            }
-        );
-
-        let c = chars.read_char();
-        assert_eq!(c, Ch('h'));
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 11,
-                line: 4,
-                col: 2
-            }
-        );
-        assert_eq!(chars.chars_left(), 29);
-
-        chars.reset();
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 0,
-                line: 1,
-                col: 1
-            }
-        );
-        assert_eq!(chars.chars_left(), 40);
-
-        chars.seek(SeekMode::SeekSet, 100);
-        assert_eq!(
-            chars.position,
-            Position {
-                offset: 40,
-                line: 15,
-                col: 2
-            }
-        );
-        assert_eq!(chars.chars_left(), 0);
-    }
-
-    #[test]
-    fn test_seek() {
-        let mut chars = CharIterator::new();
-        chars.read_from_str("ab👽cd", Some(Encoding::UTF8));
-        assert_eq!(chars.length, 5);
-        assert_eq!(chars.chars_left(), 5);
-        assert_eq!(chars.read_char(), Ch('a'));
-        assert_eq!(chars.read_char(), Ch('b'));
-        assert_eq!(chars.chars_left(), 3);
-        chars.seek(SeekMode::SeekSet, 0);
-        assert_eq!(chars.chars_left(), 5);
-        assert_eq!(chars.read_char(), Ch('a'));
-        assert_eq!(chars.read_char(), Ch('b'));
-        assert_eq!(chars.chars_left(), 3);
-        chars.seek(SeekMode::SeekSet, 3);
-        assert_eq!(chars.chars_left(), 2);
-        assert_eq!(chars.read_char(), Ch('c'));
-        assert_eq!(chars.read_char(), Ch('d'));
-        assert_eq!(chars.chars_left(), 0);
-        assert!(chars.eof());
-
-        chars.reset();
-        assert_eq!(chars.look_ahead(0), Ch('a'));
-        assert_eq!(chars.look_ahead(3), Ch('c'));
-        assert_eq!(chars.look_ahead(1), Ch('b'));
-        assert!(matches!(chars.look_ahead(100), Eof));
-
-        chars.seek(SeekMode::SeekSet, 0);
-        assert_eq!(chars.look_ahead_slice(1), "a");
-        assert_eq!(chars.look_ahead_slice(2), "ab");
-        assert_eq!(chars.look_ahead_slice(3), "ab👽");
-        assert_eq!(chars.look_ahead_slice(4), "ab👽c");
-        assert_eq!(chars.look_ahead_slice(5), "ab👽cd");
-        assert_eq!(chars.look_ahead_slice(6), "ab👽cd");
-        assert_eq!(chars.look_ahead_slice(100), "ab👽cd");
-
-        chars.seek(SeekMode::SeekSet, 3);
-        assert_eq!(chars.look_ahead_slice(1), "c");
-        assert_eq!(chars.look_ahead_slice(2), "cd");
-
-        chars.seek(SeekMode::SeekSet, 0);
-        assert_eq!(chars.position.offset, 0);
-
-        chars.seek(SeekMode::SeekSet, 3);
-        assert_eq!(chars.position.offset, 3);
-
-        chars.seek(SeekMode::SeekCur, 0);
-        assert_eq!(chars.position.offset, 3);
-
-        chars.seek(SeekMode::SeekCur, 1);
-        assert_eq!(chars.position.offset, 4);
-
-        chars.seek(SeekMode::SeekCur, -2);
-        assert_eq!(chars.position.offset, 2);
-
-        chars.seek(SeekMode::SeekCur, 10);
-        assert_eq!(chars.position.offset, 5);
-
-        chars.seek(SeekMode::SeekSet, 100);
-        assert_eq!(chars.position.offset, 5);
-
-        chars.seek(SeekMode::SeekSet, -100);
-        assert_eq!(chars.position.offset, 0);
-
-        chars.seek(SeekMode::SeekEnd, -100);
-        assert_eq!(chars.position.offset, 0);
-    }
-
     #[test]
     fn test_eof() {
         let mut chars = CharIterator::new();
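The parser changes below all follow one pattern: a text token that mixes character classes is split and re-queued before the insertion mode looks at it. The classification lives in the new `Token::is_mixed` (see the token.rs hunk near the end of this patch); since it is `pub(crate)`, a behaviour sketch would sit inside the crate, e.g. as a test:

```rust
#[cfg(test)]
mod is_mixed_sketch {
    use crate::html5::tokenizer::token::Token;

    #[test]
    fn mixed_detection() {
        // Homogeneous tokens are left alone ...
        assert!(!Token::Text("foobar".into()).is_mixed());
        assert!(!Token::Text(" \t\n".into()).is_mixed());

        // ... while any combination of whitespace, NUL and regular
        // characters gets split and re-queued first.
        assert!(Token::Text(" foo bar\0 ".into()).is_mixed());
        assert!(Token::Text("foo ".into()).is_mixed());
    }
}
```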
diff --git a/src/html5/parser.rs b/src/html5/parser.rs
index ad11b36d4..b7d807f1f 100644
--- a/src/html5/parser.rs
+++ b/src/html5/parser.rs
@@ -17,7 +17,7 @@ use crate::html5::parser::document::{Document, DocumentBuilder, DocumentFragment
 use crate::html5::parser::quirks::QuirksMode;
 use crate::html5::tokenizer::state::State;
 use crate::html5::tokenizer::token::Token;
-use crate::html5::tokenizer::{ParserData, Tokenizer, CHAR_NUL, CHAR_REPLACEMENT};
+use crate::html5::tokenizer::{ParserData, Tokenizer, CHAR_REPLACEMENT};
 use crate::types::{ParseError, Result};
 use alloc::rc::Rc;
 use core::cell::RefCell;
@@ -444,6 +444,11 @@ impl<'chars> Html5Parser<'chars> {
         let mut handle_as_script_endtag = false;

         match &self.current_token.clone() {
+            Token::Text(value) if self.current_token.is_mixed() => {
+                let tokens = self.split_mixed_token(value);
+                self.tokenizer.insert_tokens_at_queue_start(tokens);
+                return;
+            }
             Token::Text(..) if self.current_token.is_null() => {
                 self.parse_error("null character not allowed in foreign content");
                 self.insert_text_element(&Token::Text(CHAR_REPLACEMENT.to_string()));
             }
@@ -630,23 +635,24 @@ fn process_html_content(&mut self) {
         if self.ignore_lf {
             if let Token::Text(value) = &self.current_token {
-                if value.eq(&"\n".to_string()) {
-                    self.current_token = self.fetch_next_token();
+                if value.starts_with('\n') {
+                    // We just checked for '\n', so only that single leading char needs to be dropped
+                    self.current_token = Token::Text(value.chars().skip(1).collect::<String>());
                 }
             }
             self.ignore_lf = false;
         }

-        // // Break when we reach the end of the token chars
-        // if self.current_token.is_eof() {
-        //     self.parser_finished = true;
-        // }
-
         match self.insertion_mode {
             InsertionMode::Initial => {
                 let mut anything_else = false;

                 match &self.current_token.clone() {
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                        return;
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         // ignore token
                     }
@@ -726,6 +732,11 @@ impl<'chars> Html5Parser<'chars> {
                             Some(NodeId::root()),
                         );
                     }
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                        return;
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         // ignore token
                     }
@@ -763,6 +774,11 @@ impl<'chars> Html5Parser<'chars> {
                 let mut anything_else = false;

                 match &self.current_token {
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                        return;
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         // ignore token
                     }
@@ -824,6 +840,11 @@ impl<'chars> Html5Parser<'chars> {
                         self.check_last_element("head");
                         self.insertion_mode = InsertionMode::InHead;
                     }
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                        return;
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         self.handle_in_head();
                     }
@@ -871,6 +892,11 @@ impl<'chars> Html5Parser<'chars> {
                 let mut anything_else = false;

                 match &self.current_token {
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                        return;
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         self.insert_text_element(&self.current_token.clone());
                     }
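Because text tokens can now span many characters, the `ignore_lf` path above can no longer just fetch the next token; it strips the leading newline from the current token instead. A standalone replica of that transformation:

```rust
// Replica of the ignore_lf newline stripping in process_html_content.
fn strip_leading_lf(value: &str) -> String {
    if value.starts_with('\n') {
        // Drop only the first character; the rest of the token survives.
        value.chars().skip(1).collect::<String>()
    } else {
        value.to_string()
    }
}

fn main() {
    assert_eq!(strip_leading_lf("\nfoo"), "foo");
    assert_eq!(strip_leading_lf("foo\n"), "foo\n");
    assert_eq!(strip_leading_lf("\n"), "");
}
```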
@@ -999,6 +1025,10 @@ impl<'chars> Html5Parser<'chars> {
             InsertionMode::InTable => self.handle_in_table(),
             InsertionMode::InTableText => {
                 match &self.current_token {
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                    }
                     Token::Text(..) if self.current_token.is_null() => {
                         self.parse_error(
                             "null character not allowed in in table text insertion mode",
                         );
@@ -1006,18 +1036,10 @@ impl<'chars> Html5Parser<'chars> {
                         // ignore token
                     }
                     Token::Text(value) => {
-                        for c in value.chars() {
-                            if c == CHAR_NUL {
-                                self.parse_error(
-                                    "null character not allowed in in table insertion mode",
-                                );
-                            } else {
-                                self.pending_table_character_tokens.push(c);
-                            }
-                        }
+                        self.pending_table_character_tokens.push_str(value);
                     }
                     _ => {
-                        let tokens = self.pending_table_character_tokens.clone();
+                        let pending_chars = self.pending_table_character_tokens.clone();

                         let mut process_as_intable_anything_else = false;
@@ -1033,15 +1055,18 @@ impl<'chars> Html5Parser<'chars> {

                         if process_as_intable_anything_else {
                             let tmp = self.current_token.clone();

-                            self.current_token = Token::Text(tokens);
-                            self.foster_parenting = true;
-                            self.handle_in_body();
-                            self.foster_parenting = false;
+                            let tokens = self.split_mixed_token(&pending_chars);
+                            for token in tokens {
+                                self.current_token = token;
+                                self.handle_in_body();
+                            }
+
+                            self.foster_parenting = false;

                             self.current_token = tmp;
                         } else {
-                            self.insert_text_element(&Token::Text(tokens));
+                            self.insert_text_element(&Token::Text(pending_chars));
                         }

                         self.pending_table_character_tokens.clear();
@@ -1115,6 +1140,10 @@ impl<'chars> Html5Parser<'chars> {
             }
             InsertionMode::InColumnGroup => {
                 match &self.current_token {
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         self.insert_text_element(&self.current_token.clone());
                     }
@@ -1468,6 +1497,10 @@ impl<'chars> Html5Parser<'chars> {
             InsertionMode::InTemplate => self.handle_in_template(),
             InsertionMode::AfterBody => {
                 match &self.current_token {
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         self.handle_in_body();
                     }
@@ -1508,6 +1541,10 @@ impl<'chars> Html5Parser<'chars> {
             }
             InsertionMode::InFrameset => {
                 match &self.current_token {
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         self.insert_text_element(&self.current_token.clone());
                     }
@@ -1566,6 +1603,10 @@ impl<'chars> Html5Parser<'chars> {
             }
             InsertionMode::AfterFrameset => {
                 match &self.current_token {
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         self.insert_text_element(&self.current_token.clone());
                     }
@@ -1603,6 +1644,10 @@ impl<'chars> Html5Parser<'chars> {
                     Token::DocType { .. } => {
                         self.handle_in_body();
                     }
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         self.handle_in_body();
                     }
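The "in table text" mode above now buffers whole strings (`push_str`) and defers the whitespace decision to flush time: whitespace-only pending text is inserted into the table, anything else is split and re-run through the "in body" rules. The flag itself is computed in unchanged code outside these hunks; per the HTML spec it amounts to the following sketch:

```rust
// Sketch of the flush decision (the actual loop is unchanged code
// outside the hunks above): any non-whitespace pending character
// forces the "anything else" / foster-parenting path.
fn process_as_anything_else(pending: &str) -> bool {
    pending.chars().any(|c| !c.is_ascii_whitespace())
}

fn main() {
    assert!(!process_as_anything_else(" \t\n "));
    assert!(process_as_anything_else("  foo "));
}
```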
@@ -1631,6 +1676,10 @@ impl<'chars> Html5Parser<'chars> {
                     Token::DocType { .. } => {
                         self.handle_in_body();
                     }
+                    Token::Text(value) if self.current_token.is_mixed() => {
+                        let tokens = self.split_mixed_token(value);
+                        self.tokenizer.insert_tokens_at_queue_start(tokens);
+                    }
                     Token::Text(..) if self.current_token.is_empty_or_white() => {
                         self.handle_in_body();
                     }
@@ -2067,21 +2116,23 @@ impl<'chars> Html5Parser<'chars> {
     /// Handle insertion mode "in_body"
     fn handle_in_body(&mut self) {
         match &self.current_token.clone() {
+            Token::Text(value) if self.current_token.is_mixed() => {
+                let tokens = self.split_mixed_token(value);
+                self.tokenizer.insert_tokens_at_queue_start(tokens);
+            }
             Token::Text(..) if self.current_token.is_null() => {
                 self.parse_error("null character not allowed in in body insertion mode");
                 // ignore token
             }
-            Token::Text(..) if self.current_token.is_empty_or_white() => {
-                self.reconstruct_formatting();
-
-                self.insert_text_element(&self.current_token.clone());
-            }
             Token::Text(..) => {
                 self.reconstruct_formatting();

                 self.insert_text_element(&self.current_token.clone());

-                self.frameset_ok = false;
+                // If this text token is not entirely whitespace, the frameset is no longer ok
+                if !self.current_token.is_empty_or_white() {
+                    self.frameset_ok = false;
+                }
             }
             Token::Comment(..) => {
                 self.insert_comment_element(&self.current_token.clone(), None);
             }
@@ -2906,6 +2957,11 @@ impl<'chars> Html5Parser<'chars> {
         let mut anything_else = false;

         match &self.current_token {
+            Token::Text(value) if self.current_token.is_mixed() => {
+                let tokens = self.split_mixed_token(value);
+                self.tokenizer.insert_tokens_at_queue_start(tokens);
+                return;
+            }
             Token::Text(..) if self.current_token.is_empty_or_white() => {
                 self.insert_text_element(&self.current_token.clone());
             }
@@ -3306,6 +3362,10 @@ impl<'chars> Html5Parser<'chars> {
     /// Handle insertion mode "in_select"
     fn handle_in_select(&mut self) {
         match &self.current_token {
+            Token::Text(value) if self.current_token.is_mixed() => {
+                let tokens = self.split_mixed_token(value);
+                self.tokenizer.insert_tokens_at_queue_start(tokens);
+            }
             Token::Text(..) if self.current_token.is_null() => {
                 self.parse_error("null character not allowed in in select insertion mode");
                 // ignore token
             }
@@ -3781,9 +3841,10 @@ impl<'chars> Html5Parser<'chars> {
             .expect("tokenizer error");

         if let Token::Text(value) = token {
-            for c in value.chars() {
-                self.token_queue.push(Token::Text(c.to_string()));
-            }
+            self.token_queue.push(Token::Text(value));
+            // for c in value.chars() {
+            //     self.token_queue.push(Token::Text(c.to_string()));
+            // }
         } else {
             // Simply return the token
             return token;
         }
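This is the core behavioural change of the patch: `fetch_next_token` used to explode every text token into per-character tokens, and now forwards the whole string, leaving any splitting to `split_mixed_token` (added below). A tiny before/after sketch with a local stand-in type:

```rust
// Local stand-in for the crate's Token, just for illustration.
#[derive(Clone, Debug, PartialEq)]
enum Token {
    Text(String),
}

fn main() {
    let value = String::from("hello");

    // Old behaviour: one single-character token per char.
    let old: Vec<Token> = value.chars().map(|c| Token::Text(c.to_string())).collect();
    assert_eq!(old.len(), 5);

    // New behaviour: the whole text stays one token, so large text
    // blobs (e.g. script bodies) are no longer shredded.
    let new = vec![Token::Text(value)];
    assert_eq!(new.len(), 1);
}
```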
@@ -3907,6 +3968,63 @@ impl<'chars> Html5Parser<'chars> {
         self.tokenizer
             .set_state(self.find_initial_state_for_context(context_node));
     }
+
+    /// Splits a regular text token with mixed characters into tokens of 3 groups:
+    /// null characters, (ascii) whitespace, and regular (rest) characters.
+    /// These tokens are then inserted into the token buffer queue so they can get parsed
+    /// correctly.
+    ///
+    /// example:
+    ///
+    /// Token::Text(" foo bar\0 ")
+    ///
+    /// is split into 6 tokens:
+    ///
+    /// Token::Text(" ")      // whitespace
+    /// Token::Text("foo")    // regular
+    /// Token::Text(" ")      // whitespace
+    /// Token::Text("bar")    // regular
+    /// Token::Text("\0")     // null
+    /// Token::Text(" ")      // whitespace
+    ///
+    /// This is needed because the tokenizer does not know about the context of the text it
+    /// is tokenizing, so it will always tokenize as greedily as possible. But sometimes we
+    /// need this split to happen where a differentiation between whitespace, null, and
+    /// regular characters is needed. Only in those cases is this function called, and the
+    /// token will be split into multiple tokens.
+    /// The idea is that large blobs of javascript, for instance, will not be split into
+    /// separate tokens, but will still be seen and parsed as a single Text token.
+    ///
+    fn split_mixed_token(&self, text: &String) -> Vec<Token> {
+        let mut tokens = vec![];
+        let mut last_group = 'x';
+
+        let mut found = String::new();
+
+        for ch in text.chars() {
+            let group = if ch == '\0' {
+                '0'
+            } else if ch.is_ascii_whitespace() {
+                'w'
+            } else {
+                'r'
+            };
+
+            if last_group != group && !found.is_empty() {
+                tokens.push(Token::Text(found.clone()));
+                found.clear();
+            }
+
+            found.push(ch);
+            last_group = group;
+        }
+
+        if !found.is_empty() {
+            tokens.push(Token::Text(found.clone()));
+        }
+
+        tokens
+    }
 }

 #[cfg(test)]
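A standalone replica of the grouping that `split_mixed_token` performs, runnable outside the parser (the real method wraps each piece in `Token::Text` instead of returning plain strings):

```rust
/// Replica of split_mixed_token's grouping: consecutive characters of
/// the same class (null / ASCII whitespace / regular) become one piece.
fn split_mixed(text: &str) -> Vec<String> {
    let mut pieces = vec![];
    let mut last_group = 'x';
    let mut found = String::new();

    for ch in text.chars() {
        let group = if ch == '\0' {
            '0'
        } else if ch.is_ascii_whitespace() {
            'w'
        } else {
            'r'
        };

        if last_group != group && !found.is_empty() {
            pieces.push(found.clone());
            found.clear();
        }

        found.push(ch);
        last_group = group;
    }

    if !found.is_empty() {
        pieces.push(found);
    }

    pieces
}

fn main() {
    // The doc-comment example: 6 pieces, in order.
    assert_eq!(
        split_mixed(" foo bar\0 "),
        vec![" ", "foo", " ", "bar", "\0", " "]
    );
}
```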
diff --git a/src/html5/parser/helper.rs b/src/html5/parser/helper.rs
index c4b17f2cf..8fcbba369 100644
--- a/src/html5/parser/helper.rs
+++ b/src/html5/parser/helper.rs
@@ -240,6 +240,13 @@ impl Html5Parser<'_> {
     }

     pub fn insert_text_element(&mut self, token: &Token) {
+        // Skip empty text nodes
+        if let Token::Text(text) = token {
+            if text.is_empty() {
+                return;
+            }
+        }
+
         let insertion_position = self.appropriate_place_insert(None);
         // TODO, for text element, if the insertion_position is Document, should not do next step.
         self.insert_text_helper(insertion_position, token);
diff --git a/src/html5/tokenizer.rs b/src/html5/tokenizer.rs
index 931370f9e..b6580f5d4 100644
--- a/src/html5/tokenizer.rs
+++ b/src/html5/tokenizer.rs
@@ -5,7 +5,6 @@ mod character_reference;
 mod replacement_tables;

 use crate::bytes::Bytes::{self, *};
-use crate::bytes::SeekMode::SeekCur;
 use crate::bytes::{CharIterator, Position};
 use crate::html5::error_logger::{ErrorLogger, ParserError};
 use crate::html5::node::HTML_NAMESPACE;
@@ -51,6 +50,15 @@ pub struct Tokenizer<'stream> {
     pub error_logger: Rc<RefCell<ErrorLogger>>,
 }

+impl<'stream> Tokenizer<'stream> {
+    pub(crate) fn insert_tokens_at_queue_start(&mut self, first_tokens: Vec<Token>) {
+        let mut new_queue = first_tokens.clone();
+        new_queue.extend(self.token_queue.iter().cloned());
+
+        self.token_queue = new_queue;
+    }
+}
+
 /// This struct is a gateway between the parser and the tokenizer. It holds data that can be needed
 /// by the tokenizer in certain cases. See https://github.com/gosub-browser/gosub-engine/issues/230 for
 /// more information and how we should refactor this properly.
@@ -158,7 +166,6 @@ impl<'stream> Tokenizer<'stream> {
                     self.parse_error(ParserError::UnexpectedNullCharacter);
                 }
                 Eof => {
-                    // EOF
                     // if self.has_consumed_data() {
                     //     self.emit_token(Token::TextToken { value: self.get_consumed_str() });
                     //     self.clear_consume_buffer();
@@ -1211,20 +1218,20 @@ impl<'stream> Tokenizer<'stream> {
                     self.current_token = Some(Token::Comment("".into()));

                     // Skip the two -- signs
-                    self.chars.seek(SeekCur, 2);
+                    self.chars.skip(2);

                     self.state = State::CommentStart;
                     continue;
                 }

                 if self.chars.look_ahead_slice(7).to_uppercase() == "DOCTYPE" {
-                    self.chars.seek(SeekCur, 7);
+                    self.chars.skip(7);
                     self.state = State::DOCTYPE;
                     continue;
                 }

                 if self.chars.look_ahead_slice(7) == "[CDATA[" {
-                    self.chars.seek(SeekCur, 7);
+                    self.chars.skip(7);

                     if parser_data.adjusted_node_namespace != HTML_NAMESPACE {
                         self.state = State::CDATASection;
@@ -1238,9 +1245,9 @@ impl<'stream> Tokenizer<'stream> {
                     continue;
                 }

-                self.chars.seek(SeekCur, 1);
+                // self.chars.skip(1);
                 self.parse_error(ParserError::IncorrectlyOpenedComment);
-                self.chars.unread();
+                // self.chars.unread();
                 self.current_token = Some(Token::Comment("".into()));

                 self.state = State::BogusComment;
@@ -1568,20 +1575,20 @@ impl<'stream> Tokenizer<'stream> {
                     _ => {
                         self.chars.unread();
                         if self.chars.look_ahead_slice(6).to_uppercase() == "PUBLIC" {
-                            self.chars.seek(SeekCur, 6);
+                            self.chars.skip(6);
                             self.state = State::AfterDOCTYPEPublicKeyword;
                             continue;
                         }
                         if self.chars.look_ahead_slice(6).to_uppercase() == "SYSTEM" {
-                            self.chars.seek(SeekCur, 6);
+                            self.chars.skip(6);
                             self.state = State::AfterDOCTYPESystemKeyword;
                             continue;
                         }
                         // Make sure the parser is on the correct position again since we just
                         // unread the character
-                        self.chars.seek(SeekCur, 1);
+                        self.chars.skip(1);
                         self.parse_error(ParserError::InvalidCharacterSequenceAfterDoctypeName);
-                        self.chars.seek(SeekCur, -1);
+                        self.chars.unread();
                         self.set_quirks_mode(true);
                         self.chars.unread();
                         self.state = State::BogusDOCTYPE;
diff --git a/src/html5/tokenizer/character_reference.rs b/src/html5/tokenizer/character_reference.rs
index 5d2796cb0..86bc0a8b8 100644
--- a/src/html5/tokenizer/character_reference.rs
+++ b/src/html5/tokenizer/character_reference.rs
@@ -1,12 +1,8 @@
 extern crate lazy_static;

-use crate::bytes::{
-    Bytes::{self, *},
-    SeekMode::SeekCur,
-};
+use crate::bytes::Bytes::{self, *};
 use crate::html5::error_logger::ParserError;
 use crate::html5::tokenizer::replacement_tables::{TOKEN_NAMED_CHARS, TOKEN_REPLACEMENTS};
-
 use crate::html5::tokenizer::{Tokenizer, CHAR_REPLACEMENT};
 use lazy_static::lazy_static;

@@ -62,7 +58,7 @@ impl Tokenizer<'_> {
             }
             CcrState::NamedCharacterReference => {
                 if let Some(entity) = self.find_entity() {
-                    self.chars.seek(SeekCur, entity.len() as isize);
+                    self.chars.skip(entity.len());

                     let c = self.chars.look_ahead(0);
                     if as_attribute
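`insert_tokens_at_queue_start` (tokenizer.rs hunk above) prepends the split pieces so they are processed before anything already queued. A standalone replica over plain strings:

```rust
// Replica of insert_tokens_at_queue_start: prepend while keeping order.
fn insert_at_queue_start(queue: &mut Vec<String>, first_tokens: Vec<String>) {
    let mut new_queue = first_tokens;
    new_queue.extend(queue.drain(..));
    *queue = new_queue;
}

fn main() {
    let mut queue = vec!["later".to_string()];
    insert_at_queue_start(&mut queue, vec!["first".into(), "second".into()]);
    assert_eq!(queue, vec!["first", "second", "later"]);
}
```

A `VecDeque` would make the prepend cheaper, and the `clone()` of `first_tokens` in the patch is not strictly needed since the vector is owned; the version above keeps the patch's rebuild-the-queue approach.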
diff --git a/src/html5/tokenizer/token.rs b/src/html5/tokenizer/token.rs
index b48082892..377551d75 100644
--- a/src/html5/tokenizer/token.rs
+++ b/src/html5/tokenizer/token.rs
@@ -30,6 +30,31 @@ pub enum Token {
     Eof,
 }

+impl Token {
+    pub(crate) fn is_mixed(&self) -> bool {
+        // Check if the token contains more than one class of character: whitespace, null, regular
+        if let Token::Text(value) = self {
+            let mut found = 0;
+
+            if value.chars().any(|ch| ch.is_ascii_whitespace()) {
+                found += 1;
+            }
+            if value.chars().any(|ch| ch == '\0') {
+                found += 1;
+            }
+            if value
+                .chars()
+                .any(|ch| !ch.is_ascii_whitespace() && ch != '\0')
+            {
+                found += 1;
+            }
+
+            found > 1
+        } else {
+            false
+        }
+    }
+}
+
 impl Token {
     /// Returns true when any of the characters in the token are null
     pub fn is_null(&self) -> bool {
@@ -48,7 +73,11 @@ impl Token {
     /// Returns true if the text token is empty or only contains whitespace
     pub fn is_empty_or_white(&self) -> bool {
         if let Token::Text(value) = self {
-            ["\u{0009}", "\u{000a}", "\u{000c}", "\u{000d}", "\u{0020}"].contains(&value.as_str())
+            if value.is_empty() {
+                return true;
+            }
+
+            value.chars().all(|ch| ch.is_ascii_whitespace())
         } else {
             false
         }
diff --git a/src/testing/tree_construction/parser.rs b/src/testing/tree_construction/parser.rs
index a5ba70a28..02b72ba0c 100644
--- a/src/testing/tree_construction/parser.rs
+++ b/src/testing/tree_construction/parser.rs
@@ -310,8 +310,8 @@ fn test(i: Span) -> IResult<Span, Test> {

 /// Trims only a single newline from the string, even if there are multiple newlines present.
 fn trim_last_newline(s: String) -> String {
-    if s.ends_with('\n') {
-        s[..s.len() - 1].to_owned()
+    if let Some(s) = s.strip_suffix('\n') {
+        s.to_owned()
     } else {
         s
     }
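`strip_suffix` makes "trim at most one trailing newline" explicit and drops the manual slicing. The function is small enough to check standalone:

```rust
/// Copy of trim_last_newline above: removes at most one trailing '\n'.
fn trim_last_newline(s: String) -> String {
    if let Some(s) = s.strip_suffix('\n') {
        s.to_owned()
    } else {
        s
    }
}

fn main() {
    assert_eq!(trim_last_newline("foo\n\n".into()), "foo\n");
    assert_eq!(trim_last_newline("foo\n".into()), "foo");
    assert_eq!(trim_last_newline("foo".into()), "foo");
}
```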