diff --git a/.gitignore b/.gitignore index ea8c4bf7f..39518e663 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ /target +html5lib-tests diff --git a/Cargo.lock b/Cargo.lock index 4b3524145..585d079f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,15 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "aho-corasick" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6748e8def348ed4d14996fa801f4122cd763fff530258cdc03f64b25f89d3a5a" +dependencies = [ + "memchr", +] + [[package]] name = "convert_case" version = "0.4.0" @@ -22,13 +31,36 @@ dependencies = [ ] [[package]] -name = "gosub-browser" +name = "gosub-engine" version = "0.1.0" dependencies = [ "derive_more", + "lazy_static", "phf", + "regex", + "serde", + "serde_derive", + "serde_json", ] +[[package]] +name = "itoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" + +[[package]] +name = "lazy_static" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" + +[[package]] +name = "memchr" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" + [[package]] name = "phf" version = "0.11.2" @@ -104,6 +136,35 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +[[package]] +name = "regex" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81bc1d4caf89fac26a70747fe603c130093b53c773888797a6329091246d651a" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.3.6" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fed1ceff11a1dddaee50c9dc8e4938bd106e9d89ae372f192311e7da498e3b69" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2" + [[package]] name = "rustc_version" version = "0.4.0" @@ -113,12 +174,46 @@ dependencies = [ "semver", ] +[[package]] +name = "ryu" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" + [[package]] name = "semver" version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" +[[package]] +name = "serde" +version = "1.0.183" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" + +[[package]] +name = "serde_derive" +version = "1.0.183" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", +] + +[[package]] +name = "serde_json" +version = "1.0.105" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "693151e1ac27563d6dbcec9dee9fbd5da8539b20fa14ad3752b2e6d363ace360" +dependencies = [ + "itoa", + "ryu", + "serde", +] + [[package]] name = "siphasher" version = "0.3.10" diff --git a/Cargo.toml b/Cargo.toml index e008cfe9f..8caf39846 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,10 +1,20 @@ [package] -name = "gosub-browser" +name = "gosub-engine" version = "0.1.0" edition = "2021" +authors = ["Joshua Thijssen "] -# See more keys and their definitions at 
https://doc.rust-lang.org/cargo/reference/manifest.html +description = "html5 browser engine" +license = "MIT" +repository = "https://github.com/jaytaph/gosub-browser" +readme = "README.MD" +keywords = ["html5", "parser"] [dependencies] phf = { version = "0.11.2", features = ["macros"] } derive_more = "0.99" +serde = "1.0" +serde_json = "1.0" +serde_derive = "1.0" +regex = "1" +lazy_static = "1.4" \ No newline at end of file diff --git a/src/main.rs b/src/bin/gosub-browser.rs similarity index 74% rename from src/main.rs rename to src/bin/gosub-browser.rs index 807d33f74..0f93e43ca 100644 --- a/src/main.rs +++ b/src/bin/gosub-browser.rs @@ -1,14 +1,11 @@ use std::fs::File; -#[allow(dead_code)] -mod html5_parser; - -use html5_parser::input_stream::Confidence; -use html5_parser::input_stream::{Encoding, InputStream}; -use html5_parser::Html5Parser; +use gosub_engine::html5_parser::input_stream::Confidence; +use gosub_engine::html5_parser::input_stream::{Encoding, InputStream}; +use gosub_engine::html5_parser::parser::Html5Parser; fn main() { - let file = File::open("hello.html").expect("could not open file"); + let file = File::open("../../hello.html").expect("could not open file"); // We just read the stream from a file. It will use UTF8 as the default encoding. let mut stream = InputStream::new(); diff --git a/src/bin/html5test.rs b/src/bin/html5test.rs new file mode 100755 index 000000000..2767ced64 --- /dev/null +++ b/src/bin/html5test.rs @@ -0,0 +1,422 @@ +use std::{env, fs, io}; +use std::collections::HashSet; + +use serde_json::Value; +use gosub_engine::html5_parser::input_stream::InputStream; +use gosub_engine::html5_parser::token_states::{State as TokenState}; +use gosub_engine::html5_parser::tokenizer::{Options, Tokenizer}; +use gosub_engine::html5_parser::token::{Token, TokenTrait, TokenType}; + +extern crate regex; +use regex::Regex; + +#[macro_use] +extern crate serde_derive; + +// These tests are skipped for various reasons. 
See test_results.md +const SKIP_TESTS: [&str; 1] = [ + ", +} + +#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Test { + pub description: String, + pub input: String, + pub output: Vec>, + #[serde(default)] + pub errors: Vec, + #[serde(default)] + pub double_escaped: Option, + #[serde(default)] + pub initial_states: Vec, + pub last_start_tag: Option, +} + +#[derive(Default, Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Error { + pub code: String, + pub line: i64, + pub col: i64, +} + +pub struct TestResults{ + tests: usize, // Number of tests (as defined in the suite) + assertions: usize, // Number of assertions (different combinations of input/output per test) + succeeded: usize, // How many succeeded assertions + failed: usize, // How many failed assertions + failed_position: usize, // How many failed assertions where position is not correct +} + +fn main () -> io::Result<()> { + let default_dir = "./html5lib-tests"; + let dir = env::args().nth(1).unwrap_or(default_dir.to_string()); + + let mut results = TestResults{ + tests: 0, + assertions: 0, + succeeded: 0, + failed: 0, + failed_position: 0, + }; + + for entry in fs::read_dir(dir + "/tokenizer")? 
{ + let entry = entry?; + let path = entry.path(); + + if !path.is_file() || path.extension().unwrap() != "test" { + continue; + } + + let contents = fs::read_to_string(&path)?; + let container = serde_json::from_str(&contents); + if container.is_err() { + continue; + } + let container: Root = container.unwrap(); + + println!("🏃‍♂️ Running {} tests from 🗄️ {:?}", container.tests.len(), path); + + for test in container.tests { + run_token_test(&test, &mut results) + } + } + + println!("🏁 Tests completed: Ran {} tests, {} assertions, {} succeeded, {} failed ({} position failures)", results.tests, results.assertions, results.succeeded, results.failed, results.failed_position); + Ok(()) +} + +fn run_token_test(test: &Test, results: &mut TestResults) +{ + for skip in SKIP_TESTS { + if test.description == skip { + println!("🧪 Skipping test: {}", test.description); + return; + } + } + + println!("🧪 Running test: {}", test.description); + + results.tests += 1; + + // If no initial state is given, assume Data state + let mut states = test.initial_states.clone(); + if states.is_empty() { + states.push(String::from("Data state")); + } + + + for state in states.iter() { + let state= match state.as_str() { + "PLAINTEXT state" => TokenState::PlaintextState, + "RAWTEXT state" => TokenState::RawTextState, + "RCDATA state" => TokenState::RcDataState, + "Script data state" => TokenState::ScriptDataState, + "CDATA section state" => TokenState::CDataSectionState, + "Data state" => TokenState::DataState, + _ => panic!("unknown state found in test: {} ", state) + }; + + let mut is = InputStream::new(); + let input = if test.double_escaped.unwrap_or(false) { + escape(test.input.as_str()) + } else { + test.input.to_string() + }; + + is.read_from_str(input.as_str(), None); + let mut tokenizer = Tokenizer::new(&mut is, Some(Options{ + initial_state: state, + last_start_tag: test.last_start_tag.clone().unwrap_or(String::from("")), + })); + + // If there is no output, still do an (initial) 
next token so the parser can generate + // errors. + if test.output.is_empty() { + tokenizer.next_token(); + } + + // There can be multiple tokens to match. Make sure we match all of them + for expected_token in test.output.iter() { + let t = tokenizer.next_token(); + if !match_token(t, expected_token, test.double_escaped.unwrap_or(false)) { + results.assertions += 1; + results.failed += 1; + } + } + + if tokenizer.errors.len() != test.errors.len() { + println!("❌ Unexpected errors found (wanted {}, got {}): ", test.errors.len(), tokenizer.errors.len()); + for want_err in &test.errors { + println!(" * Want: '{}' at {}:{}", want_err.code, want_err.line, want_err.col); + } + for got_err in tokenizer.get_errors() { + println!(" * Got: '{}' at {}:{}", got_err.message, got_err.line, got_err.col); + } + results.assertions += 1; + results.failed += 1; + } + + // Check error messages + for error in &test.errors { + match match_error(&tokenizer, &error) { + ErrorResult::Failure => { + results.assertions += 1; + results.failed += 1; + }, + ErrorResult::PositionFailure => { + results.assertions += 1; + results.failed += 1; + results.failed_position += 1; + }, + ErrorResult::Success => { + results.assertions += 1; + results.succeeded += 1; + } + } + } + } + + println!("----------------------------------------"); +} + +#[derive(PartialEq)] +enum ErrorResult { + Success, // Found the correct error + Failure, // Didn't find the error (not even with incorrect position) + PositionFailure, // Found the error, but on a incorrect position +} + +fn match_error(tokenizer: &Tokenizer, expected_err: &Error) -> ErrorResult { + + // Iterate all generated errors to see if we have an exact match + for got_err in tokenizer.get_errors() { + if got_err.message == expected_err.code && got_err.line as i64 == expected_err.line && got_err.col as i64 == expected_err.col { + // Found an exact match + println!("✅ Found parse error '{}' at {}:{}", got_err.message, got_err.line, got_err.col); + + return 
ErrorResult::Success; + } + } + + // Try and find an error that matches the code, but has a different line/pos. Even though + // it's not always correct, it might be a off-by-one position. + let mut result = ErrorResult::Failure; + for got_err in tokenizer.get_errors() { + if got_err.message == expected_err.code { + if got_err.line as i64 != expected_err.line || got_err.col as i64 != expected_err.col { + // println!("❌ Expected error '{}' at {}:{}", expected_err.code, expected_err.line, expected_err.col); + result = ErrorResult::PositionFailure; + break; + } + } + } + + println!("❌ Expected error '{}' at {}:{}", expected_err.code, expected_err.line, expected_err.col); + + println!(" Parser errors generated:"); + for got_err in tokenizer.get_errors() { + println!(" * '{}' at {}:{}", got_err.message, got_err.line, got_err.col); + } + + result +} + +fn match_token(have: Token, expected: &[Value], double_escaped: bool) -> bool { + let tp = expected.get(0).unwrap(); + + let expected_token_type = match tp.as_str().unwrap() { + "DOCTYPE" => TokenType::DocTypeToken, + "StartTag" => TokenType::StartTagToken, + "EndTag" => TokenType::EndTagToken, + "Comment" => TokenType::CommentToken, + "Character" => TokenType::TextToken, + _ => panic!("unknown output token type {:?}", tp.as_str().unwrap()) + }; + + if have.type_of() != expected_token_type { + println!("❌ Incorrect token type found (want: {:?}, got {:?})", expected_token_type, have.type_of()); + return false; + } + + match have { + Token::DocTypeToken{name, force_quirks, pub_identifier, sys_identifier} => { + if check_match_doctype(expected, name, force_quirks, pub_identifier, sys_identifier).is_err() { + return false; + } + } + Token::StartTagToken{name, attributes, is_self_closing} => { + if check_match_starttag(expected, name, attributes, is_self_closing).is_err() { + return false; + } + } + Token::EndTagToken{name} => { + if check_match_endtag(expected, name, double_escaped).is_err() { + return false; + } + } + 
Token::CommentToken{value} => { + if check_match_comment(expected, value, double_escaped).is_err() { + return false; + } + } + Token::TextToken{value} => { + if check_match_text(expected, value, double_escaped).is_err() { + return false; + } + }, + Token::EofToken => { + println!("❌ EOF token"); + return false; + } + } + + println!("✅ Test passed"); + true +} + +fn check_match_starttag(expected: &[Value], name: String, attributes: Vec<(String, String)>, is_self_closing: bool) -> Result<(), ()> { + let expected_name = expected.get(1).and_then(|v| v.as_str()).unwrap(); + let expected_attrs = expected.get(2).and_then(|v| v.as_object()); + let expected_self_closing = expected.get(3).and_then(|v| v.as_bool()); + + if expected_name != name { + println!("❌ Incorrect start tag (wanted: '{}', got '{}'", name, expected_name); + return Err(()); + } + + if expected_self_closing.is_some() && expected_self_closing.unwrap() != is_self_closing { + println!("❌ Incorrect start tag (expected selfclosing: {})", !is_self_closing); + return Err(()); + } + + if expected_attrs.is_none() && attributes.len() == 0 { + // No attributes to check + return Ok(()); + } + + // Convert the expected attr to Vec<(string, string)> + let expected_attrs: Vec<(String, String)> = expected_attrs.map_or(Vec::new(), |map| { + map.iter() + .filter_map(|(key, value)| { + value.as_str().map(|v| (key.clone(), v.to_string())) + }) + .collect() + }); + + let set1: HashSet<_> = expected_attrs.iter().collect(); + let set2: HashSet<_> = attributes.iter().collect(); + + if set1 != set2 { + println!("❌ Attributes mismatch"); + + for attr in expected_attrs { + println!(" * Want: '{}={}'", &attr.0, &attr.1); + } + for attr in attributes { + println!(" * Got: '{}={}'", attr.0, attr.1); + } + + return Err(()) + } + + Ok(()) +} + +fn check_match_comment(expected: &[Value], value: String, is_double_escaped: bool) -> Result<(), ()> { + let output_ref = expected.get(1).unwrap().as_str().unwrap(); + let output = if 
is_double_escaped { escape(output_ref) } else { output_ref.to_string() }; + + if value.ne(&output) { + println!("❌ Incorrect text found in comment token"); + println!(" wanted: '{}', got: '{}'", output, value.as_str()); + return Err(()); + } + + Ok(()) +} + +fn check_match_text(expected: &[Value], value: String, is_double_escaped: bool) -> Result<(), ()> { + let output_ref = expected.get(1).unwrap().as_str().unwrap(); + let output = if is_double_escaped { escape(output_ref) } else { output_ref.to_string() }; + + if value.ne(&output) { + println!("❌ Incorrect text found in text token"); + println!(" wanted: '{}', got: '{}'", output, value.as_str()); + return Err(()); + } + + Ok(()) +} + +fn check_match_endtag(expected: &[Value], name: String, is_double_escaped: bool) -> Result<(), ()> { + let output_ref = expected.get(1).unwrap().as_str().unwrap(); + let output = if is_double_escaped { escape(output_ref) } else { output_ref.to_string() }; + + if name.as_str() != output { + println!("❌ Incorrect end tag"); + return Err(()); + } + Ok(()) +} + +// Check if a given doctype matches the expected result +fn check_match_doctype( + expected: &[Value], + name: Option, + force_quirks: bool, + pub_identifier: Option, + sys_identifier: Option +) -> Result<(), ()> { + let expected_name = expected.get(1).unwrap().as_str(); + let expected_pub = expected.get(2).unwrap().as_str(); + let expected_sys = expected.get(3).unwrap().as_str(); + let expected_quirk = expected.get(4).unwrap().as_bool(); + + if expected_name.is_none() && ! 
name.is_none() { + println!("❌ Incorrect doctype (no name expected, but got '{}')", name.unwrap()); + return Err(()); + } + if expected_name.is_some() && name.is_none() { + println!("❌ Incorrect doctype (name expected, but got none)"); + return Err(()); + } + if expected_name.is_some() && expected_name != Some(name.clone().unwrap().as_str()) { + println!("❌ Incorrect doctype (wanted name: '{}', got: '{}')", expected_name.unwrap(), name.unwrap().as_str()); + return Err(()); + } + if expected_quirk.is_some() && expected_quirk.unwrap() == force_quirks { + println!("❌ Incorrect doctype (wanted quirk: '{}')", expected_quirk.unwrap()); + return Err(()); + } + if expected_pub != pub_identifier.as_deref() { + println!("❌ Incorrect doctype (wanted pub id: '{:?}', got '{:?}')", expected_pub, pub_identifier); + return Err(()); + } + if expected_sys != sys_identifier.as_deref() { + println!("❌ Incorrect doctype (wanted sys id: '{:?}', got '{:?}')", expected_sys, sys_identifier); + return Err(()); + } + + Ok(()) +} + +fn escape(input: &str) -> String { + let re = Regex::new(r"\\u([0-9a-fA-F]{4})").unwrap(); + re.replace_all(input, |caps: ®ex::Captures| { + let hex_val = u32::from_str_radix(&caps[1], 16).unwrap(); + + // This will also convert surrogates? 
+ unsafe { + char::from_u32_unchecked(hex_val).to_string() + } + }).into_owned() +} \ No newline at end of file diff --git a/src/html5_parser/consume_char_refs.rs b/src/html5_parser/consume_char_refs.rs index 50402bce0..d070d6ca7 100644 --- a/src/html5_parser/consume_char_refs.rs +++ b/src/html5_parser/consume_char_refs.rs @@ -1,285 +1,352 @@ +use crate::html5_parser::parse_errors::ParserError; use crate::html5_parser::token_named_characters::TOKEN_NAMED_CHARS; use crate::html5_parser::token_replacements::TOKEN_REPLACEMENTS; use crate::html5_parser::tokenizer::Tokenizer; +use crate::html5_parser::input_stream::Element; +use crate::read_char; -use super::tokenizer::CHAR_REPLACEMENT; - -// All references are to chapters in https://dev.w3.org/html5/spec-LC/tokenization.html - -impl<'a> Tokenizer<'a> { - // Consumes a character reference and places this in the tokenizer consume buffer - // ref: 8.2.4.69 Tokenizing character references - pub fn consume_character_reference( - &mut self, - additional_allowed_char: Option, - as_attribute: bool, - ) -> Option { - // self.clear_consume_buffer(); - - if as_attribute { - // When we are inside an attribute context, things (will/might) be different. Not sure how yet. - } +extern crate lazy_static; +use lazy_static::lazy_static; +use crate::html5_parser::input_stream::SeekMode::SeekCur; - let c = match self.stream.read_char() { - Some(c) => c, - None => { - return None; - } - }; - - // Characters that aren't allowed - let mut chars = vec![ - crate::html5_parser::tokenizer::CHAR_TAB, - crate::html5_parser::tokenizer::CHAR_LF, - crate::html5_parser::tokenizer::CHAR_FF, - crate::html5_parser::tokenizer::CHAR_SPACE, - '<', - '&', - ]; - - // The name is weird: addiitonal_allowed_chars, but it would be a char that is NOT allowed (?) 
- if additional_allowed_char.is_some() { - chars.push(additional_allowed_char.unwrap()) - } +use super::tokenizer::CHAR_REPLACEMENT; - if chars.contains(&c) { - self.stream.unread(); - return None; - } +// Different states for the character references +pub enum CcrState { + CharacterReferenceState, + NamedCharacterReferenceState, + AmbiguousAmpersandState, + NumericCharacterReferenceState, + HexadecimalCharacterReferenceStartState, + DecimalCharacterReferenceStartState, + HexadecimalCharacterReferenceState, + DecimalCharacterReferenceState, + NumericalCharacterReferenceEndState, +} - // Consume a number when we found &# - if c == '#' { - self.consume('&'); - self.consume(c); - if self.consume_number().is_err() { - self.stream.unread(); - return None; +macro_rules! consume_temp_buffer { + ($self:expr, $as_attribute:expr) => { + for c in $self.temporary_buffer.clone() { + if $as_attribute { + $self.current_attr_value.push(c); + } else { + $self.consume(c); } - - return Some(self.get_consumed_str()); - } - - // Consume anything else when we found & with another char after (ie: ») - self.stream.unread(); - if self.consume_entity(as_attribute).is_err() { - self.stream.unread(); - return None; } + $self.temporary_buffer.clear(); + }; +} - return Some(self.get_consumed_str()); - } - - // Consume a number like #x1234, #123 etc - fn consume_number(&mut self) -> Result { - let mut str_num = String::new(); - - // Save current position for easy recovery - let cp = self.stream.tell(); +impl<'a> Tokenizer<'a> { + // Consumes a character reference and places this in the tokenizer consume buffer + // ref: 8.2.4.69 Tokenizing character references - // Is the char a 'X' or 'x', then we must try and fetch hex digits, otherwise just 0..9 - let mut is_hex = false; - let hex = match self.stream.look_ahead(0) { - Some(hex) => hex, - None => { - return Err(String::new()); - } - }; + // @TODO: fix additional allowed char + pub fn consume_character_reference(&mut self, 
_additional_allowed_char: Option, as_attribute: bool) + { + let mut ccr_state = CcrState::CharacterReferenceState; + let mut char_ref_code: Option = Some(0); - if hex == 'x' || hex == 'X' { - is_hex = true; + loop { + match ccr_state { + CcrState::CharacterReferenceState => { + self.temporary_buffer = vec!['&']; + + let c = read_char!(self); + match c { + // Element::Eof => { + // consume_temp_buffer!(self, as_attribute); + // return + // }, + Element::Utf8('A'..='Z') | Element::Utf8('a'..='z') | Element::Utf8('0'..='9') => { + self.stream.unread(); + ccr_state = CcrState::NamedCharacterReferenceState; + }, + Element::Utf8('#') => { + self.temporary_buffer.push(c.utf8()); + ccr_state = CcrState::NumericCharacterReferenceState; + }, + _ => { + consume_temp_buffer!(self, as_attribute); + + self.stream.unread(); + return; + } + } + }, + CcrState::NamedCharacterReferenceState => { + if let Some(entity) = self.find_entity() { + + self.stream.seek(SeekCur, entity.len() as isize); + let c = self.stream.look_ahead(0); + if + as_attribute && + entity.chars().last().unwrap() != ';' && + c.is_utf8() && + (c.utf8() == '=' || c.utf8().is_ascii_alphanumeric()) + { + // for historical reasons, the codepoints should be flushed as is + for c in entity.chars() { + self.temporary_buffer.push(c); + } + + consume_temp_buffer!(self, as_attribute); + return; + } + + let entity_chars = *TOKEN_NAMED_CHARS.get(entity.as_str()).unwrap(); + + // Flush codepoints consumed as character reference + for c in entity_chars.chars() { + if as_attribute { + self.current_attr_value.push(c); + } else { + self.consume(c); + } + } + self.temporary_buffer.clear(); + + if entity.chars().last().unwrap() != ';' { + // We need to return the position where we expected the ';' + self.stream.read_char(); // @TODO: We can't use skip, as this might interfere with EOF stuff (fix it) + self.parse_error(ParserError::MissingSemicolonAfterCharacterReference); + self.stream.unread(); + } + + return; + } - // Consume the 
'x' character - let c = match self.stream.read_char() { - Some(c) => c, - None => { - self.stream.seek(cp); - return Err(String::new()); + consume_temp_buffer!(self, as_attribute); + ccr_state = CcrState::AmbiguousAmpersandState; } - }; - - self.consume(c); - }; - - let mut i = 0; - loop { - let c = match self.stream.read_char() { - Some(c) => c, - None => { - self.stream.seek(cp); - return Err(String::new()); + CcrState::AmbiguousAmpersandState => { + let c = read_char!(self); + match c { + // Element::Eof => return, + Element::Utf8('A'..='Z') | Element::Utf8('a'..='z') | Element::Utf8('0'..='9') => { + if as_attribute { + self.current_attr_value.push(c.utf8()); + } else { + self.consume(c.utf8()); + } + }, + Element::Utf8(';') => { + self.parse_error(ParserError::UnknownNamedCharacterReference); + self.stream.unread(); + return; + } + _ => { + self.stream.unread(); + return; + } + } } - }; - - if is_hex && c.is_ascii_hexdigit() { - str_num.push(c); - self.consume(c); - } else if !is_hex && c.is_ascii_digit() { - str_num.push(c); - self.consume(c); - } else { - self.stream.unread(); - break; - } - - i += 1; - } - - // Fetch next character - let c = match self.stream.read_char() { - Some(c) => c, - None => { - self.stream.seek(cp); - return Err(String::new()); - } - }; - - // Next character MUST be ; - if c != ';' { - self.parse_error("expected a ';'"); - self.stream.seek(cp); - return Err(String::new()); - } - - self.consume(c); + CcrState::NumericCharacterReferenceState => { + char_ref_code = Some(0); + + let c = read_char!(self); + match c { + // Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEndState, + Element::Utf8('X') | Element::Utf8('x') => { + self.temporary_buffer.push(c.utf8()); + ccr_state = CcrState::HexadecimalCharacterReferenceStartState; + } + _ => { + self.stream.unread(); + ccr_state = CcrState::DecimalCharacterReferenceStartState; + } + } + } + CcrState::HexadecimalCharacterReferenceStartState => { + let c = read_char!(self); + 
match c { + // Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEndState, + Element::Utf8('0'..='9') | Element::Utf8('A'..='F') | Element::Utf8('a'..='f') => { + self.stream.unread(); + ccr_state = CcrState::HexadecimalCharacterReferenceState + } + _ => { + self.parse_error(ParserError::AbsenceOfDigitsInNumericCharacterReference); + consume_temp_buffer!(self, as_attribute); + + self.stream.unread(); + return; + } + } + } + CcrState::DecimalCharacterReferenceStartState => { + let c = read_char!(self); + match c { + Element::Utf8('0'..='9') => { + self.stream.unread(); + ccr_state = CcrState::DecimalCharacterReferenceState; + } + _ => { + self.parse_error(ParserError::AbsenceOfDigitsInNumericCharacterReference); + consume_temp_buffer!(self, as_attribute); + + self.stream.unread(); + return; + } + } + } + CcrState::HexadecimalCharacterReferenceState => { + let c = read_char!(self); + match c { + // Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEndState, + Element::Utf8('0'..='9') => { + let i = c.utf8() as u32 - 0x30; + if let Some(value) = char_ref_code { + char_ref_code = value + .checked_mul(16) + .and_then(|mul_result| mul_result.checked_add(i)); + } + } + Element::Utf8('A'..='F') => { + let i = c.utf8() as u32 - 0x37; + if let Some(value) = char_ref_code { + char_ref_code = value + .checked_mul(16) + .and_then(|mul_result| mul_result.checked_add(i)); + } + } + Element::Utf8('a'..='f') => { + let i = c.utf8() as u32 - 0x57; + if let Some(value) = char_ref_code { + char_ref_code = value + .checked_mul(16) + .and_then(|mul_result| mul_result.checked_add(i)); + } + } + Element::Utf8(';') => { + ccr_state = CcrState::NumericalCharacterReferenceEndState; + } + _ => { + self.parse_error(ParserError::MissingSemicolonAfterCharacterReference); + self.stream.unread(); + ccr_state = CcrState::NumericalCharacterReferenceEndState; + } + } + } + CcrState::DecimalCharacterReferenceState => { + let c = read_char!(self); + match c { + // 
Element::Eof => ccr_state = CcrState::NumericalCharacterReferenceEndState, + Element::Utf8('0'..='9') => { + let i = c.utf8() as u32 - 0x30; + if let Some(value) = char_ref_code { + char_ref_code = value + .checked_mul(10) + .and_then(|mul_result| mul_result.checked_add(i)); + } + } + Element::Utf8(';') => { + ccr_state = CcrState::NumericalCharacterReferenceEndState; + } + _ => { + self.parse_error(ParserError::MissingSemicolonAfterCharacterReference); + self.stream.unread(); + ccr_state = CcrState::NumericalCharacterReferenceEndState; + } + } + } + CcrState::NumericalCharacterReferenceEndState => { + let overflow = char_ref_code.is_none(); + let mut char_ref_code = char_ref_code.unwrap_or(0); + + if char_ref_code == 0 && !overflow { + self.stream.read_char(); + self.parse_error(ParserError::NullCharacterReference); + char_ref_code = CHAR_REPLACEMENT as u32; + } - // If we found ;. we need to check how many digits we have parsed. It needs to be at least 1, - if i == 0 { - self.parse_error("didn't expect #;"); - self.stream.seek(cp); - return Err(String::new()); - } + if char_ref_code > 0x10FFFF || overflow { + self.stream.read_char(); + self.parse_error(ParserError::CharacterReferenceOutsideUnicodeRange); + self.stream.unread(); + char_ref_code = CHAR_REPLACEMENT as u32; + } - // check if we need to replace the character. First convert the number to a uint, and use that - // to check if it exists in the replacements table. 
- let num = match u32::from_str_radix(&*str_num, if is_hex { 16 } else { 10 }) { - Ok(n) => n, - Err(_) => 0, // lets pretend that an invalid value is set to 0 - }; - - if TOKEN_REPLACEMENTS.contains_key(&num) { - // self.clear_consume_buffer(); - self.consume(*TOKEN_REPLACEMENTS.get(&num).unwrap()); - return Ok(String::new()); - } + if self.is_surrogate(char_ref_code) { + self.stream.read_char(); + self.parse_error(ParserError::SurrogateCharacterReference); + self.stream.unread(); + char_ref_code = CHAR_REPLACEMENT as u32; + } + if self.is_noncharacter(char_ref_code) { + self.stream.read_char(); + self.parse_error(ParserError::NoncharacterCharacterReference); + self.stream.unread(); + // char_ref_code = CHAR_REPLACEMENT as u32; + } + if self.is_control_char(char_ref_code) || char_ref_code == 0x0D { + self.stream.read_char(); + self.stream.read_char(); + self.parse_error(ParserError::ControlCharacterReference); + // self.stream.unread(); + self.stream.unread(); + + if TOKEN_REPLACEMENTS.contains_key(&char_ref_code) { + char_ref_code = *TOKEN_REPLACEMENTS.get(&char_ref_code).unwrap() as u32; + } + } - // Next, check if we are in the 0xD800..0xDFFF or 0x10FFFF range, if so, replace - if (num > 0xD800 && num < 0xDFFF) || (num > 0x10FFFFF) { - self.parse_error("within reserved codepoint range, but replaced"); - // self.clear_consume_buffer(); - self.consume(crate::html5_parser::tokenizer::CHAR_REPLACEMENT); - return Ok(String::new()); - } + self.temporary_buffer = vec![char::from_u32(char_ref_code).unwrap_or(CHAR_REPLACEMENT)]; + consume_temp_buffer!(self, as_attribute); - // Check if it's in a reserved range, in that case, we ignore the data - if self.in_reserved_number_range(num) { - self.parse_error("within reserved codepoint range, ignored"); - // self.clear_consume_buffer(); - return Ok(String::new()); + return; + } + } } - - // self.clear_consume_buffer(); - self.consume(std::char::from_u32(num).unwrap_or(CHAR_REPLACEMENT)); - - return Ok(String::new()); } - // 
Returns if the given codepoint number is in a reserved range (as defined in - // https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference) - fn in_reserved_number_range(&self, codepoint: u32) -> bool { - if (0x1..=0x0008).contains(&codepoint) - || (0x000E..=0x001F).contains(&codepoint) - || (0x007F..=0x009F).contains(&codepoint) - || (0xFDD0..=0xFDEF).contains(&codepoint) - || (0x000E..=0x001F).contains(&codepoint) - || (0x000E..=0x001F).contains(&codepoint) - || (0x000E..=0x001F).contains(&codepoint) - || (0x000E..=0x001F).contains(&codepoint) - || (0x000E..=0x001F).contains(&codepoint) - || [ - 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, - 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, - 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, - 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 0x10FFFF, - ] - .contains(&codepoint) - { - return true; - } - - return false; + pub(crate) fn is_surrogate(&self, num: u32) -> bool + { + num >= 0xD800 && num <= 0xDFFF } - // This will consume an entity that does not start with &# (ie: » &#copy;) - fn consume_entity(&mut self, as_attribute: bool) -> Result { - // Processing is based on the golang.org/x/net/html package - - let mut capture = String::new(); - - loop { - let c = self.stream.read_char(); - match c { - Some(c) => { - capture.push(c); - - // If we captured [azAZ09], just continue the capture - if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' { - continue; - } + pub(crate) fn is_noncharacter(&self, num: u32) -> bool + { + (0xFDD0..=0xFDEF).contains(&num) || [ + 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, + 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, + 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, + 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE, 
0x10FFFF, + ].contains(&num) + } - break; - } - None => { - self.parse_error("unexpected end of stream"); - self.consume('&'); - self.consume_string(capture); - return Ok(String::new()); - } - } + pub(crate) fn is_control_char(&self, num: u32) -> bool + { + // White spaces are ok + if [0x0009, 0x000A, 0x000C, 0x000D, 0x0020].contains(&num) { + return false; } - // At this point, we have a consume buffer with the entity name in it. We need to check if it's a known entity - - if capture.len() == 0 { - // If we found nohting (ie: &;) - self.parse_error("expected entity name"); - return Err(String::new()); - - // } else if as_attribute { - // @TODO: implement this - // If we need to consume an entity as an attribute, we need to check if the next character is a valid - // attribute stuff - } else if TOKEN_NAMED_CHARS.contains_key(capture.as_str()) { - // If we found a known entity, we need to replace it - - let entity = TOKEN_NAMED_CHARS.get(capture.as_str()).unwrap(); - self.consume_string((*entity).to_string()); - return Ok(String::new()); - } else if !as_attribute { - // If we found some text, but it's not an entity. We decrease the text until we find something that matches an entity. - let mut max_len = capture.len(); - - // Largest entity is 6 chars. We don't need to check for more - if max_len > 6 { - max_len = 6; - } + return (0x0001..=0x001F).contains(&num) || (0x007F..=0x009F).contains(&num); + } - for j in (1..=max_len).rev() { - let substr: String = capture.chars().take(j).collect(); - if TOKEN_NAMED_CHARS.contains_key(substr.as_str()) { - let entity = TOKEN_NAMED_CHARS.get(substr.as_str()).unwrap(); - self.consume_string((*entity).to_string()); - self.consume_string(capture.chars().skip(j).collect()); - return Ok(String::new()); - } + // Finds the longest entity from the current position in the stream. Returns the entity + // replacement OR None when no entity has been found. 
+ fn find_entity(&mut self) -> Option { + let s= self.stream.look_ahead_slice(*LONGEST_ENTITY_LENGTH); + for i in (0..=s.len()).rev() { + if TOKEN_NAMED_CHARS.contains_key(&s[0..i]) { + // Move forward with the number of chars matching + // self.stream.skip(i); + return Some(String::from(&s[0..i])); } } - - self.consume('&'); - self.consume_string(capture.to_string()); - return Ok(String::new()); + None } } +lazy_static! { + // Returns the longest entity in the TOKEN_NAMED_CHARS map (this could be a const actually) + static ref LONGEST_ENTITY_LENGTH: usize = { + TOKEN_NAMED_CHARS.keys().map(|key| key.len()).max().unwrap_or(0) + }; +} + #[cfg(test)] mod tests { use super::*; @@ -294,7 +361,7 @@ mod tests { let mut is = InputStream::new(); is.read_from_str(input, None); - let mut tok = Tokenizer::new(&mut is); + let mut tok = Tokenizer::new(&mut is, None); let t = tok.next_token(); assert_eq!(expected, t.to_string()); } @@ -304,103 +371,103 @@ mod tests { entity_tests! { // Numbers - entity_0: (" ", "str[\n]") - entity_1: ("�", "str[�]") - entity_2: ("�", "str[�]") - entity_3: ("�", "str[�]") // replace with replacement char - entity_4: ("�", "str[�]") // replace with replacement char - entity_5: ("뻯", "str[뻯]") - entity_6: ("", "str[]") // reserved codepoint - entity_7: ("&#;", "str[&]") - entity_8: ("&;", "str[&;]") - entity_9: ("&", "str[&]") - entity_10: ("", "str[]") // reserved codepoint - entity_11: ("", "str[]") // reserved codepoint - entity_12: ("", "str[]") // reserved codepoint - entity_13: ("", "str[]") // reserved codepoint - entity_14: (" ", "str[\t]") - entity_15: ("", "str[]") // reserved codepoint - entity_16: ("﷐", "str[]") // reserved codepoint + entity_0: (" ", "\n") + entity_1: ("�", "�") + entity_2: ("�", "�") + entity_3: ("�", "�") // replace with replacement char + entity_4: ("�", "�") // replace with replacement char + entity_5: ("뻯", "뻯") + entity_6: ("", "�") // reserved codepoint + entity_7: ("&#;", "&#;") + entity_8: ("&;", "&;") + 
entity_9: ("&", "&") + entity_10: ("", "�") // reserved codepoint + entity_11: ("", "�") // reserved codepoint + entity_12: ("", "�") // reserved codepoint + entity_13: ("", "�") // reserved codepoint + entity_14: (" ", "\t") + entity_15: ("", "�") // reserved codepoint + entity_16: ("﷐", "�") // reserved codepoint // Entities - entity_100: ("©", "str[©]") - entity_101: ("©Thing;", "str[©Thing;]") - entity_102: ("»", "str[»]") - entity_103: ("«", "str[«]") - entity_104: ("¬", "str[¬]") - entity_105: ("¬it;", "str[¬it;]") - entity_106: ("∉", "str[∉]") - entity_107: ("&fo", "str[&fo]") - entity_108: ("&xxx", "str[&xxx]") - entity_109: ("©", "str[©]") - entity_110: ("© ", "str[© ]") - entity_111: ("©a", "str[©a]") - entity_112: ("©a;", "str[©a;]") - entity_113: ("©", "str[©]") - entity_114: ("©&", "str[©&]") - entity_115: ("©a ", "str[©a ]") - // entity_116: ("©X ", "str[&]") // What should this be? + entity_100: ("©", "©") + entity_101: ("©Thing;", "©Thing;") + entity_102: ("»", "»") + entity_103: ("«", "«") + entity_104: ("¬", "¬") + entity_105: ("¬it;", "¬it;") + entity_106: ("∉", "∉") + entity_107: ("&fo", "&fo") + entity_108: ("&xxx", "&xxx") + entity_109: ("©", "©") + entity_110: ("© ", "© ") + entity_111: ("©a", "©a") + entity_112: ("©a;", "©a;") + entity_113: ("©", "©") + // entity_114: ("©&", "©&") + entity_115: ("©a ", "©a ") + entity_116: ("©X ", "©X ") // ChatGPT generated tests - entity_200: ("©", "str[©]") - entity_201: ("© ", "str[© ]") - entity_202: ("©", "str[©]") - entity_203: ("©", "str[©]") - entity_204: ("<", "str[<]") - entity_205: ("&unknown;", "str[&unknown;]") - entity_206: ("<", "str[<]") - entity_207: ("<", "str[<]") - entity_208: ("&", "str[&]") - entity_209: ("€", "str[€]") - entity_210: (">", "str[>]") - entity_211: ("®", "str[®]") - entity_212: ("®", "str[®]") - entity_213: ("®", "str[®]") - entity_214: (""", "str[\"]") - entity_215: (""", "str[\"]") - entity_216: (""", "str[\"]") - entity_217: ("'", "str[']") - entity_218: ("'", 
"str[']") - entity_219: ("'", "str[']") - entity_220: ("!", "str[!]") - entity_221: ("!", "str[!]") - entity_222: ("#", "str[#]") - entity_223: ("#", "str[#]") - entity_224: ("$", "str[$]") - entity_225: ("$", "str[$]") - entity_226: ("%", "str[%]") - entity_227: ("%", "str[%]") - entity_228: ("*", "str[*]") - entity_229: ("*", "str[*]") - entity_230: ("+", "str[+]") - entity_231: ("+", "str[+]") - entity_232: (",", "str[,]") - entity_233: (",", "str[,]") - entity_234: ("−", "str[−]") - entity_235: ("-", "str[-]") - entity_236: (".", "str[.]") - entity_237: (".", "str[.]") - entity_238: ("/", "str[/]") - entity_239: ("/", "str[/]") - entity_240: (":", "str[:]") - entity_241: (":", "str[:]") - entity_242: (";", "str[;]") - entity_243: (";", "str[;]") - entity_244: ("=", "str[=]") - entity_245: ("=", "str[=]") - entity_246: ("?", "str[?]") - entity_247: ("?", "str[?]") - entity_248: ("@", "str[@]") - entity_249: ("@", "str[@]") - entity_250: ("©", "str[©]") - entity_251: ("€", "str[€]") - entity_252: ("Ÿ", "str[Ÿ]") - entity_253: ("", "str[]") - entity_254: ("�", "str[�]") - entity_255: ("�", "str[�]") - entity_256: ("&unknownchar;", "str[&unknownchar;]") - entity_257: ("�", "str[�]") - entity_259: (" ", "str[]") + entity_200: ("©", "©") + entity_201: ("© ", "© ") + entity_202: ("©", "©") + entity_203: ("©", "©") + entity_204: ("<", "<") + entity_205: ("&unknown;", "&unknown;") + entity_206: ("<", "<") + entity_207: ("<", "<") + entity_208: ("&", "&") + entity_209: ("€", "€") + entity_210: (">", ">") + entity_211: ("®", "®") + entity_212: ("®", "®") + entity_213: ("®", "®") + entity_214: (""", "\"") + entity_215: (""", "\"") + entity_216: (""", "\"") + entity_217: ("'", "'") + entity_218: ("'", "'") + entity_219: ("'", "'") + entity_220: ("!", "!") + entity_221: ("!", "!") + entity_222: ("#", "#") + entity_223: ("#", "#") + entity_224: ("$", "$") + entity_225: ("$", "$") + entity_226: ("%", "%") + entity_227: ("%", "%") + entity_228: ("*", "*") + entity_229: ("*", 
"*") + entity_230: ("+", "+") + entity_231: ("+", "+") + entity_232: (",", ",") + entity_233: (",", ",") + entity_234: ("−", "−") + entity_235: ("-", "-") + entity_236: (".", ".") + entity_237: (".", ".") + entity_238: ("/", "/") + entity_239: ("/", "/") + entity_240: (":", ":") + entity_241: (":", ":") + entity_242: (";", ";") + entity_243: (";", ";") + entity_244: ("=", "=") + entity_245: ("=", "=") + entity_246: ("?", "?") + entity_247: ("?", "?") + entity_248: ("@", "@") + entity_249: ("@", "@") + entity_250: ("©", "©") + entity_251: ("€", "€") + entity_252: ("Ÿ", "Ÿ") + entity_253: ("", "") + entity_254: ("�", "�") + entity_255: ("�", "�") + entity_256: ("&unknownchar;", "&unknownchar;") + entity_257: ("�", "�") + entity_259: (" ", "") } -} +} \ No newline at end of file diff --git a/src/html5_parser/input_stream.rs b/src/html5_parser/input_stream.rs index c3eba7c0c..c15f04574 100644 --- a/src/html5_parser/input_stream.rs +++ b/src/html5_parser/input_stream.rs @@ -1,6 +1,7 @@ use std::fs::File; use std::io; use std::io::Read; +use crate::html5_parser::tokenizer::{CHAR_LF, CHAR_CR}; // Encoding defines the way the buffer stream is read, as what defines a "character". 
#[derive(PartialEq)] @@ -19,15 +20,87 @@ pub enum Confidence { // Irrelevant // There is no content encoding for this stream } +#[derive(PartialEq, Debug, Copy, Clone)] +pub struct Position { + pub offset: usize, + pub line: usize, + pub col: usize, +} + +#[derive(PartialEq, Debug, Copy, Clone)] +pub enum Element { + Utf8(char), // Standard UTF character + Surrogate(u16), // Surrogate character (since they cannot be stored in ) + Eof, // End of stream +} + +impl Element { + pub fn is_eof(&self) -> bool { + match self { + Element::Eof => true, + _ => false, + } + } + + pub fn is_utf8(&self) -> bool { + match self { + Element::Utf8(_) => true, + _ => false, + } + } + + pub fn is_surrogate(&self) -> bool { + match self { + Element::Surrogate(_) => true, + _ => false, + } + } + + pub fn u32(&self) -> u32 { + match self { + Element::Utf8(c) => *c as u32, + Element::Surrogate(c) => *c as u32, + Element::Eof => 0, + } + } + + pub fn utf8(&self) -> char { + match self { + Element::Utf8(c) => *c, + Element::Surrogate(..) => 0x0000 as char, + Element::Eof => 0x0000 as char, + } + } + + pub fn to_string(&self) -> String { + match self { + Element::Utf8(ch) => ch.to_string(), + Element::Surrogate(surrogate) => format!("U+{:04X}", surrogate), // Or some other representation + Element::Eof => "EOF".to_string(), // Or an empty string + } + } +} + // HTML(5) input stream structure pub struct InputStream { - encoding: Encoding, // Current encoding - pub(crate) confidence: Confidence, // How confident are we that this is the correct encoding? 
- current: usize, // Current offset of the reader - length: usize, // Length (in bytes) of the buffer - buffer: Vec, // Reference to the actual buffer stream in characters - u8_buffer: Vec, // Reference to the actual buffer stream in u8 bytes - // If all things are ok, both buffer and u8_buffer should refer to the same memory location + pub encoding: Encoding, // Current encoding + pub confidence: Confidence, // How confident are we that this is the correct encoding? + + pub position: Position, // Current positions + pub length: usize, // Length (in chars) of the buffer + line_offsets: Vec, // Offsets of the given lines + + buffer: Vec, // Reference to the actual buffer stream in characters + u8_buffer: Vec, // Reference to the actual buffer stream in u8 bytes + // If all things are ok, both buffer and u8_buffer should refer to the same memory location (?) + + pub has_read_eof: bool, // True when we just read an EOF +} + +pub enum SeekMode { + SeekSet, // Seek from the start of the stream + SeekCur, // Seek from the current stream position + SeekEnd, // Seek (backwards) from the end of the stream } impl InputStream { @@ -36,10 +109,16 @@ impl InputStream { InputStream { encoding: Encoding::UTF8, confidence: Confidence::Tentative, - current: 0, + position: Position{ + offset: 0, + line: 1, + col: 1, + }, length: 0, + line_offsets: vec![0], // first line always starts at 0 buffer: Vec::new(), u8_buffer: Vec::new(), + has_read_eof: false, } } @@ -55,25 +134,90 @@ impl InputStream { // Returns true when the stream pointer is at the end of the stream pub fn eof(&self) -> bool { - self.current >= self.length + self.has_read_eof || self.position.offset as usize >= self.length } // Reset the stream reader back to the start pub fn reset(&mut self) { - self.current = 0 + self.position.offset = 0; + self.position.line = 1; + self.position.col = 1; } // Seek explicit offset in the stream (based on chars) - pub fn seek(&mut self, mut off: usize) { - if off > self.length { - off 
= self.length + pub fn seek(&mut self, mode: SeekMode, offset: isize) { + let abs_offset = match mode { + SeekMode::SeekSet => { + if offset.is_negative() { + 0 + } else { + offset as usize + } + } + SeekMode::SeekCur => { + if offset.is_negative() { + self.position.offset - offset.abs() as usize + } else { + self.position.offset + offset as usize + } + } + SeekMode::SeekEnd => { + // Both -5 and 5 on seek-end do the same thing + if offset.abs() > self.length as isize { + 0 + } else { + self.length - offset.abs() as usize + } + } + }; + + self.position = self.generate_position(abs_offset); + } + + pub fn get_previous_position(&mut self) -> Position { + + // if we are at the begining or the end of the stream, we just return the current position + if self.position.offset == 0 || self.has_read_eof { + return self.position; } - self.current = off + self.generate_position(self.position.offset - 1) + } + + // Generate a new position structure for given offset + fn generate_position(&mut self, abs_offset: usize) -> Position { + let mut abs_offset = abs_offset; + + // Cap to length if we read past the end of the stream + if abs_offset > self.length + 1 { + abs_offset = self.length; + self.has_read_eof = true; + } + + // Detect lines (if needed) + self.read_line_endings_until(abs_offset); + + let mut last_line: usize = 0; + let mut last_offset = self.line_offsets[last_line]; + for i in 0..self.line_offsets.len() { + if self.line_offsets[i] > abs_offset as usize { + break; + } + + last_line = i; + last_offset = self.line_offsets[last_line]; + } + + // Set position values + return Position{ + offset: abs_offset, + line: last_line + 1, + col: abs_offset - last_offset + 1, + } } pub fn tell(&self) -> usize { - self.current + self.position.offset as usize } // Set the given confidence of the input stream encoding @@ -96,20 +240,39 @@ impl InputStream { pub fn force_set_encoding(&mut self, e: Encoding) { match e { Encoding::UTF8 => { - // Convert the u8 buffer into utf8 string - 
let str_buf = std::str::from_utf8(&self.u8_buffer).unwrap(); + let str_buf; + unsafe { + str_buf = std::str::from_utf8_unchecked(&self.u8_buffer) + .replace("\u{000D}\u{000A}", "\u{000A}") + .replace("\u{000D}", "\u{000A}"); + } // Convert the utf8 string into characters so we can use easy indexing - self.buffer = str_buf.chars().collect(); + self.buffer = vec![]; + for c in str_buf.chars() { + + // // Check if we have a non-bmp character. This means it's above 0x10000 + // let cp = c as u32; + // if cp > 0x10000 && cp <= 0x10FFFF { + // let adjusted = cp - 0x10000; + // let lead = ((adjusted >> 10) & 0x3FF) as u16 + 0xD800; + // let trail = (adjusted & 0x3FF) as u16 + 0xDC00; + // self.buffer.push(Element::Surrogate(lead)); + // self.buffer.push(Element::Surrogate(trail)); + // continue; + // } + + if (0xD800..=0xDFFF).contains(&(c as u32)) { + self.buffer.push(Element::Surrogate(c as u16)); + } else { + self.buffer.push(Element::Utf8(c)); + } + } self.length = self.buffer.len(); } Encoding::ASCII => { // Convert the string into characters so we can use easy indexing. Any non-ascii chars (> 0x7F) are converted to '?' - self.buffer = self - .u8_buffer - .iter() - .map(|&byte| if byte.is_ascii() { byte as char } else { '?' }) - .collect(); + self.buffer = self.normalize_newlines_and_ascii(&self.u8_buffer); self.length = self.buffer.len(); } } @@ -117,12 +280,34 @@ impl InputStream { self.encoding = e; } + fn normalize_newlines_and_ascii(&self, buffer: &Vec) -> Vec { + let mut result = Vec::with_capacity(buffer.len()); + + for i in 0..buffer.len() { + if buffer[i] == CHAR_CR as u8 { + // convert CR to LF, or CRLF to LF + if i + 1 < buffer.len() && buffer[i + 1] == CHAR_LF as u8 { + continue; + } + result.push(Element::Utf8(CHAR_LF)); + } else if buffer[i] >= 0x80 { + // Convert high ascii to ? 
+ result.push(Element::Utf8('?')); + } else { + // everything else is ok + result.push(Element::Utf8(buffer[i] as char)) + } + } + + return result + } + // Populates the current buffer with the contents of given file f pub fn read_from_file(&mut self, mut f: File, e: Option) -> io::Result<()> { // First we read the u8 bytes into a buffer f.read_to_end(&mut self.u8_buffer).expect("uh oh"); self.force_set_encoding(e.unwrap_or(Encoding::UTF8)); - self.current = 0; + self.reset(); Ok(()) } @@ -130,49 +315,93 @@ impl InputStream { pub fn read_from_str(&mut self, s: &str, e: Option) { self.u8_buffer = Vec::from(s.as_bytes()); self.force_set_encoding(e.unwrap_or(Encoding::UTF8)); - self.current = 0; + self.reset(); } // Returns the number of characters left in the buffer pub(crate) fn chars_left(&self) -> usize { - self.length - self.current + self.length - self.position.offset } - // Reads a character and increases the current pointer - pub(crate) fn read_char(&mut self) -> Option { - if self.eof() { - return None; + // Reads a character and increases the current pointer, or read EOF as None + pub(crate) fn read_char(&mut self) -> Element { + // Return none if we already have read EOF + if self.has_read_eof { + return Element::Eof; } - let c = self.buffer[self.current]; - self.current += 1; - - return Some(c); + // If we still can move forward in the stream, move forwards + if self.position.offset < self.length { + let c = self.buffer[self.position.offset].clone(); + self.seek(SeekMode::SeekCur, 1); + return c; + } else { + // otherwise, we have reached the end of the stream + self.has_read_eof = true; + + self.seek(SeekMode::SeekEnd, 0); + + // // This is a kind of dummy position so the end of the files are read correctly. 
+ // self.position = Position{ + // offset: self.position.offset, + // line: self.position.line, + // col: self.position.col, + // }; + + return Element::Eof; + } } pub(crate) fn unread(&mut self) { - if self.current > 1 { - self.current -= 1; + // We already read eof, so "unread" the eof by unsetting the flag + if self.has_read_eof { + self.has_read_eof = false; + return; + } + + // If we can track back from the offset, we can do so + if self.position.offset > 0 { + self.seek(SeekMode::SeekCur, -1); } } + // Looks ahead in the stream and returns len characters + pub(crate) fn look_ahead_slice(&self, len: usize) -> String { + let end_pos = std::cmp::min(self.length, self.position.offset + len); + + let slice = &self.buffer[self.position.offset as usize..end_pos]; + slice.iter().map(|e| e.to_string()).collect() + } + // Looks ahead in the stream, can use an optional index if we want to seek further // (or back) in the stream. - // @TODO: idx can be pos or neg. But self.current is always positive. This clashes. 
- pub(crate) fn look_ahead(&self, idx: i32) -> Option { - let c = self.current as i32; - + pub(crate) fn look_ahead(&self, offset: usize) -> Element { // Trying to look after the stream - if c + idx > self.length as i32 { - return None; + if self.position.offset + offset >= self.length { + return Element::Eof; } - // Trying to look before the stream - if c + idx < 0 { - return None; - } + self.buffer[self.position.offset + offset] + } + + // Populates the line endings + fn read_line_endings_until(&mut self, abs_offset: usize) { + let mut last_offset = *self.line_offsets.last().unwrap(); + + while last_offset <= abs_offset as usize { + if last_offset >= self.length { + self.line_offsets.push(last_offset + 1); + break; + } + + // Check the next char to see if it's a '\n' + let c = self.buffer[last_offset].clone(); + if c == Element::Utf8('\n') { + self.line_offsets.push(last_offset + 1); + } - Some(self.buffer[(c + idx) as usize]) + last_offset += 1; + } } } @@ -194,36 +423,54 @@ mod test { assert_eq!(is.length, 3); assert_eq!(is.eof(), false); assert_eq!(is.chars_left(), 3); - assert_eq!(is.read_char().unwrap(), 'f'); + assert_eq!(is.read_char().utf8(), 'f'); assert_eq!(is.chars_left(), 2); assert_eq!(is.eof(), false); - assert_eq!(is.read_char().unwrap(), '👽'); + assert_eq!(is.read_char().utf8(), '👽'); assert_eq!(is.eof(), false); assert_eq!(is.chars_left(), 1); - assert_eq!(is.read_char().unwrap(), 'f'); + assert_eq!(is.read_char().utf8(), 'f'); assert_eq!(is.eof(), true); assert_eq!(is.chars_left(), 0); is.reset(); is.set_encoding(Encoding::ASCII); assert_eq!(is.length, 6); - assert_eq!(is.read_char().unwrap(), 'f'); - assert_eq!(is.read_char().unwrap(), '?'); - assert_eq!(is.read_char().unwrap(), '?'); - assert_eq!(is.read_char().unwrap(), '?'); - assert_eq!(is.read_char().unwrap(), '?'); - assert_eq!(is.read_char().unwrap(), 'f'); - assert_eq!(is.read_char(), None); - - is.unread(); - assert_eq!(is.chars_left(), 1); - is.unread(); + 
assert_eq!(is.read_char().utf8(), 'f'); + assert_eq!(is.read_char().utf8(), '?'); + assert_eq!(is.read_char().utf8(), '?'); + assert_eq!(is.read_char().utf8(), '?'); + assert_eq!(is.read_char().utf8(), '?'); + assert_eq!(is.read_char().utf8(), 'f'); + assert_eq!(is.read_char().is_eof(), true); + + is.unread(); // unread eof + is.unread(); // unread 'f' + is.unread(); // Unread '?' assert_eq!(is.chars_left(), 2); + is.unread(); + assert_eq!(is.chars_left(), 3); is.reset(); assert_eq!(is.chars_left(), 6); is.unread(); assert_eq!(is.chars_left(), 6); + + + is.read_from_str("abc", Some(Encoding::UTF8)); + is.reset(); + assert_eq!(is.read_char().utf8(), 'a'); + is.unread(); + assert_eq!(is.read_char().utf8(), 'a'); + assert_eq!(is.read_char().utf8(), 'b'); + is.unread(); + assert_eq!(is.read_char().utf8(), 'b'); + assert_eq!(is.read_char().utf8(), 'c'); + is.unread(); + assert_eq!(is.read_char().utf8(), 'c'); + assert_eq!(is.read_char().is_eof(), true); + is.unread(); + assert_eq!(is.read_char().is_eof(), true); } #[test] @@ -238,34 +485,178 @@ mod test { assert_eq!(is.is_certain_encoding(), false); } + #[test] + fn test_offsets() { + let mut is = InputStream::new(); + is.read_from_str("abc", Some(Encoding::UTF8)); + assert_eq!(is.position, Position{ offset: 0, line: 1, col: 1}); + assert_eq!('a', is.read_char().utf8()); + assert_eq!(is.position, Position{ offset: 1, line: 1, col: 2}); + assert_eq!('b', is.read_char().utf8()); + assert_eq!(is.position, Position{ offset: 2, line: 1, col: 3}); + assert_eq!('c', is.read_char().utf8()); + assert_eq!(is.position, Position{ offset: 3, line: 1, col: 4}); + assert_eq!(is.read_char().is_eof(), true); + assert_eq!(is.position, Position{ offset: 3, line: 1, col: 4}); + assert_eq!(is.read_char().is_eof(), true); + assert_eq!(is.position, Position{ offset: 3, line: 1, col: 4}); + + + let mut is = InputStream::new(); + is.read_from_str("abc\ndefg\n\nhi\njk\nlmno\n\n\npqrst\nu\nv\nw\n\nxy\nz", Some(Encoding::UTF8)); + 
assert_eq!(is.length, 40); + + is.seek(SeekMode::SeekSet, 0); + assert_eq!(is.position, Position{ offset: 0, line: 1, col: 1}); + let c = is.read_char(); + assert_eq!('a', c.utf8()); + assert_eq!(is.position, Position{ offset: 1, line: 1, col: 2}); + + is.seek(SeekMode::SeekSet, 7); + assert_eq!(is.position, Position{ offset: 7, line: 2, col: 4}); + assert_eq!(is.chars_left(), 33); + + let c = is.read_char(); + assert_eq!('g', c.utf8()); + assert_eq!(is.position, Position{ offset: 8, line: 2, col: 5}); + + let c = is.read_char(); + assert_eq!('\n', c.utf8()); + assert_eq!(is.position, Position{ offset: 9, line: 3, col: 1}); + + let c = is.read_char(); + assert_eq!('\n', c.utf8()); + assert_eq!(is.position, Position{ offset: 10, line: 4, col: 1}); + + let c = is.read_char(); + assert_eq!('h', c.utf8()); + assert_eq!(is.position, Position{ offset: 11, line: 4, col: 2}); + assert_eq!(is.chars_left(), 29); + + is.reset(); + assert_eq!(is.position, Position{ offset: 0, line: 1, col: 1}); + assert_eq!(is.chars_left(), 40); + + is.seek(SeekMode::SeekSet, 100); + assert_eq!(is.position, Position{ offset: 40, line: 15, col: 2}); + assert_eq!(is.chars_left(), 0); + } + #[test] fn test_seek() { let mut is = InputStream::new(); is.read_from_str("ab👽cd", Some(Encoding::UTF8)); assert_eq!(is.length, 5); assert_eq!(is.chars_left(), 5); - assert_eq!(is.read_char().unwrap(), 'a'); - assert_eq!(is.read_char().unwrap(), 'b'); + assert_eq!(is.read_char().utf8(), 'a'); + assert_eq!(is.read_char().utf8(), 'b'); assert_eq!(is.chars_left(), 3); - is.seek(0); + is.seek(SeekMode::SeekSet, 0); assert_eq!(is.chars_left(), 5); - assert_eq!(is.read_char().unwrap(), 'a'); - assert_eq!(is.read_char().unwrap(), 'b'); + assert_eq!(is.read_char().utf8(), 'a'); + assert_eq!(is.read_char().utf8(), 'b'); assert_eq!(is.chars_left(), 3); - is.seek(3); + is.seek(SeekMode::SeekSet, 3); assert_eq!(is.chars_left(), 2); - assert_eq!(is.read_char().unwrap(), 'c'); - assert_eq!(is.read_char().unwrap(), 'd'); + 
assert_eq!(is.read_char().utf8(), 'c'); + assert_eq!(is.read_char().utf8(), 'd'); assert_eq!(is.chars_left(), 0); assert_eq!(is.eof(), true); is.reset(); - assert_eq!(is.look_ahead(0).unwrap(), 'a'); - assert_eq!(is.look_ahead(3).unwrap(), 'c'); - assert_eq!(is.look_ahead(1).unwrap(), 'b'); - assert_eq!(is.look_ahead(100), None); - assert_eq!(is.look_ahead(-1), None); - is.seek(4); - assert_eq!(is.look_ahead(-1).unwrap(), 'c'); + assert_eq!(is.look_ahead(0).utf8(), 'a'); + assert_eq!(is.look_ahead(3).utf8(), 'c'); + assert_eq!(is.look_ahead(1).utf8(), 'b'); + assert_eq!(is.look_ahead(100).is_eof(), true); + + is.seek(SeekMode::SeekSet, 0); + assert_eq!(is.look_ahead_slice(1), "a"); + assert_eq!(is.look_ahead_slice(2), "ab"); + assert_eq!(is.look_ahead_slice(3), "ab👽"); + assert_eq!(is.look_ahead_slice(4), "ab👽c"); + assert_eq!(is.look_ahead_slice(5), "ab👽cd"); + assert_eq!(is.look_ahead_slice(6), "ab👽cd"); + assert_eq!(is.look_ahead_slice(100), "ab👽cd"); + + is.seek(SeekMode::SeekSet, 3); + assert_eq!(is.look_ahead_slice(1), "c"); + assert_eq!(is.look_ahead_slice(2), "cd"); + + + is.seek(SeekMode::SeekSet, 0); + assert_eq!(is.position.offset, 0); + + is.seek(SeekMode::SeekSet, 3); + assert_eq!(is.position.offset, 3); + + is.seek(SeekMode::SeekCur, 0); + assert_eq!(is.position.offset, 3); + + is.seek(SeekMode::SeekCur, 1); + assert_eq!(is.position.offset, 4); + + is.seek(SeekMode::SeekCur, -2); + assert_eq!(is.position.offset, 2); + + is.seek(SeekMode::SeekCur, 10); + assert_eq!(is.position.offset, 5); + + is.seek(SeekMode::SeekSet, 100); + assert_eq!(is.position.offset, 5); + + is.seek(SeekMode::SeekSet, -100); + assert_eq!(is.position.offset, 0); + + is.seek(SeekMode::SeekEnd, -100); + assert_eq!(is.position.offset, 0); + } + + #[test] + fn test_eof() { + let mut is = InputStream::new(); + is.read_from_str("abc", Some(Encoding::UTF8)); + assert_eq!(is.length, 3); + assert_eq!(is.chars_left(), 3); + assert_eq!(is.read_char().utf8(), 'a'); + 
assert_eq!(is.read_char().utf8(), 'b'); + assert_eq!(is.read_char().utf8(), 'c'); + assert_eq!(is.read_char().is_eof(), true); + assert_eq!(is.read_char().is_eof(), true); + assert_eq!(is.read_char().is_eof(), true); + assert_eq!(is.read_char().is_eof(), true); + is.unread(); + assert_eq!(is.read_char().is_eof(), true); + is.unread(); + is.unread(); + assert_eq!(is.read_char().is_eof(), false); + assert_eq!(is.read_char().is_eof(), true); + is.unread(); + is.unread(); + assert_eq!(is.read_char().is_eof(), false); + is.unread(); + is.unread(); + is.unread(); + assert_eq!(is.read_char().utf8(), 'a'); + is.unread(); + assert_eq!(is.read_char().utf8(), 'a'); + is.unread(); + is.unread(); + assert_eq!(is.read_char().utf8(), 'a'); + is.unread(); + is.unread(); + is.unread(); + is.unread(); + is.unread(); + is.unread(); + assert_eq!(is.read_char().utf8(), 'a'); + assert_eq!(is.read_char().utf8(), 'b'); + assert_eq!(is.read_char().utf8(), 'c'); + assert_eq!(is.read_char().is_eof(), true); + is.unread(); + is.unread(); + assert_eq!(is.read_char().utf8(), 'c'); + assert_eq!(is.read_char().is_eof(), true); + is.unread(); + assert_eq!(is.read_char().is_eof(), true); } } diff --git a/src/html5_parser/mod.rs b/src/html5_parser/mod.rs index 6542c7feb..85f9e9466 100644 --- a/src/html5_parser/mod.rs +++ b/src/html5_parser/mod.rs @@ -1,41 +1,13 @@ pub mod input_stream; +pub mod parser; +pub mod tokenizer; +pub mod token; +pub mod token_states; +pub mod parse_errors; + mod consume_char_refs; mod emitter; mod node; -mod token; mod token_named_characters; -mod token_replacements; -mod token_states; -mod tokenizer; - -use input_stream::InputStream; -use node::Node; -use tokenizer::Tokenizer; - -pub struct Html5Parser<'a> { - tokenizer: Tokenizer<'a>, -} - -impl<'a> Html5Parser<'a> { - // Creates a new parser object with the given input stream - pub fn new(stream: &'a mut InputStream) -> Self { - Html5Parser { - tokenizer: Tokenizer::new(stream), - } - } - - // Parses the input stream 
into a Node tree - pub fn parse(&mut self) -> Node { - // Tokenize stuff - - for _ in 1..=20 { - let t = self.tokenizer.next_token(); - println!("{}", t.to_string()); - } - - let mut n = Node::new("root"); - n.add_child(Node::new("child")); - return n; - } -} +mod token_replacements; \ No newline at end of file diff --git a/src/html5_parser/parse_errors.rs b/src/html5_parser/parse_errors.rs new file mode 100755 index 000000000..aaca71adc --- /dev/null +++ b/src/html5_parser/parse_errors.rs @@ -0,0 +1,107 @@ +pub enum ParserError { + AbruptDoctypePublicIdentifier, + AbruptDoctypeSystemIdentifier, + AbruptClosingOfEmptyComment, + AbsenceOfDigitsInNumericCharacterReference, + CdataInHtmlContent, + CharacterReferenceOutsideUnicodeRange, + ControlCharacterInInputStream, + ControlCharacterReference, + EndTagWithAttributes, + DuplicateAttribute, + EndTagWithTrailingSolidus, + EofBeforeTagName, + EofInCdata, + EofInComment, + EofInDoctype, + EofInScriptHtmlCommentLikeText, + EofInTag, + IncorrectlyClosedComment, + IncorrectlyOpenedComment, + InvalidCharacterSequenceAfterDoctypeName, + InvalidFirstCharacterOfTagName, + MissingAttributeValue, + MissingDoctypeName, + MissingDoctypePublicIdentifier, + MissingDoctypeSystemIdentifier, + MissingEndTagName, + MissingQuoteBeforeDoctypePublicIdentifier, + MissingQuoteBeforeDoctypeSystemIdentifier, + MissingSemicolonAfterCharacterReference, + MissingWhitespaceAfterDoctypePublicKeyword, + MissingWhitespaceAfterDoctypeSystemKeyword, + MissingWhitespaceBeforeDoctypeName, + MissingWhitespaceBetweenAttributes, + MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers, + NestedComment, + NoncharacterCharacterReference, + NoncharacterInInputStream, + NonVoidHtmlElementStartTagWithTrailingSolidus, + NullCharacterReference, + SurrogateCharacterReference, + SurrogateInInputStream, + UnexpectedCharacterAfterDoctypeSystemIdentifier, + UnexpectedCharacterInAttributeName, + UnexpectedCharacterInUnquotedAttributeValue, + 
UnexpectedEqualsSignBeforeAttributeName, + UnexpectedNullCharacter, + UnexpectedQuestionMarkInsteadOfTagName, + UnexpectedSolidusInTag, + UnknownNamedCharacterReference, +} + +impl ParserError { + pub fn as_str(&self) -> &'static str { + match self { + ParserError::AbruptDoctypePublicIdentifier => "abrupt-doctype-public-identifier", + ParserError::AbruptDoctypeSystemIdentifier => "abrupt-doctype-system-identifier", + ParserError::AbsenceOfDigitsInNumericCharacterReference => "absence-of-digits-in-numeric-character-reference", + ParserError::CdataInHtmlContent => "cdata-in-html-content", + ParserError::CharacterReferenceOutsideUnicodeRange => "character-reference-outside-unicode-range", + ParserError::ControlCharacterInInputStream => "control-character-in-input-stream", + ParserError::ControlCharacterReference => "control-character-reference", + ParserError::EndTagWithAttributes => "end-tag-with-attributes", + ParserError::DuplicateAttribute => "duplicate-attribute", + ParserError::EndTagWithTrailingSolidus => "end-tag-with-trailing-solidus", + ParserError::EofBeforeTagName => "eof-before-tag-name", + ParserError::EofInCdata => "eof-in-cdata", + ParserError::EofInComment => "eof-in-comment", + ParserError::EofInDoctype => "eof-in-doctype", + ParserError::EofInScriptHtmlCommentLikeText => "eof-in-script-html-comment-like-text", + ParserError::EofInTag => "eof-in-tag", + ParserError::IncorrectlyClosedComment => "incorrectly-closed-comment", + ParserError::IncorrectlyOpenedComment => "incorrectly-opened-comment", + ParserError::InvalidCharacterSequenceAfterDoctypeName => "invalid-character-sequence-after-doctype-name", + ParserError::InvalidFirstCharacterOfTagName => "invalid-first-character-of-tag-name", + ParserError::MissingAttributeValue => "missing-attribute-value", + ParserError::MissingDoctypeName => "missing-doctype-name", + ParserError::MissingDoctypePublicIdentifier => "missing-doctype-public-identifier", + ParserError::MissingDoctypeSystemIdentifier => 
"missing-doctype-system-identifier", + ParserError::MissingEndTagName => "missing-end-tag-name", + ParserError::MissingQuoteBeforeDoctypePublicIdentifier => "missing-quote-before-doctype-public-identifier", + ParserError::MissingQuoteBeforeDoctypeSystemIdentifier => "missing-quote-before-doctype-system-identifier", + ParserError::MissingSemicolonAfterCharacterReference => "missing-semicolon-after-character-reference", + ParserError::MissingWhitespaceAfterDoctypePublicKeyword => "missing-whitespace-after-doctype-public-keyword", + ParserError::MissingWhitespaceAfterDoctypeSystemKeyword => "missing-whitespace-after-doctype-system-keyword", + ParserError::MissingWhitespaceBeforeDoctypeName => "missing-whitespace-before-doctype-name", + ParserError::MissingWhitespaceBetweenAttributes => "missing-whitespace-between-attributes", + ParserError::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers => "missing-whitespace-between-doctype-public-and-system-identifiers", + ParserError::NestedComment => "nested-comment", + ParserError::NoncharacterCharacterReference => "noncharacter-character-reference", + ParserError::NoncharacterInInputStream => "noncharacter-in-input-stream", + ParserError::NonVoidHtmlElementStartTagWithTrailingSolidus => "non-void-html-element-start-tag-with-trailing-solidus", + ParserError::NullCharacterReference => "null-character-reference", + ParserError::SurrogateCharacterReference => "surrogate-character-reference", + ParserError::SurrogateInInputStream => "surrogate-in-input-stream", + ParserError::UnexpectedCharacterAfterDoctypeSystemIdentifier => "unexpected-character-after-doctype-system-identifier", + ParserError::UnexpectedCharacterInAttributeName => "unexpected-character-in-attribute-name", + ParserError::UnexpectedCharacterInUnquotedAttributeValue => "unexpected-character-in-unquoted-attribute-value", + ParserError::UnexpectedEqualsSignBeforeAttributeName => "unexpected-equals-sign-before-attribute-name", + 
ParserError::UnexpectedNullCharacter => "unexpected-null-character", + ParserError::UnexpectedQuestionMarkInsteadOfTagName => "unexpected-question-mark-instead-of-tag-name", + ParserError::UnexpectedSolidusInTag => "unexpected-solidus-in-tag", + ParserError::UnknownNamedCharacterReference => "unknown-named-character-reference", + ParserError::AbruptClosingOfEmptyComment => "abrupt-closing-of-empty-comment", + } + } +} diff --git a/src/html5_parser/parser.rs b/src/html5_parser/parser.rs new file mode 100755 index 000000000..31c899cf7 --- /dev/null +++ b/src/html5_parser/parser.rs @@ -0,0 +1,30 @@ +use crate::html5_parser::input_stream::InputStream; +use crate::html5_parser::node::Node; +use crate::html5_parser::tokenizer::Tokenizer; + +pub struct Html5Parser<'a> { + tokenizer: Tokenizer<'a>, +} + +impl<'a> Html5Parser<'a> { + // Creates a new parser object with the given input stream + pub fn new(stream: &'a mut InputStream) -> Self { + Html5Parser { + tokenizer: Tokenizer::new(stream, None), + } + } + + // Parses the input stream into a Node tree + pub fn parse(&mut self) -> Node { + // Tokenize stuff + + for _ in 1..=20 { + let t = self.tokenizer.next_token(); + println!("{}", t.to_string()); + } + + let mut n = Node::new("root"); + n.add_child(Node::new("child")); + return n; + } +} diff --git a/src/html5_parser/test_results.md b/src/html5_parser/test_results.md new file mode 100755 index 000000000..ab950d32d --- /dev/null +++ b/src/html5_parser/test_results.md @@ -0,0 +1,13 @@ +Almost all token tests (found in html5lib-test/tokenizer) will pass: + +🏁 Tests completed: Ran 6805 tests, 2770 assertions, 2748 succeeded, 22 failed (18 position failures) + +The failing test are due to the fact that rust-lang does not handle surrogate characters (0xD800-0xDFFF) in char values. +These values cannot exists on their own in a valid utf-8 string. 
+ +For instance: + +`, force_quirks: bool, pub_identifier: Option, sys_identifier: Option, @@ -33,6 +35,7 @@ pub enum Token { EofToken, } +// Each token can be displayed as a string impl std::fmt::Display for Token { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { match self { @@ -42,20 +45,20 @@ impl std::fmt::Display for Token { pub_identifier, sys_identifier, } => { - let mut result = format!("<{} ", name); + let mut result = format!("'); + result.push_str(" />"); write!(f, "{}", result) } - Token::CommentToken { value } => write!(f, "", value), + Token::CommentToken { value } => write!(f, "", value), Token::TextToken { value } => write!(f, "{}", value), Token::StartTagToken { name, @@ -83,6 +86,7 @@ pub trait TokenTrait { fn type_of(&self) -> TokenType; } +// Each token implements the TokenTrait and has a type_of that will return the tokentype. impl TokenTrait for Token { fn type_of(&self) -> TokenType { match self { diff --git a/src/html5_parser/token_named_characters.rs b/src/html5_parser/token_named_characters.rs index 335c2287d..61d6123ba 100644 --- a/src/html5_parser/token_named_characters.rs +++ b/src/html5_parser/token_named_characters.rs @@ -2143,97 +2143,97 @@ pub static TOKEN_NAMED_CHARS: phf::Map<&'static str, &'static str> = phf_map! 
{ "yen" => "\u{00A5}", "yuml" => "\u{00FF}", - "nLt;" =>"\u{226A}{20D2}", - "nGt;" =>"\u{226B}{20D2}", - "NotEqualTilde;" =>"\u{2242}{0338}", - "NotGreaterFullEqual;" =>"\u{2267}{0338}", - "NotGreaterGreater;" =>"\u{226B}{0338}", - "NotGreaterSlantEqual;" =>"\u{2A7E}{0338}", - "NotHumpDownHump;" =>"\u{224E}{0338}", - "NotHumpEqual;" =>"\u{224F}{0338}", - "NotLeftTriangleBar;" =>"\u{29CF}{0338}", - "NotLessLess;" =>"\u{226A}{0338}", - "NotLessSlantEqual;" =>"\u{2A7D}{0338}", - "NotNestedGreaterGreater;" =>"\u{2AA2}{0338}", - "NotNestedLessLess;" =>"\u{2AA1}{0338}", - "NotPrecedesEqual;" =>"\u{2AAF}{0338}", - "NotRightTriangleBar;" =>"\u{29D0}{0338}", - "NotSquareSubset;" =>"\u{228F}{0338}", - "NotSquareSuperset;" =>"\u{2290}{0338}", - "NotSubset;" =>"\u{2282}{20D2}", - "NotSucceedsEqual;" =>"\u{2AB0}{0338}", - "NotSucceedsTilde;" =>"\u{227F}{0338}", - "NotSuperset;" =>"\u{2283}{20D2}", - "ThickSpace;" =>"\u{205F}{200A}", - "acE;" =>"\u{223E}{0333}", - "bne;" =>"\u{003D}{20E5}", - "bnequiv;" =>"\u{2261}{20E5}", - "caps;" =>"\u{2229}{FE00}", - "cups;" =>"\u{222A}{FE00}", - "fjlig;" =>"\u{0066}{006A}", - "gesl;" =>"\u{22DB}{FE00}", - "gvertneqq;" =>"\u{2269}{FE00}", - "gvnE;" =>"\u{2269}{FE00}", - "lates;" =>"\u{2AAD}{FE00}", - "lesg;" =>"\u{22DA}{FE00}", - "lvertneqq;" =>"\u{2268}{FE00}", - "lvnE;" =>"\u{2268}{FE00}", - "nGg;" =>"\u{22D9}{0338}", - "nGtv;" =>"\u{226B}{0338}", - "nLl;" =>"\u{22D8}{0338}", - "nLtv;" =>"\u{226A}{0338}", - "nang;" =>"\u{2220}{20D2}", - "napE;" =>"\u{2A70}{0338}", - "napid;" =>"\u{224B}{0338}", - "nbump;" =>"\u{224E}{0338}", - "nbumpe;" =>"\u{224F}{0338}", - "ncongdot;" =>"\u{2A6D}{0338}", - "nedot;" =>"\u{2250}{0338}", - "nesim;" =>"\u{2242}{0338}", - "ngE;" =>"\u{2267}{0338}", - "ngeqq;" =>"\u{2267}{0338}", - "ngeqslant;" =>"\u{2A7E}{0338}", - "nges;" =>"\u{2A7E}{0338}", - "nlE;" =>"\u{2266}{0338}", - "nleqq;" =>"\u{2266}{0338}", - "nleqslant;" =>"\u{2A7D}{0338}", - "nles;" =>"\u{2A7D}{0338}", - "notinE;" =>"\u{22F9}{0338}", - 
"notindot;" =>"\u{22F5}{0338}", - "nparsl;" =>"\u{2AFD}{20E5}", - "npart;" =>"\u{2202}{0338}", - "npre;" =>"\u{2AAF}{0338}", - "npreceq;" =>"\u{2AAF}{0338}", - "nrarrc;" =>"\u{2933}{0338}", - "nrarrw;" =>"\u{219D}{0338}", - "nsce;" =>"\u{2AB0}{0338}", - "nsubE;" =>"\u{2AC5}{0338}", - "nsubset;" =>"\u{2282}{20D2}", - "nsubseteqq;" =>"\u{2AC5}{0338}", - "nsucceq;" =>"\u{2AB0}{0338}", - "nsupE;" =>"\u{2AC6}{0338}", - "nsupset;" =>"\u{2283}{20D2}", - "nsupseteqq;" =>"\u{2AC6}{0338}", - "nvap;" =>"\u{224D}{20D2}", - "nvge;" =>"\u{2265}{20D2}", - "nvgt;" =>"\u{003E}{20D2}", - "nvle;" =>"\u{2264}{20D2}", - "nvlt;" =>"\u{003C}{20D2}", - "nvltrie;" =>"\u{22B4}{20D2}", - "nvrtrie;" =>"\u{22B5}{20D2}", - "nvsim;" =>"\u{223C}{20D2}", - "race;" =>"\u{223D}{0331}", - "smtes;" =>"\u{2AAC}{FE00}", - "sqcaps;" =>"\u{2293}{FE00}", - "sqcups;" =>"\u{2294}{FE00}", - "varsubsetneq;" =>"\u{228A}{FE00}", - "varsubsetneqq;" =>"\u{2ACB}{FE00}", - "varsupsetneq;" =>"\u{228B}{FE00}", - "varsupsetneqq;" =>"\u{2ACC}{FE00}", - "vnsub;" =>"\u{2282}{20D2}", - "vnsup;" =>"\u{2283}{20D2}", - "vsubnE;" =>"\u{2ACB}{FE00}", - "vsubne;" =>"\u{228A}{FE00}", - "vsupnE;" =>"\u{2ACC}{FE00}", - "vsupne;" =>"\u{228B}{FE00}", + "nLt;" =>"\u{226A}\u{20D2}", + "nGt;" =>"\u{226B}\u{20D2}", + "NotEqualTilde;" =>"\u{2242}\u{0338}", + "NotGreaterFullEqual;" =>"\u{2267}\u{0338}", + "NotGreaterGreater;" =>"\u{226B}\u{0338}", + "NotGreaterSlantEqual;" =>"\u{2A7E}\u{0338}", + "NotHumpDownHump;" =>"\u{224E}\u{0338}", + "NotHumpEqual;" =>"\u{224F}\u{0338}", + "NotLeftTriangleBar;" =>"\u{29CF}\u{0338}", + "NotLessLess;" =>"\u{226A}\u{0338}", + "NotLessSlantEqual;" =>"\u{2A7D}\u{0338}", + "NotNestedGreaterGreater;" =>"\u{2AA2}\u{0338}", + "NotNestedLessLess;" =>"\u{2AA1}\u{0338}", + "NotPrecedesEqual;" =>"\u{2AAF}\u{0338}", + "NotRightTriangleBar;" =>"\u{29D0}\u{0338}", + "NotSquareSubset;" =>"\u{228F}\u{0338}", + "NotSquareSuperset;" =>"\u{2290}\u{0338}", + "NotSubset;" =>"\u{2282}\u{20D2}", + "NotSucceedsEqual;" 
=>"\u{2AB0}\u{0338}", + "NotSucceedsTilde;" =>"\u{227F}\u{0338}", + "NotSuperset;" =>"\u{2283}\u{20D2}", + "ThickSpace;" =>"\u{205F}\u{200A}", + "acE;" =>"\u{223E}\u{0333}", + "bne;" =>"\u{003D}\u{20E5}", + "bnequiv;" =>"\u{2261}\u{20E5}", + "caps;" =>"\u{2229}\u{FE00}", + "cups;" =>"\u{222A}\u{FE00}", + "fjlig;" =>"\u{0066}\u{006A}", + "gesl;" =>"\u{22DB}\u{FE00}", + "gvertneqq;" =>"\u{2269}\u{FE00}", + "gvnE;" =>"\u{2269}\u{FE00}", + "lates;" =>"\u{2AAD}\u{FE00}", + "lesg;" =>"\u{22DA}\u{FE00}", + "lvertneqq;" =>"\u{2268}\u{FE00}", + "lvnE;" =>"\u{2268}\u{FE00}", + "nGg;" =>"\u{22D9}\u{0338}", + "nGtv;" =>"\u{226B}\u{0338}", + "nLl;" =>"\u{22D8}\u{0338}", + "nLtv;" =>"\u{226A}\u{0338}", + "nang;" =>"\u{2220}\u{20D2}", + "napE;" =>"\u{2A70}\u{0338}", + "napid;" =>"\u{224B}\u{0338}", + "nbump;" =>"\u{224E}\u{0338}", + "nbumpe;" =>"\u{224F}\u{0338}", + "ncongdot;" =>"\u{2A6D}\u{0338}", + "nedot;" =>"\u{2250}\u{0338}", + "nesim;" =>"\u{2242}\u{0338}", + "ngE;" =>"\u{2267}\u{0338}", + "ngeqq;" =>"\u{2267}\u{0338}", + "ngeqslant;" =>"\u{2A7E}\u{0338}", + "nges;" =>"\u{2A7E}\u{0338}", + "nlE;" =>"\u{2266}\u{0338}", + "nleqq;" =>"\u{2266}\u{0338}", + "nleqslant;" =>"\u{2A7D}\u{0338}", + "nles;" =>"\u{2A7D}\u{0338}", + "notinE;" =>"\u{22F9}\u{0338}", + "notindot;" =>"\u{22F5}\u{0338}", + "nparsl;" =>"\u{2AFD}\u{20E5}", + "npart;" =>"\u{2202}\u{0338}", + "npre;" =>"\u{2AAF}\u{0338}", + "npreceq;" =>"\u{2AAF}\u{0338}", + "nrarrc;" =>"\u{2933}\u{0338}", + "nrarrw;" =>"\u{219D}\u{0338}", + "nsce;" =>"\u{2AB0}\u{0338}", + "nsubE;" =>"\u{2AC5}\u{0338}", + "nsubset;" =>"\u{2282}\u{20D2}", + "nsubseteqq;" =>"\u{2AC5}\u{0338}", + "nsucceq;" =>"\u{2AB0}\u{0338}", + "nsupE;" =>"\u{2AC6}\u{0338}", + "nsupset;" =>"\u{2283}\u{20D2}", + "nsupseteqq;" =>"\u{2AC6}\u{0338}", + "nvap;" =>"\u{224D}\u{20D2}", + "nvge;" =>"\u{2265}\u{20D2}", + "nvgt;" =>"\u{003E}\u{20D2}", + "nvle;" =>"\u{2264}\u{20D2}", + "nvlt;" =>"\u{003C}\u{20D2}", + "nvltrie;" =>"\u{22B4}\u{20D2}", + "nvrtrie;" 
=>"\u{22B5}\u{20D2}", + "nvsim;" =>"\u{223C}\u{20D2}", + "race;" =>"\u{223D}\u{0331}", + "smtes;" =>"\u{2AAC}\u{FE00}", + "sqcaps;" =>"\u{2293}\u{FE00}", + "sqcups;" =>"\u{2294}\u{FE00}", + "varsubsetneq;" =>"\u{228A}\u{FE00}", + "varsubsetneqq;" =>"\u{2ACB}\u{FE00}", + "varsupsetneq;" =>"\u{228B}\u{FE00}", + "varsupsetneqq;" =>"\u{2ACC}\u{FE00}", + "vnsub;" =>"\u{2282}\u{20D2}", + "vnsup;" =>"\u{2283}\u{20D2}", + "vsubnE;" =>"\u{2ACB}\u{FE00}", + "vsubne;" =>"\u{228A}\u{FE00}", + "vsupnE;" =>"\u{2ACC}\u{FE00}", + "vsupne;" =>"\u{228B}\u{FE00}", }; diff --git a/src/html5_parser/token_replacements.rs b/src/html5_parser/token_replacements.rs index 3c5575fa3..80a03567b 100644 --- a/src/html5_parser/token_replacements.rs +++ b/src/html5_parser/token_replacements.rs @@ -2,8 +2,8 @@ // https://dev.w3.org/html5/spec-LC/tokenization.html#consume-a-character-reference // If a character (#0x80; for instance) is found, it must be replaced by the given character pub static TOKEN_REPLACEMENTS: phf::Map = phf::phf_map! { - 0x00_u32 => '\u{FFFD}', - 0x0d_u32 => '\u{000D}', + // 0x00_u32 => '\u{FFFD}', + // 0x0d_u32 => '\u{000D}', 0x80_u32 => '\u{20AC}', 0x81_u32 => '\u{0081}', 0x82_u32 => '\u{201A}', diff --git a/src/html5_parser/token_states.rs b/src/html5_parser/token_states.rs index 7b31e685c..4e5e81304 100644 --- a/src/html5_parser/token_states.rs +++ b/src/html5_parser/token_states.rs @@ -1,5 +1,5 @@ // These are the states in which the tokenizer can be in. 
-#[derive(Debug)] +#[derive(Debug, Copy, Clone)] pub enum State { DataState, CharacterReferenceInDataState, @@ -24,6 +24,7 @@ pub enum State { ScriptDataEscapeStartDashState, ScriptDataEscapedState, ScriptDataEscapedDashState, + ScriptDataEscapedDashDashState, ScriptDataEscapedLessThanSignState, ScriptDataEscapedEndTagOpenState, ScriptDataEscapedEndTagNameState, @@ -35,6 +36,7 @@ pub enum State { ScriptDataDoubleEscapeEndState, BeforeAttributeNameState, AttributeNameState, + AfterAttributeNameState, BeforeAttributeValueState, AttributeValueDoubleQuotedState, AttributeValueSingleQuotedState, @@ -47,6 +49,10 @@ pub enum State { CommentStartState, CommentStartDashState, CommentState, + CommentLessThanSignState, + CommentLessThanSignBangState, + CommentLessThanSignBangDashState, + CommentLessThanSignBangDashDashState, CommentEndDashState, CommentEndState, CommentEndBangState, @@ -61,10 +67,12 @@ pub enum State { AfterDoctypePublicIdentifierState, BetweenDocTypePublicAndSystemIdentifiersState, AfterDocTypeSystemKeywordState, - BeforeDocTypeSystemIdentifiedState, + BeforeDocTypeSystemIdentifierState, DocTypeSystemIdentifierDoubleQuotedState, DocTypeSystemIdentifierSingleQuotedState, - AfterDocTypeSystemIdentifiedState, + AfterDocTypeSystemIdentifierState, BogusDocTypeState, CDataSectionState, + CDataSectionBracketState, + CDataSectionEndState, } diff --git a/src/html5_parser/tokenizer.rs b/src/html5_parser/tokenizer.rs index 4eab8b94c..4113775e9 100644 --- a/src/html5_parser/tokenizer.rs +++ b/src/html5_parser/tokenizer.rs @@ -1,136 +1,2160 @@ use crate::html5_parser::input_stream::InputStream; +use crate::html5_parser::input_stream::Element; +use crate::html5_parser::input_stream::SeekMode::SeekCur; +use crate::html5_parser::parse_errors::ParserError; use crate::html5_parser::token::Token; use crate::html5_parser::token_states::State; // Constants that are not directly captured as visible chars +pub const CHAR_NUL: char = '\u{0000}'; pub const CHAR_TAB: char = 
'\u{0009}'; pub const CHAR_LF: char = '\u{000A}'; +pub const CHAR_CR: char = '\u{000D}'; pub const CHAR_FF: char = '\u{000C}'; pub const CHAR_SPACE: char = '\u{0020}'; pub const CHAR_REPLACEMENT: char = '\u{FFFD}'; -// Errors produced by the tokenizer -#[derive(Debug)] -pub enum Error { - NullEncountered, -} - // The tokenizer will read the input stream and emit tokens that can be used by the parser. pub struct Tokenizer<'a> { - pub stream: &'a mut InputStream, // HTML character input stream - pub state: State, // Current state of the tokenizer - pub consumed: Vec, // Current consumed characters for current token - // pub emitter: &'a mut dyn Emitter, // Emitter trait that will emit the tokens during parsing + pub stream: &'a mut InputStream, // HTML character input stream + pub state: State, // Current state of the tokenizer + pub consumed: Vec, // Current consumed characters for current token + pub current_attr_name: String, // Current attribute name that we need to store temporary in case we are parsing attributes + pub current_attr_value: String, // Current attribute value that we need to store temporary in case we are parsing attributes + pub current_attrs: Vec<(String, String)>, // Current attributes + pub current_token: Option, // Token that is currently in the making (if any) + pub temporary_buffer: Vec, // Temporary buffer + pub token_queue: Vec, // Queue of emitted tokens. Needed because we can generate multiple tokens during iteration + pub errors: Vec, // Parse errors (if any) + pub last_start_token: String, // The last emitted start token (or empty if none) +} + +pub struct Options { + pub initial_state: State, // Sets the initial state of the tokenizer. Normally only needed when dealing with tests + pub last_start_tag: String, // Sets the last starting tag in the tokenizer. Normally only needed when dealing with tests +} + +#[macro_export] +macro_rules! 
read_char { + ($self:expr) => { + { + let mut c = $self.stream.read_char(); + match c { + Element::Surrogate(..) => { + $self.parse_error(ParserError::SurrogateInInputStream); + c = Element::Utf8(CHAR_REPLACEMENT); + } + Element::Utf8(c) if $self.is_control_char(c as u32) => { + $self.parse_error(ParserError::ControlCharacterInInputStream); + } + Element::Utf8(c) if $self.is_noncharacter(c as u32) => { + $self.parse_error(ParserError::NoncharacterInInputStream); + } + _ => {} + } + + c + } + } +} + +// Adds the given character to the current token's value (if applicable) +macro_rules! add_to_token_value { + ($self:expr, $c:expr) => { + match &mut $self.current_token { + Some(Token::CommentToken {value, ..}) => { + value.push($c); + } + _ => {}, + } + } +} + +macro_rules! set_public_identifier { + ($self:expr, $str:expr) => { + match &mut $self.current_token { + Some(Token::DocTypeToken { pub_identifier, ..}) => { + *pub_identifier = Some($str); + } + _ => {}, + } + } +} +macro_rules! add_public_identifier { + ($self:expr, $c:expr) => { + match &mut $self.current_token { + Some(Token::DocTypeToken { pub_identifier, ..}) => { + if let Some(pid) = pub_identifier { + pid.push($c); + } + } + _ => {}, + } + } +} + +macro_rules! set_system_identifier { + ($self:expr, $str:expr) => { + match &mut $self.current_token { + Some(Token::DocTypeToken { sys_identifier, ..}) => { + *sys_identifier = Some($str); + } + _ => {}, + } + } +} +macro_rules! add_system_identifier { + ($self:expr, $c:expr) => { + match &mut $self.current_token { + Some(Token::DocTypeToken { sys_identifier, ..}) => { + if let Some(sid) = sys_identifier { + sid.push($c); + } + } + _ => {}, + } + } +} + +// Adds the given character to the current token's name (if applicable) +macro_rules! 
add_to_token_name { + ($self:expr, $c:expr) => { + match &mut $self.current_token { + Some(Token::StartTagToken {name, ..}) => { + name.push($c); + } + Some(Token::EndTagToken {name, ..}) => { + name.push($c); + } + Some(Token::DocTypeToken {name, ..}) => { + // Doctype can have an optional name + match name { + Some(ref mut string) => string.push($c), + None => *name = Some($c.to_string()), + } + } + _ => {}, + } + } +} + +// Convert a character to lower case value (assumes character is in A-Z range) +macro_rules! to_lowercase { + // Converts A-Z to a-z + ($c:expr) => { + ((($c) as u8) + 0x20) as char + }; +} + +// Emits the current stored token +macro_rules! emit_current_token { + ($self:expr) => { + match $self.current_token { + None => {}, + _ => { + emit_token!($self, $self.current_token.as_ref().unwrap()); + } + }; + $self.current_token = None; + }; +} + +// Emits the given stored token. It does not have to be stored first. +macro_rules! emit_token { + ($self:expr, $token:expr) => { + // Save the start token name if we are pushing it. This helps us in detecting matching tags. + match $token { + Token::StartTagToken { name, .. } => { + $self.last_start_token = String::from(name); + }, + _ => {} + } + + // match $token { + // Token::EndTagToken { .. 
} => { + // if !$self.current_attrs.is_empty() { + // $self.parse_error(ParserError::EndTagWithAttributes); + // } + // } + // _ => {} + // } + + // If there is any consumed data, emit this first as a text token + if $self.has_consumed_data() { + $self.token_queue.push(Token::TextToken{ + value: $self.get_consumed_str(), + }); + $self.clear_consume_buffer(); + } + + $self.token_queue.push($token.clone()); + } +} + +// Parser error that defines an error (message) on the given position +#[derive(PartialEq)] +pub struct ParseError { + pub message: String, // Parse message + pub line: usize, // Line number of the error + pub col: usize, // Offset on line of the error + pub offset: usize, // Position of the error on the line } impl<'a> Tokenizer<'a> { - pub fn new(input: &'a mut InputStream /*, emitter: &'a mut dyn Emitter*/) -> Self { + // Creates a new tokenizer with the given inputstream and additional options if any + pub fn new(input: &'a mut InputStream /*, emitter: &'a mut dyn Emitter*/, opts: Option) -> Self { return Tokenizer { stream: input, - state: State::DataState, + state: opts.as_ref().map_or(State::DataState, |o| o.initial_state), + last_start_token: opts.as_ref().map_or(String::new(), |o| o.last_start_tag.clone()), consumed: vec![], - // emitter, + current_token: None, + token_queue: vec![], + current_attr_name: String::new(), + current_attr_value: String::new(), + current_attrs: vec![], + temporary_buffer: vec![], + errors: vec![], }; } // Retrieves the next token from the input stream or Token::EOF when the end is reached - pub(crate) fn next_token(&mut self) -> Token { + pub fn next_token(&mut self) -> Token { + self.consume_stream(); + + if self.token_queue.len() == 0 { + return Token::EofToken{}; + } + + return self.token_queue.remove(0); + } + + // Consumes the input stream. Continues until the stream is completed or a token has been generated. 
+ fn consume_stream(&mut self) { loop { - println!("state: {:?}", self.state); - println!("consumed: {:?}", self.consumed); + // Something is already in the token buffer, so we can return it. + if self.token_queue.len() > 0 { + return + } match self.state { State::DataState => { - let c = match self.stream.read_char() { - Some(c) => c, - None => { - self.parse_error("EOF"); - return Token::EofToken; + let c = read_char!(self); + match c { + Element::Utf8('&') => self.state = State::CharacterReferenceInDataState, + Element::Utf8('<') => self.state = State::TagOpenState, + Element::Utf8(CHAR_NUL) => { + self.consume(c.utf8()); + self.parse_error(ParserError::UnexpectedNullCharacter); + }, + Element::Eof => { + // EOF + if self.has_consumed_data() { + emit_token!(self, Token::TextToken { value: self.get_consumed_str() }); + self.clear_consume_buffer(); + } + emit_token!(self, Token::EofToken); + }, + _ => self.consume(c.utf8()), + } + } + State::CharacterReferenceInDataState => { + _ = self.consume_character_reference(None, false); + self.state = State::DataState; + } + State::RcDataState => { + let c = read_char!(self); + match c { + Element::Utf8('&') => { + self.state = State::CharacterReferenceInRcDataState + }, + Element::Utf8('<') => self.state = State::RcDataLessThanSignState, + Element::Eof => { + if self.has_consumed_data() { + emit_token!(self, Token::TextToken { value: self.get_consumed_str().clone() }); + self.clear_consume_buffer(); + } + emit_token!(self, Token::EofToken); + }, + Element::Utf8(CHAR_NUL) => { + self.consume(CHAR_REPLACEMENT); + self.parse_error(ParserError::UnexpectedNullCharacter); + }, + _ => self.consume(c.utf8()), + } + } + State::CharacterReferenceInRcDataState => { + // consume character reference + _ = self.consume_character_reference(None, false); + self.state = State::RcDataState; + } + State::RawTextState => { + let c = read_char!(self); + match c { + Element::Utf8('<') => self.state = State::RawTextLessThanSignState, + 
Element::Utf8(CHAR_NUL) => { + self.consume(CHAR_REPLACEMENT); + self.parse_error(ParserError::UnexpectedNullCharacter); + }, + Element::Eof => { + // EOF + if self.has_consumed_data() { + emit_token!(self, Token::TextToken { value: self.get_consumed_str() }); + self.clear_consume_buffer(); + } + emit_token!(self, Token::EofToken); + }, + _ => self.consume(c.utf8()), + } + } + State::ScriptDataState => { + let c = read_char!(self); + match c { + Element::Utf8('<') => self.state = State::ScriptDataLessThenSignState, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.consume(CHAR_REPLACEMENT); + }, + Element::Eof => { + if self.has_consumed_data() { + emit_token!(self, Token::TextToken { value: self.get_consumed_str().clone() }); + self.clear_consume_buffer(); + } + emit_token!(self, Token::EofToken); + }, + _ => self.consume(c.utf8()), + } + } + State::PlaintextState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.consume(CHAR_REPLACEMENT); + }, + Element::Eof => { + if self.has_consumed_data() { + emit_token!(self, Token::TextToken { value: self.get_consumed_str().clone() }); + self.clear_consume_buffer(); + } + emit_token!(self, Token::EofToken); + }, + _ => self.consume(c.utf8()), + } + } + State::TagOpenState => { + let c = read_char!(self); + match c { + Element::Utf8('!') => self.state = State::MarkupDeclarationOpenState, + Element::Utf8('/') => self.state = State::EndTagOpenState, + Element::Utf8(ch @ 'A'..='Z') => { + self.current_token = Some(Token::StartTagToken{ + name: "".into(), + is_self_closing: false, + attributes: vec![], + }); + + add_to_token_name!(self, to_lowercase!(ch)); + self.state = State::TagNameState; + }, + Element::Utf8(ch @ 'a'..='z') => { + self.current_token = Some(Token::StartTagToken{ + name: "".into(), + is_self_closing: false, + attributes: vec![], + }); + + add_to_token_name!(self, ch); + 
self.state = State::TagNameState; + } + Element::Utf8('?') => { + self.current_token = Some(Token::CommentToken{ + value: "".into(), + }); + self.parse_error(ParserError::UnexpectedQuestionMarkInsteadOfTagName); + self.stream.unread(); + self.state = State::BogusCommentState; } - }; + Element::Eof => { + self.parse_error(ParserError::EofBeforeTagName); + self.consume('<'); + self.state = State::DataState; + }, + _ => { + self.parse_error(ParserError::InvalidFirstCharacterOfTagName); + self.stream.unread(); + self.consume('<'); + self.state = State::DataState; + } + } + } + State::EndTagOpenState => { + let c = read_char!(self); + match c { + Element::Utf8(ch @ 'A'..='Z') => { + self.current_token = Some(Token::EndTagToken{ + name: "".into(), + }); + + add_to_token_name!(self, to_lowercase!(ch)); + self.state = State::TagNameState; + }, + Element::Utf8(ch @ 'a'..='z') => { + self.current_token = Some(Token::EndTagToken{ + name: "".into(), + }); + + add_to_token_name!(self, ch); + self.state = State::TagNameState; + }, + Element::Utf8('>') => { + self.parse_error(ParserError::MissingEndTagName); + self.state = State::DataState; + }, + Element::Eof => { + self.parse_error(ParserError::EofBeforeTagName); + self.consume('<'); + self.consume('/'); + self.state = State::DataState; + }, + _ => { + self.parse_error(ParserError::InvalidFirstCharacterOfTagName); + + self.current_token = Some(Token::CommentToken{ + value: "".into(), + }); + self.stream.unread(); + self.state = State::BogusCommentState; + } + } + } + State::TagNameState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => self.state = State::BeforeAttributeNameState, + Element::Utf8('/') => self.state = State::SelfClosingStartState, + Element::Utf8('>') => { + emit_current_token!(self); + self.state = State::DataState; + }, + Element::Utf8(ch @ 'A'..='Z') => add_to_token_name!(self, to_lowercase!(ch)), + 
Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + add_to_token_name!(self, CHAR_REPLACEMENT); + }, + Element::Eof => { + self.parse_error(ParserError::EofInTag); + self.state = State::DataState; + }, + _ => add_to_token_name!(self, c.utf8()), + } + } + State::RcDataLessThanSignState => { + let c = read_char!(self); + match c { + Element::Utf8('/') => { + self.temporary_buffer = vec![]; + self.state = State::RcDataEndTagOpenState; + }, + _ => { + self.consume('<'); + self.stream.unread(); + self.state = State::RcDataState; + }, + } + } + State::RcDataEndTagOpenState => { + let c = read_char!(self); + match c { + Element::Utf8(ch @ 'A'..='Z') => { + self.current_token = Some(Token::EndTagToken{ + name: "".into(), + }); + self.temporary_buffer.push(to_lowercase!(ch)); + self.state = State::RcDataEndTagNameState; + }, + Element::Utf8(ch @ 'a'..='z') => { + self.current_token = Some(Token::EndTagToken{ + name: "".into(), + }); + self.temporary_buffer.push(ch); + self.state = State::RcDataEndTagNameState; + } + _ => { + self.consume('<'); + self.consume('/'); + self.stream.unread(); + self.state = State::RcDataState; + }, + } + } + State::RcDataEndTagNameState => { + let c = read_char!(self); + + // we use this flag because a lot of matches will actually do the same thing + let mut consume_anything_else = false; match c { - '&' => self.state = State::CharacterReferenceInDataState, - '<' => self.state = State::TagOpenState, - '\u{0000}' => { - self.parse_error("NUL encountered in stream"); + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + if self.is_appropriate_end_token(&self.temporary_buffer) { + self.state = State::BeforeAttributeNameState; + } else { + consume_anything_else = true; + } + }, + Element::Utf8('/') => { + if self.is_appropriate_end_token(&self.temporary_buffer) { + self.state = State::SelfClosingStartState; + } else { + consume_anything_else = true; + 
} + }, + Element::Utf8('>') => { + if self.is_appropriate_end_token(&self.temporary_buffer) { + let s: String = self.temporary_buffer.iter().collect::(); + self.set_name_in_current_token(s); + + self.last_start_token = String::new(); + emit_current_token!(self); + self.state = State::DataState; + } else { + consume_anything_else = true; + } + }, + Element::Utf8(ch @ 'A'..='Z') => { + self.temporary_buffer.push(to_lowercase!(ch)); + } + Element::Utf8(ch @ 'a'..='z') => { + self.temporary_buffer.push(ch); } - _ => self.consume(c), + _ => { + consume_anything_else = true; + }, + } + + if consume_anything_else { + self.consume('<'); + self.consume('/'); + for c in self.temporary_buffer.clone() { + self.consume(c); + } + self.temporary_buffer.clear(); + + self.stream.unread(); + self.state = State::RcDataState; } } - State::CharacterReferenceInDataState => { - // consume character reference - self.consume_character_reference(None, false); - self.state = State::DataState; + State::RawTextLessThanSignState => { + let c = read_char!(self); + match c { + Element::Utf8('/') => { + self.temporary_buffer = vec![]; + self.state = State::RawTextEndTagOpenState; + }, + _ => { + self.consume('<'); + self.stream.unread(); + self.state = State::RawTextState; + }, + } + } + State::RawTextEndTagOpenState => { + let c = read_char!(self); + match c { + Element::Utf8(ch @ 'A'..='Z') => { + self.current_token = Some(Token::EndTagToken{ + name: "".into(), + }); + // add_to_token_name!(self, to_lowercase!(ch)); + self.temporary_buffer.push(to_lowercase!(ch)); + self.state = State::RawTextEndTagNameState; + }, + Element::Utf8(ch @ 'a'..='z') => { + self.current_token = Some(Token::EndTagToken{ + name: "".into(), + }); + // add_to_token_name!(self, ch); + self.temporary_buffer.push(ch); + self.state = State::RawTextEndTagNameState; + } + _ => { + self.consume('<'); + self.consume('/'); + self.stream.unread(); + self.state = State::RawTextState; + }, + } + } + State::RawTextEndTagNameState => 
{ + let c = read_char!(self); + + // we use this flag because a lot of matches will actually do the same thing + let mut consume_anything_else = false; + + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + if self.is_appropriate_end_token(&self.temporary_buffer) { + self.state = State::BeforeAttributeNameState; + } else { + consume_anything_else = true; + } + }, + Element::Utf8('/') => { + if self.is_appropriate_end_token(&self.temporary_buffer) { + self.state = State::SelfClosingStartState; + } else { + consume_anything_else = true; + } + }, + Element::Utf8('>') => { + if self.is_appropriate_end_token(&self.temporary_buffer) { + let s: String = self.temporary_buffer.iter().collect::(); + self.set_name_in_current_token(s); + self.last_start_token = String::new(); + emit_current_token!(self); + self.state = State::DataState; + } else { + consume_anything_else = true; + } + }, + Element::Utf8(ch @ 'A'..='Z') => { + // add_to_token_name!(self, to_lowercase!(ch)); + self.temporary_buffer.push(to_lowercase!(ch)); + } + Element::Utf8(ch @ 'a'..='z') => { + // add_to_token_name!(self, ch); + self.temporary_buffer.push(ch); + } + _ => { + consume_anything_else = true; + }, + } + + if consume_anything_else { + self.consume('<'); + self.consume('/'); + for c in self.temporary_buffer.clone() { + self.consume(c); + } + self.temporary_buffer.clear(); + + self.stream.unread(); + self.state = State::RawTextState; + } + } + State::ScriptDataLessThenSignState => { + let c = read_char!(self); + match c { + Element::Utf8('/') => { + self.temporary_buffer = vec![]; + self.state = State::ScriptDataEndTagOpenState; + }, + Element::Utf8('!') => { + self.consume('<'); + self.consume('!'); + self.state = State::ScriptDataEscapeStartState; + }, + _ => { + self.consume('<'); + self.stream.unread(); + self.state = State::ScriptDataState; + }, + } + } + State::ScriptDataEndTagOpenState => { + let c = read_char!(self); 
+                    if c.is_eof() {
+                        self.consume('<');
+                        self.consume('/');
+                        self.stream.unread();
+                        self.state = State::ScriptDataState;
+                        continue;
+                    }
+
+                    if c.utf8().is_ascii_alphabetic() {
+                        self.current_token = Some(Token::EndTagToken{
+                            name: "".into(),
+                        });
+
+                        self.stream.unread();
+                        self.state = State::ScriptDataEndTagNameState;
+                    } else {
+                        self.consume('<');
+                        self.consume('/');
+                        self.stream.unread();
+                        self.state = State::ScriptDataState;
+                    }
+                }
+                // Script data end tag name: ASCII letters are pushed (lowercased) onto
+                // `temporary_buffer`. Whitespace, '/' and '>' only act as tag syntax when
+                // `is_appropriate_end_token` accepts the buffered name. In every other
+                // case '<', '/' and the buffered characters are consumed, the buffer is
+                // cleared, the current character is unread and the state becomes
+                // ScriptDataState.
+                State::ScriptDataEndTagNameState => {
+                    let c = read_char!(self);
+
+                    // Several arms share the same fallback, so they only raise this flag
+                    // and the fallback itself is written once below the match.
+                    let mut consume_anything_else = false;
+
+                    match c {
+                        Element::Utf8(CHAR_TAB) |
+                        Element::Utf8(CHAR_LF) |
+                        Element::Utf8(CHAR_FF) |
+                        Element::Utf8(CHAR_SPACE) => {
+                            if self.is_appropriate_end_token(&self.temporary_buffer) {
+                                self.state = State::BeforeAttributeNameState;
+                            } else {
+                                consume_anything_else = true;
+                            }
+                        },
+                        Element::Utf8('/') => {
+                            if self.is_appropriate_end_token(&self.temporary_buffer) {
+                                self.state = State::SelfClosingStartState;
+                            } else {
+                                consume_anything_else = true;
+                            }
+                        },
+                        Element::Utf8('>') => {
+                            if self.is_appropriate_end_token(&self.temporary_buffer) {
+                                // The `String` annotation selects the collect target.
+                                let s: String = self.temporary_buffer.iter().collect();
+                                self.set_name_in_current_token(s);
+
+                                self.last_start_token = String::new();
+                                emit_current_token!(self);
+                                self.state = State::DataState;
+                            } else {
+                                consume_anything_else = true;
+                            }
+                        },
+                        Element::Utf8(ch @ 'A'..='Z') => {
+                            self.temporary_buffer.push(to_lowercase!(ch));
+                        }
+                        Element::Utf8(ch @ 'a'..='z') => {
+                            self.temporary_buffer.push(ch);
+                        }
+                        _ => {
+                            consume_anything_else = true;
+                        },
+                    }
+
+                    if consume_anything_else {
+                        self.consume('<');
+                        self.consume('/');
+                        for c in self.temporary_buffer.clone() {
+                            self.consume(c);
+                        }
+                        self.temporary_buffer.clear();
+
+                        self.stream.unread();
+                        self.state = State::ScriptDataState;
+                    }
+                }
+                State::ScriptDataEscapeStartState => {
+                    let c = read_char!(self);
+                    match c {
+
Element::Utf8('-') => { + self.consume('-'); + self.state = State::ScriptDataEscapeStartDashState; + }, + _ => { + self.stream.unread(); + self.state = State::ScriptDataState; + }, + } + } + State::ScriptDataEscapeStartDashState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.consume('-'); + self.state = State::ScriptDataEscapedDashDashState; + }, + _ => { + self.stream.unread(); + self.state = State::ScriptDataState; + }, + } + } + State::ScriptDataEscapedState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.consume('-'); + self.state = State::ScriptDataEscapedDashState; + }, + Element::Utf8('<') => { + self.state = State::ScriptDataEscapedLessThanSignState; + }, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.consume(CHAR_REPLACEMENT); + }, + Element::Eof => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + self.state = State::DataState; + }, + _ => { + self.consume(c.utf8()); + }, + } + } + State::ScriptDataEscapedDashState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.consume('-'); + self.state = State::ScriptDataEscapedDashDashState; + }, + Element::Utf8('<') => { + self.state = State::ScriptDataEscapedLessThanSignState; + }, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.consume(CHAR_REPLACEMENT); + self.state = State::ScriptDataEscapedState; + }, + Element::Eof => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + self.state = State::DataState; + }, + _ => { + self.stream.unread(); + self.state = State::ScriptDataEscapedState; + }, + } + } + State::ScriptDataEscapedDashDashState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.consume('-'); + }, + Element::Utf8('<') => { + self.state = State::ScriptDataEscapedLessThanSignState; + }, + Element::Utf8('>') => { + self.consume('>'); + self.state = 
State::ScriptDataState; + } + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.consume(CHAR_REPLACEMENT); + self.state = State::ScriptDataEscapedState; + }, + Element::Eof => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + self.state = State::DataState; + }, + _ => { + self.stream.unread(); + self.state = State::ScriptDataEscapedState; + }, + } + } + State::ScriptDataEscapedLessThanSignState => { + let c = read_char!(self); + match c { + Element::Utf8('/') => { + self.temporary_buffer = vec![]; + self.state = State::ScriptDataEscapedEndTagOpenState; + }, + _ => { + if c.is_utf8() && c.utf8().is_ascii_alphabetic() { + self.temporary_buffer = vec![]; + self.consume('<'); + self.stream.unread(); + self.state = State::ScriptDataDoubleEscapeStartState; + continue; + } + // anything else + self.consume('<'); + self.stream.unread(); + self.state = State::ScriptDataEscapedState; + }, + } + } + State::ScriptDataEscapedEndTagOpenState => { + let c = read_char!(self); + + if c.is_utf8() && c.utf8().is_ascii_alphabetic() { + self.current_token = Some(Token::EndTagToken{ + name: "".into(), + }); + + self.stream.unread(); + self.state = State::ScriptDataEscapedEndTagNameState; + continue; + } + + // anything else + self.consume('<'); + self.consume('/'); + self.stream.unread(); + self.state = State::ScriptDataEscapedState; } - State::RcDataState => {} - State::CharacterReferenceInRcDataState => {} - State::RawTextState => {} - State::ScriptDataState => {} - State::PlaintextState => {} - State::TagOpenState => {} - State::EndTagOpenState => {} - State::TagNameState => {} - State::RcDataLessThanSignState => {} - State::RcDataEndTagOpenState => {} - State::RcDataEndTagNameState => {} - State::RawTextLessThanSignState => {} - State::RawTextEndTagOpenState => {} - State::RawTextEndTagNameState => {} - State::ScriptDataLessThenSignState => {} - State::ScriptDataEndTagOpenState => {} - State::ScriptDataEndTagNameState => 
{} - State::ScriptDataEscapeStartState => {} - State::ScriptDataEscapeStartDashState => {} - State::ScriptDataEscapedState => {} - State::ScriptDataEscapedDashState => {} - State::ScriptDataEscapedLessThanSignState => {} - State::ScriptDataEscapedEndTagOpenState => {} - State::ScriptDataEscapedEndTagNameState => {} - State::ScriptDataDoubleEscapeStartState => {} - State::ScriptDataDoubleEscapedState => {} - State::ScriptDataDoubleEscapedDashState => {} - State::ScriptDataDoubleEscapedDashDashState => {} - State::ScriptDataDoubleEscapedLessThanSignState => {} - State::ScriptDataDoubleEscapeEndState => {} - State::BeforeAttributeNameState => {} - State::AttributeNameState => {} - State::BeforeAttributeValueState => {} - State::AttributeValueDoubleQuotedState => {} - State::AttributeValueSingleQuotedState => {} - State::AttributeValueUnquotedState => {} - State::CharacterReferenceInAttributeValueState => {} - State::AfterAttributeValueQuotedState => {} - State::SelfClosingStartState => {} - State::BogusCommentState => {} - State::MarkupDeclarationOpenState => {} - State::CommentStartState => {} - State::CommentStartDashState => {} - State::CommentState => {} - State::CommentEndDashState => {} - State::CommentEndState => {} - State::CommentEndBangState => {} - State::DocTypeState => {} - State::BeforeDocTypeNameState => {} - State::DocTypeNameState => {} - State::AfterDocTypeNameState => {} - State::AfterDocTypePublicKeywordState => {} - State::BeforeDocTypePublicIdentifierState => {} - State::DocTypePublicIdentifierDoubleQuotedState => {} - State::DocTypePublicIdentifierSingleQuotedState => {} - State::AfterDoctypePublicIdentifierState => {} - State::BetweenDocTypePublicAndSystemIdentifiersState => {} - State::AfterDocTypeSystemKeywordState => {} - State::BeforeDocTypeSystemIdentifiedState => {} - State::DocTypeSystemIdentifierDoubleQuotedState => {} - State::DocTypeSystemIdentifierSingleQuotedState => {} - State::AfterDocTypeSystemIdentifiedState => {} - 
State::BogusDocTypeState => {}
-                State::CDataSectionState => {}
-            }
-        }
-
-        // return Token::Error{error: Error::EndOfStream, span: String::from("")}
+                // Script data escaped end tag name: ASCII letters are pushed (lowercased)
+                // onto `temporary_buffer`. Whitespace, '/' and '>' only act as tag syntax
+                // when `is_appropriate_end_token` accepts the buffered name. In every
+                // other case '<', '/' and the buffered characters are consumed, the
+                // buffer is cleared, the current character is unread and the state
+                // becomes ScriptDataEscapedState.
+                State::ScriptDataEscapedEndTagNameState => {
+                    let c = read_char!(self);
+
+                    // Several arms share the same fallback, so they only raise this flag
+                    // and the fallback itself is written once below the match.
+                    let mut consume_anything_else = false;
+
+                    match c {
+                        Element::Utf8(CHAR_TAB) |
+                        Element::Utf8(CHAR_LF) |
+                        Element::Utf8(CHAR_FF) |
+                        Element::Utf8(CHAR_SPACE) => {
+                            if self.is_appropriate_end_token(&self.temporary_buffer) {
+                                self.state = State::BeforeAttributeNameState;
+                            } else {
+                                consume_anything_else = true;
+                            }
+                        },
+                        Element::Utf8('/') => {
+                            if self.is_appropriate_end_token(&self.temporary_buffer) {
+                                self.state = State::SelfClosingStartState;
+                            } else {
+                                consume_anything_else = true;
+                            }
+                        },
+                        Element::Utf8('>') => {
+                            if self.is_appropriate_end_token(&self.temporary_buffer) {
+                                // The `String` annotation selects the collect target.
+                                let s: String = self.temporary_buffer.iter().collect();
+                                self.set_name_in_current_token(s);
+
+                                self.last_start_token = String::new();
+                                emit_current_token!(self);
+                                self.state = State::DataState;
+                            } else {
+                                consume_anything_else = true;
+                            }
+                        },
+                        Element::Utf8(ch @ 'A'..='Z') => {
+                            self.temporary_buffer.push(to_lowercase!(ch));
+                        }
+                        Element::Utf8(ch @ 'a'..='z') => {
+                            self.temporary_buffer.push(ch);
+                        }
+                        _ => {
+                            consume_anything_else = true;
+                        },
+                    }
+
+                    if consume_anything_else {
+                        self.consume('<');
+                        self.consume('/');
+                        for c in self.temporary_buffer.clone() {
+                            self.consume(c);
+                        }
+                        self.temporary_buffer.clear();
+
+                        self.stream.unread();
+                        self.state = State::ScriptDataEscapedState;
+                    }
+                }
+                // Script data double escape start: the buffered name decides whether a
+                // delimiter character moves to the double-escaped or the escaped state.
+                State::ScriptDataDoubleEscapeStartState => {
+                    let c = read_char!(self);
+                    match c {
+                        Element::Utf8(CHAR_TAB) |
+                        Element::Utf8(CHAR_LF) |
+                        Element::Utf8(CHAR_FF) |
+                        Element::Utf8(CHAR_SPACE) |
+                        Element::Utf8('/') |
+                        Element::Utf8('>') => {
+                            if self.temporary_buffer.iter().collect::<String>().eq("script") {
+                                self.state = State::ScriptDataDoubleEscapedState;
+                            } else
{ + self.state = State::ScriptDataEscapedState; + } + self.consume(c.utf8()); + } + Element::Utf8(ch @ 'A'..='Z') => { + self.temporary_buffer.push(to_lowercase!(ch)); + self.consume(ch); + }, + Element::Utf8(ch @ 'a'..='z') => { + self.temporary_buffer.push(ch); + self.consume(ch); + }, + _ => { + self.stream.unread(); + self.state = State::ScriptDataEscapedState; + } + } + }, + State::ScriptDataDoubleEscapedState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.consume('-'); + self.state = State::ScriptDataDoubleEscapedDashState; + } + Element::Utf8('<') => { + self.consume('<'); + self.state = State::ScriptDataDoubleEscapedLessThanSignState; + }, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.consume(CHAR_REPLACEMENT); + }, + Element::Eof => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + self.state = State::DataState; + } + _ => self.consume(c.utf8()), + } + } + State::ScriptDataDoubleEscapedDashState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.state = State::ScriptDataDoubleEscapedDashDashState; + self.consume('-'); + } + Element::Utf8('<') => { + self.state = State::ScriptDataDoubleEscapedLessThanSignState; + self.consume('<'); + }, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.consume(CHAR_REPLACEMENT); + self.state = State::ScriptDataDoubleEscapedState; + }, + Element::Eof => { + self.parse_error(ParserError::EofInScriptHtmlCommentLikeText); + self.state = State::DataState; + } + _ => { + self.consume(c.utf8()); + self.state = State::ScriptDataDoubleEscapedState; + }, + } + } + State::ScriptDataDoubleEscapedDashDashState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => self.consume('-'), + Element::Utf8('<') => { + self.consume('<'); + self.state = State::ScriptDataDoubleEscapedLessThanSignState; + }, + Element::Utf8('>') => { + self.consume('>'); + 
self.state = State::ScriptDataState;
+                        },
+                        Element::Utf8(CHAR_NUL) => {
+                            self.parse_error(ParserError::UnexpectedNullCharacter);
+                            self.consume(CHAR_REPLACEMENT);
+                            self.state = State::ScriptDataDoubleEscapedState;
+                        },
+                        Element::Eof => {
+                            self.parse_error(ParserError::EofInScriptHtmlCommentLikeText);
+                            self.state = State::DataState;
+                        },
+                        _ => {
+                            self.consume(c.utf8());
+                            self.state = State::ScriptDataDoubleEscapedState;
+                        },
+                    }
+                }
+                // '/' resets the temporary buffer and starts matching a possible
+                // closing "script" name; anything else is unread and handled again in
+                // the double-escaped state.
+                State::ScriptDataDoubleEscapedLessThanSignState => {
+                    let c = read_char!(self);
+                    match c {
+                        Element::Utf8('/') => {
+                            self.temporary_buffer = vec![];
+                            self.consume('/');
+                            self.state = State::ScriptDataDoubleEscapeEndState;
+                        }
+                        _ => {
+                            self.stream.unread();
+                            self.state = State::ScriptDataDoubleEscapedState;
+                        },
+                    }
+                }
+                // Script data double escape end: ASCII letters are consumed and also
+                // pushed (lowercased) onto `temporary_buffer`. On a delimiter the
+                // buffered name decides the next state: "script" leads to the escaped
+                // state, anything else stays double-escaped.
+                State::ScriptDataDoubleEscapeEndState => {
+                    let c = read_char!(self);
+                    match c {
+                        Element::Utf8(CHAR_TAB) |
+                        Element::Utf8(CHAR_LF) |
+                        Element::Utf8(CHAR_FF) |
+                        Element::Utf8(CHAR_SPACE) |
+                        Element::Utf8('/') |
+                        Element::Utf8('>') => {
+                            if self.temporary_buffer.iter().collect::<String>().eq("script") {
+                                self.state = State::ScriptDataEscapedState;
+                            } else {
+                                self.state = State::ScriptDataDoubleEscapedState;
+                            }
+                            self.consume(c.utf8());
+                        }
+                        Element::Utf8(ch @ 'A'..='Z') => {
+                            self.temporary_buffer.push(to_lowercase!(ch));
+                            self.consume(ch);
+                        },
+                        Element::Utf8(ch @ 'a'..='z') => {
+                            self.temporary_buffer.push(ch);
+                            self.consume(ch);
+                        },
+                        _ => {
+                            self.stream.unread();
+                            self.state = State::ScriptDataDoubleEscapedState;
+                        }
+                    }
+                }
+                State::BeforeAttributeNameState => {
+                    let c = read_char!(self);
+                    match c {
+                        Element::Utf8(CHAR_TAB) |
+                        Element::Utf8(CHAR_LF) |
+                        Element::Utf8(CHAR_FF) |
+                        Element::Utf8(CHAR_SPACE) => {
+                            // Ignore character
+                        },
+                        Element::Utf8('/') | Element::Utf8('>') | Element::Eof => {
+                            self.stream.unread();
+                            self.state = State::AfterAttributeNameState;
+                        },
+                        Element::Utf8('=') => {
+                            self.parse_error(ParserError::UnexpectedEqualsSignBeforeAttributeName);
+
+
self.store_and_clear_current_attribute(); + self.current_attr_name.push(c.utf8()); + + self.state = State::AttributeNameState; + } + _ => { + // Store an existing attribute if any and clear + self.store_and_clear_current_attribute(); + + self.stream.unread(); + self.state = State::AttributeNameState; + }, + } + } + State::AttributeNameState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) | + Element::Utf8('/') | + Element::Utf8('>') | + Element::Eof => { + if self.attr_already_exists() { + self.parse_error(ParserError::DuplicateAttribute); + } + self.stream.unread(); + + self.state = State::AfterAttributeNameState + }, + Element::Utf8('=') => { + if self.attr_already_exists() { + self.parse_error(ParserError::DuplicateAttribute); + } + self.state = State::BeforeAttributeValueState + }, + Element::Utf8(ch @ 'A'..='Z') => { + self.current_attr_name.push(to_lowercase!(ch)); + }, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.current_attr_name.push(CHAR_REPLACEMENT); + }, + Element::Utf8('"') | Element::Utf8('\'') | Element::Utf8('<') => { + self.parse_error(ParserError::UnexpectedCharacterInAttributeName); + self.current_attr_name.push(c.utf8()); + }, + _ => self.current_attr_name.push(c.utf8()), + } + } + State::AfterAttributeNameState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + // Ignore + }, + Element::Utf8('/') => self.state = State::SelfClosingStartState, + Element::Utf8('=') => self.state = State::BeforeAttributeValueState, + Element::Utf8('>') => { + self.store_and_clear_current_attribute(); + self.add_stored_attributes_to_current_token(); + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInTag); + self.state = 
State::DataState; + }, + _ => { + self.store_and_clear_current_attribute(); + self.stream.unread(); + self.state = State::AttributeNameState; + }, + } + }, + State::BeforeAttributeValueState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + // Ignore + }, + Element::Utf8('"') => self.state = State::AttributeValueDoubleQuotedState, + Element::Utf8('\'') => { + self.state = State::AttributeValueSingleQuotedState; + } + Element::Utf8('>') => { + self.parse_error(ParserError::MissingAttributeValue); + + self.store_and_clear_current_attribute(); + self.add_stored_attributes_to_current_token(); + emit_current_token!(self); + self.state = State::DataState; + }, + _ => { + self.stream.unread(); + self.state = State::AttributeValueUnquotedState; + }, + } + } + State::AttributeValueDoubleQuotedState => { + let c = read_char!(self); + match c { + Element::Utf8('"') => self.state = State::AfterAttributeValueQuotedState, + Element::Utf8('&') => _ = self.consume_character_reference(Some(Element::Utf8('"')), true), + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.current_attr_value.push(CHAR_REPLACEMENT); + }, + Element::Eof => { + self.parse_error(ParserError::EofInTag); + self.state = State::DataState; + }, + _ => { + self.current_attr_value.push(c.utf8()); + }, + } + } + State::AttributeValueSingleQuotedState => { + let c = read_char!(self); + match c { + Element::Utf8('\'') => self.state = State::AfterAttributeValueQuotedState, + Element::Utf8('&') => _ = self.consume_character_reference(Some(Element::Utf8('\'')), true), + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.current_attr_value.push(CHAR_REPLACEMENT); + }, + Element::Eof => { + self.parse_error(ParserError::EofInTag); + self.state = State::DataState; + }, + _ => { + self.current_attr_value.push(c.utf8()); + }, + } 
+ } + State::AttributeValueUnquotedState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + self.state = State::BeforeAttributeNameState; + }, + Element::Utf8('&') => _ = self.consume_character_reference(Some(Element::Utf8('>')), true), + Element::Utf8('>') => { + self.store_and_clear_current_attribute(); + self.add_stored_attributes_to_current_token(); + emit_current_token!(self); + self.state = State::DataState; + }, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.current_attr_value.push(CHAR_REPLACEMENT); + }, + Element::Utf8('"') | Element::Utf8('\'') | Element::Utf8('<') | Element::Utf8('=') | Element::Utf8('`') => { + self.parse_error(ParserError::UnexpectedCharacterInUnquotedAttributeValue); + self.current_attr_value.push(c.utf8()); + } + Element::Eof => { + self.parse_error(ParserError::EofInTag); + self.state = State::DataState; + }, + _ => { + self.current_attr_value.push(c.utf8()); + }, + } + + } + // State::CharacterReferenceInAttributeValueState => {} + State::AfterAttributeValueQuotedState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => self.state = State::BeforeAttributeNameState, + Element::Utf8('/') => self.state = State::SelfClosingStartState, + Element::Utf8('>') => { + self.store_and_clear_current_attribute(); + self.add_stored_attributes_to_current_token(); + emit_current_token!(self); + self.state = State::DataState; + }, + Element::Eof => { + self.parse_error(ParserError::EofInTag); + self.state = State::DataState; + }, + _ => { + self.parse_error(ParserError::MissingWhitespaceBetweenAttributes); + self.stream.unread(); + self.state = State::BeforeAttributeNameState; + }, + } + } + State::SelfClosingStartState => { + let c = read_char!(self); + match c { + Element::Utf8('>') => 
{
+                            self.set_is_closing_in_current_token(true);
+                            self.store_and_clear_current_attribute();
+                            self.add_stored_attributes_to_current_token();
+                            emit_current_token!(self);
+                            self.state = State::DataState;
+                        }
+                        Element::Eof => {
+                            self.parse_error(ParserError::EofInTag);
+                            self.state = State::DataState;
+                        },
+                        _ => {
+                            self.parse_error(ParserError::UnexpectedSolidusInTag);
+                            self.stream.unread();
+                            self.state = State::BeforeAttributeNameState;
+                        },
+                    }
+                }
+                // Bogus comment: characters are appended to the current token's value
+                // until '>' or EOF, at which point the token is emitted.
+                State::BogusCommentState => {
+                    let c = read_char!(self);
+                    match c {
+                        Element::Utf8('>') => {
+                            emit_current_token!(self);
+                            self.state = State::DataState;
+                        },
+                        Element::Eof => {
+                            emit_current_token!(self);
+                            self.state = State::DataState;
+                        },
+                        Element::Utf8(CHAR_NUL) => {
+                            self.parse_error(ParserError::UnexpectedNullCharacter);
+                            add_to_token_value!(self, CHAR_REPLACEMENT);
+                        }
+                        _ => {
+                            add_to_token_value!(self, c.utf8());
+                        },
+                    }
+                }
+                // Markup declaration open: decided purely by look-ahead, trying "--",
+                // then "DOCTYPE" (ASCII case-insensitive), then "[CDATA[".
+                State::MarkupDeclarationOpenState => {
+                    if self.stream.look_ahead_slice(2) == "--" {
+                        self.current_token = Some(Token::CommentToken{
+                            value: "".into(),
+                        });
+
+                        // Skip the two -- signs
+                        self.stream.seek(SeekCur, 2);
+
+                        self.state = State::CommentStartState;
+                        continue;
+                    }
+
+                    // ASCII-only comparison: no temporary upper-cased String and no
+                    // Unicode case mapping.
+                    if self.stream.look_ahead_slice(7).eq_ignore_ascii_case("DOCTYPE") {
+                        self.stream.seek(SeekCur, 7);
+                        self.state = State::DocTypeState;
+                        continue;
+                    }
+
+                    if self.stream.look_ahead_slice(7) == "[CDATA[" {
+                        self.stream.seek(SeekCur, 7);
+
+                        // @TODO: If there is an adjusted current node and it is not an element in the HTML namespace,
+                        // then switch to the CDATA section state. Otherwise, this is a cdata-in-html-content parse error.
+                        // Create a comment token whose data is the "[CDATA[" string. Switch to the bogus comment state.
+ self.parse_error(ParserError::CdataInHtmlContent); + self.current_token = Some(Token::CommentToken{ + value: "[CDATA[".into(), + }); + + self.state = State::BogusCommentState; + continue; + } + + self.stream.seek(SeekCur, 1); + self.parse_error(ParserError::IncorrectlyOpenedComment); + self.stream.unread(); + self.current_token = Some(Token::CommentToken{ + value: "".into(), + }); + + self.state = State::BogusCommentState; + } + State::CommentStartState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.state = State::CommentStartDashState; + } + Element::Utf8('>') => { + self.parse_error(ParserError::AbruptClosingOfEmptyComment); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + self.stream.unread(); + self.state = State::CommentState; + }, + } + } + State::CommentStartDashState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.state = State::CommentEndState; + } + Element::Utf8('>') => { + self.parse_error(ParserError::AbruptClosingOfEmptyComment); + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInComment); + emit_current_token!(self); + self.state = State::DataState; + }, + _ => { + add_to_token_value!(self, '-'); + self.stream.unread(); + self.state = State::CommentState; + }, + } + } + State::CommentState => { + let c = read_char!(self); + match c { + Element::Utf8('<') => { + add_to_token_value!(self, c.utf8()); + self.state = State::CommentLessThanSignState; + } + Element::Utf8('-') => self.state = State::CommentEndDashState, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + add_to_token_value!(self, CHAR_REPLACEMENT); + } + Element::Eof => { + self.parse_error(ParserError::EofInComment); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + add_to_token_value!(self, c.utf8()); + }, + } + } + State::CommentLessThanSignState => { + let c = 
read_char!(self); + match c { + Element::Utf8('!') => { + add_to_token_value!(self, c.utf8()); + self.state = State::CommentLessThanSignBangState; + }, + Element::Utf8('<') => { + add_to_token_value!(self, c.utf8()); + }, + _ => { + self.stream.unread(); + self.state = State::CommentState; + }, + } + }, + State::CommentLessThanSignBangState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.state = State::CommentLessThanSignBangDashState; + }, + _ => { + self.stream.unread(); + self.state = State::CommentState; + }, + } + }, + State::CommentLessThanSignBangDashState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.state = State::CommentLessThanSignBangDashDashState; + }, + _ => { + self.stream.unread(); + self.state = State::CommentEndDashState; + }, + } + }, + State::CommentLessThanSignBangDashDashState => { + let c = read_char!(self); + match c { + Element::Eof | Element::Utf8('>') => { + self.stream.unread(); + self.state = State::CommentEndState; + }, + _ => { + self.parse_error(ParserError::NestedComment); + self.stream.unread(); + self.state = State::CommentEndState; + }, + } + }, + State::CommentEndDashState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + self.state = State::CommentEndState; + }, + Element::Eof => { + self.parse_error(ParserError::EofInComment); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + add_to_token_value!(self, '-'); + self.stream.unread(); + self.state = State::CommentState; + }, + } + } + State::CommentEndState => { + let c = read_char!(self); + match c { + Element::Utf8('>') => { + emit_current_token!(self); + self.state = State::DataState; + }, + Element::Utf8('!') => self.state = State::CommentEndBangState, + Element::Utf8('-') => add_to_token_value!(self, '-'), + Element::Eof => { + self.parse_error(ParserError::EofInComment); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + 
add_to_token_value!(self, '-'); + add_to_token_value!(self, '-'); + self.stream.unread(); + self.state = State::CommentState; + } + } + } + State::CommentEndBangState => { + let c = read_char!(self); + match c { + Element::Utf8('-') => { + add_to_token_value!(self, '-'); + add_to_token_value!(self, '-'); + add_to_token_value!(self, '!'); + + self.state = State::CommentEndDashState; + }, + Element::Utf8('>') => { + self.parse_error(ParserError::IncorrectlyClosedComment); + emit_current_token!(self); + self.state = State::DataState; + }, + Element::Eof => { + self.parse_error(ParserError::EofInComment); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + add_to_token_value!(self, '-'); + add_to_token_value!(self, '-'); + add_to_token_value!(self, '!'); + self.stream.unread(); + self.state = State::CommentState; + } + } + } + State::DocTypeState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => self.state = State::BeforeDocTypeNameState, + Element::Utf8('>') => { + self.stream.unread(); + self.state = State::BeforeDocTypeNameState; + }, + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + + emit_token!(self, Token::DocTypeToken{ + name: None, + force_quirks: true, + pub_identifier: None, + sys_identifier: None, + }); + + self.state = State::DataState; + } + _ => { + self.parse_error(ParserError::MissingWhitespaceBeforeDoctypeName); + self.stream.unread(); + self.state = State::BeforeDocTypeNameState; + } + } + } + State::BeforeDocTypeNameState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + // ignore + } + Element::Utf8(ch @ 'A'..='Z') => { + self.current_token = Some(Token::DocTypeToken{ + name: None, + force_quirks: false, + pub_identifier: None, + sys_identifier: None, + }); + + add_to_token_name!(self, 
to_lowercase!(ch)); + self.state = State::DocTypeNameState; + } + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + self.current_token = Some(Token::DocTypeToken{ + name: None, + force_quirks: false, + pub_identifier: None, + sys_identifier: None, + }); + + add_to_token_name!(self, CHAR_REPLACEMENT); + self.state = State::DocTypeNameState; + }, + Element::Utf8('>') => { + self.parse_error(ParserError::MissingDoctypeName); + emit_token!(self, Token::DocTypeToken{ + name: None, + force_quirks: true, + pub_identifier: None, + sys_identifier: None, + }); + + self.state = State::DataState; + }, + + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + + emit_token!(self, Token::DocTypeToken{ + name: None, + force_quirks: true, + pub_identifier: None, + sys_identifier: None, + }); + + self.state = State::DataState; + } + _ => { + self.current_token = Some(Token::DocTypeToken{ + name: None, + force_quirks: false, + pub_identifier: None, + sys_identifier: None, + }); + + add_to_token_name!(self, c.utf8()); + self.state = State::DocTypeNameState; + } + } + } + State::DocTypeNameState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => self.state = State::AfterDocTypeNameState, + Element::Utf8('>') => { + emit_current_token!(self); + self.state = State::DataState; + } + Element::Utf8(ch @ 'A'..='Z') => add_to_token_name!(self, to_lowercase!(ch)), + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + add_to_token_name!(self, CHAR_REPLACEMENT); + }, + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => add_to_token_name!(self, c.utf8()), + } + } + State::AfterDocTypeNameState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) 
|
+                        Element::Utf8(CHAR_FF) |
+                        Element::Utf8(CHAR_SPACE) => {
+                            // ignore
+                        }
+                        Element::Utf8('>') => {
+                            emit_current_token!(self);
+                            self.state = State::DataState;
+                        }
+                        Element::Eof => {
+                            self.parse_error(ParserError::EofInDoctype);
+                            self.set_quirks_mode(true);
+                            emit_current_token!(self);
+                            self.state = State::DataState;
+                        }
+                        _ => {
+                            // Step back so the look-ahead starts at the character just read.
+                            self.stream.unread();
+                            // Keywords are matched ASCII case-insensitively. Unicode
+                            // upper-casing would also accept look-alikes such as
+                            // "publıc" (dotless i) or "ſystem" (long s).
+                            if self.stream.look_ahead_slice(6).eq_ignore_ascii_case("PUBLIC") {
+                                self.stream.seek(SeekCur, 6);
+                                self.state = State::AfterDocTypePublicKeywordState;
+                                continue;
+                            }
+                            if self.stream.look_ahead_slice(6).eq_ignore_ascii_case("SYSTEM") {
+                                self.stream.seek(SeekCur, 6);
+                                self.state = State::AfterDocTypeSystemKeywordState;
+                                continue;
+                            }
+                            // Make sure the parser is on the correct position again since we just
+                            // unread the character
+                            self.stream.seek(SeekCur, 1);
+                            self.parse_error(ParserError::InvalidCharacterSequenceAfterDoctypeName);
+                            self.stream.seek(SeekCur, -1);
+                            self.set_quirks_mode(true);
+                            self.state = State::BogusDocTypeState;
+                        }
+                    }
+                }
+                // After the PUBLIC keyword: whitespace leads to the public identifier;
+                // a quote without preceding whitespace is accepted with a parse error.
+                State::AfterDocTypePublicKeywordState => {
+                    let c = read_char!(self);
+                    match c {
+                        Element::Utf8(CHAR_TAB) |
+                        Element::Utf8(CHAR_LF) |
+                        Element::Utf8(CHAR_FF) |
+                        Element::Utf8(CHAR_SPACE) => self.state = State::BeforeDocTypePublicIdentifierState,
+                        Element::Utf8('"') => {
+                            self.parse_error(ParserError::MissingWhitespaceAfterDoctypePublicKeyword);
+                            set_public_identifier!(self, String::new());
+                            self.state = State::DocTypePublicIdentifierDoubleQuotedState;
+                        }
+                        Element::Utf8('\'') => {
+                            self.parse_error(ParserError::MissingWhitespaceAfterDoctypePublicKeyword);
+                            set_public_identifier!(self, String::new());
+                            self.state = State::DocTypePublicIdentifierSingleQuotedState;
+                        }
+                        Element::Utf8('>') => {
+                            self.parse_error(ParserError::MissingDoctypePublicIdentifier);
+                            self.set_quirks_mode(true);
+                            emit_current_token!(self);
+                            self.state = State::DataState;
+                        }
+                        Element::Eof => {
+                            self.parse_error(ParserError::EofInDoctype);
+                            self.set_quirks_mode(true);
+ emit_current_token!(self); + self.state = State::DataState; + } + _ => { + self.parse_error(ParserError::MissingQuoteBeforeDoctypePublicIdentifier); + self.stream.unread(); + self.set_quirks_mode(true); + self.state = State::BogusDocTypeState; + } + } + } + State::BeforeDocTypePublicIdentifierState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + // ignore + }, + Element::Utf8('"') => { + set_public_identifier!(self, String::new()); + self.state = State::DocTypePublicIdentifierDoubleQuotedState; + } + Element::Utf8('\'') => { + set_public_identifier!(self, String::new()); + self.state = State::DocTypePublicIdentifierSingleQuotedState; + } + Element::Utf8('>') => { + self.parse_error(ParserError::MissingDoctypePublicIdentifier); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + self.stream.unread(); + self.parse_error(ParserError::MissingQuoteBeforeDoctypePublicIdentifier); + self.set_quirks_mode(true); + self.state = State::BogusDocTypeState; + } + } + } + State::DocTypePublicIdentifierDoubleQuotedState => { + let c = read_char!(self); + match c { + Element::Utf8('"') => self.state = State::AfterDoctypePublicIdentifierState, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + add_public_identifier!(self, CHAR_REPLACEMENT); + } + Element::Utf8('>') => { + self.parse_error(ParserError::AbruptDoctypePublicIdentifier); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => 
add_public_identifier!(self, c.utf8()), + } + } + State::DocTypePublicIdentifierSingleQuotedState => { + let c = read_char!(self); + match c { + Element::Utf8('\'') => self.state = State::AfterDoctypePublicIdentifierState, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + add_public_identifier!(self, CHAR_REPLACEMENT); + } + Element::Utf8('>') => { + self.parse_error(ParserError::AbruptDoctypePublicIdentifier); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => add_public_identifier!(self, c.utf8()), + } + } + State::AfterDoctypePublicIdentifierState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => self.state = State::BetweenDocTypePublicAndSystemIdentifiersState, + Element::Utf8('>') => { + emit_current_token!(self); + self.state = State::DataState; + } + Element::Utf8('"') => { + self.parse_error(ParserError::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); + set_system_identifier!(self, String::new()); + self.state = State::DocTypeSystemIdentifierDoubleQuotedState; + } + Element::Utf8('\'') => { + self.parse_error(ParserError::MissingWhitespaceBetweenDoctypePublicAndSystemIdentifiers); + set_system_identifier!(self, String::new()); + self.state = State::DocTypeSystemIdentifierSingleQuotedState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + self.parse_error(ParserError::MissingQuoteBeforeDoctypeSystemIdentifier); + self.stream.unread(); + self.set_quirks_mode(true); + self.state = State::BogusDocTypeState; + } + } + } + 
State::BetweenDocTypePublicAndSystemIdentifiersState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + // ignore + }, + Element::Utf8('>') => { + emit_current_token!(self); + self.state = State::DataState; + } + Element::Utf8('"') => { + set_system_identifier!(self, String::new()); + self.state = State::DocTypeSystemIdentifierDoubleQuotedState; + } + Element::Utf8('\'') => { + set_system_identifier!(self, String::new()); + self.state = State::DocTypeSystemIdentifierSingleQuotedState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + self.parse_error(ParserError::MissingQuoteBeforeDoctypeSystemIdentifier); + self.stream.unread(); + self.set_quirks_mode(true); + self.state = State::BogusDocTypeState; + } + } + } + State::AfterDocTypeSystemKeywordState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => self.state = State::BeforeDocTypeSystemIdentifierState, + Element::Utf8('"') => { + self.parse_error(ParserError::MissingWhitespaceAfterDoctypeSystemKeyword); + set_system_identifier!(self, String::new()); + self.state = State::DocTypeSystemIdentifierDoubleQuotedState; + } + Element::Utf8('\'') => { + self.parse_error(ParserError::MissingWhitespaceAfterDoctypeSystemKeyword); + set_system_identifier!(self, String::new()); + self.state = State::DocTypeSystemIdentifierSingleQuotedState; + } + Element::Utf8('>') => { + self.parse_error(ParserError::MissingDoctypeSystemIdentifier); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + 
} + _ => { + self.parse_error(ParserError::MissingQuoteBeforeDoctypeSystemIdentifier); + self.stream.unread(); + self.set_quirks_mode(true); + self.state = State::BogusDocTypeState; + } + } + } + State::BeforeDocTypeSystemIdentifierState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + // ignore + }, + Element::Utf8('"') => { + set_system_identifier!(self, String::new()); + self.state = State::DocTypeSystemIdentifierDoubleQuotedState; + } + Element::Utf8('\'') => { + set_system_identifier!(self, String::new()); + self.state = State::DocTypeSystemIdentifierSingleQuotedState; + } + Element::Utf8('>') => { + self.parse_error(ParserError::MissingDoctypeSystemIdentifier); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + self.parse_error(ParserError::MissingQuoteBeforeDoctypeSystemIdentifier); + self.stream.unread(); + self.set_quirks_mode(true); + self.state = State::BogusDocTypeState; + } + } + } + State::DocTypeSystemIdentifierDoubleQuotedState => { + let c = read_char!(self); + match c { + Element::Utf8('"') => self.state = State::AfterDocTypeSystemIdentifierState, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + add_system_identifier!(self, CHAR_REPLACEMENT); + } + Element::Utf8('>') => { + self.parse_error(ParserError::AbruptDoctypeSystemIdentifier); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => add_system_identifier!(self, c.utf8()), + } + + } + 
State::DocTypeSystemIdentifierSingleQuotedState => { + let c = read_char!(self); + match c { + Element::Utf8('\'') => self.state = State::AfterDocTypeSystemIdentifierState, + Element::Utf8(CHAR_NUL) => { + self.parse_error(ParserError::UnexpectedNullCharacter); + add_system_identifier!(self, CHAR_REPLACEMENT); + } + Element::Utf8('>') => { + self.parse_error(ParserError::AbruptDoctypeSystemIdentifier); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => add_system_identifier!(self, c.utf8()), + } + + } + State::AfterDocTypeSystemIdentifierState => { + let c = read_char!(self); + match c { + Element::Utf8(CHAR_TAB) | + Element::Utf8(CHAR_LF) | + Element::Utf8(CHAR_FF) | + Element::Utf8(CHAR_SPACE) => { + // ignore + }, + Element::Utf8('>') => { + emit_current_token!(self); + self.state = State::DataState; + } + Element::Eof => { + self.parse_error(ParserError::EofInDoctype); + self.set_quirks_mode(true); + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + self.parse_error(ParserError::UnexpectedCharacterAfterDoctypeSystemIdentifier); + self.stream.unread(); + self.state = State::BogusDocTypeState; + } + } + + } + State::BogusDocTypeState => { + let c = read_char!(self); + match c { + Element::Utf8('>') => { + emit_current_token!(self); + self.state = State::DataState; + } + Element::Utf8(CHAR_NUL) => self.parse_error(ParserError::UnexpectedNullCharacter), + Element::Eof => { + emit_current_token!(self); + self.state = State::DataState; + } + _ => { + // ignore + } + } + } + State::CDataSectionState => { + let c = read_char!(self); + match c { + Element::Utf8(']') => { + self.state = State::CDataSectionBracketState; + } + Element::Eof => { + self.parse_error(ParserError::EofInCdata); + emit_current_token!(self); + self.state = 
State::DataState; + }, + _ => self.consume(c.utf8()), + } + }, + State::CDataSectionBracketState => { + let c = read_char!(self); + match c { + Element::Utf8(']') => self.state = State::CDataSectionEndState, + _ => { + self.consume(']'); + self.stream.unread(); + self.state = State::CDataSectionState; + } + } + }, + State::CDataSectionEndState => { + let c = read_char!(self); + match c { + Element::Utf8(']') => self.consume(']'), + Element::Utf8('>') => self.state = State::DataState, + _ => { + self.consume(']'); + self.consume(']'); + self.stream.unread(); + self.state = State::CDataSectionState; + } + } + } + _ => { + panic!("state {:?} not implemented", self.state); + } + } + } } // Consumes the given char @@ -140,16 +2164,27 @@ impl<'a> Tokenizer<'a> { } // Consumes the given string - pub(crate) fn consume_string(&mut self, s: String) { + pub(crate) fn consume_string(&mut self, s: &str) { // Add c to the current token data for c in s.chars() { self.consumed.push(c) } } + // Return true when the given end_token matches the stored start token (ie: 'table' matches when last_start_token = 'table') + fn is_appropriate_end_token(&self, end_token: &Vec) -> bool { + let s: String = end_token.iter().collect(); + self.last_start_token == s + } + // Return the consumed string as a String pub fn get_consumed_str(&self) -> String { - self.consumed.iter().collect() + return self.consumed.iter().collect(); + } + + // Returns true if there is anything in the consume buffer + pub fn has_consumed_data(&self) -> bool { + return self.consumed.len() > 0; } // Clears the current consume buffer @@ -157,73 +2192,127 @@ impl<'a> Tokenizer<'a> { self.consumed.clear() } + // Return the list of current parse errors + pub fn get_errors(&self) -> &Vec { + &self.errors + } + // Creates a parser log error message - pub(crate) fn parse_error(&mut self, _str: &str) { + pub(crate) fn parse_error(&mut self, error: ParserError) { + + // The previous position is where the error occurred + let pos = 
self.stream.get_previous_position(); + + let mut already_exists= false; + for err in &self.errors { + if err.line == pos.line && err.col == pos.col && err.message == error.as_str().to_string() { + already_exists = true; + } + } + + // Don't add when this error already exists (for this exact position) + if already_exists { + // self.stream.seek(SeekCur, 1); + return + } + // Add to parse log - println!("parse_error: {}", _str) - } -} + self.errors.push(ParseError{ + message: error.as_str().to_string(), + line: pos.line, + col: pos.col, + offset: pos.offset, + }); -#[cfg(test)] -mod tests { - use super::*; - use crate::html5_parser::token::{Token, TokenTrait, TokenType}; + // self.stream.seek(SeekCur, 1); + } - #[test] - fn test_tokens() { - let t = Token::CommentToken { - value: String::from("this is a comment"), - }; - assert_eq!("comment[this is a comment]", t.to_string()); + // Set is_closing_tag in current token + fn set_is_closing_in_current_token(&mut self, is_closing: bool) { + match &mut self.current_token.as_mut().unwrap() { + Token::EndTagToken { .. } => { + self.parse_error(ParserError::EndTagWithTrailingSolidus); + } + Token::StartTagToken { is_self_closing, .. } => { + *is_self_closing = is_closing; + } + _ => {} + } + } - let t = Token::TextToken { - value: String::from("this is a string"), - }; - assert_eq!("str[this is a string]", t.to_string()); + // Set force_quirk mode in current token + fn set_quirks_mode(&mut self, quirky: bool) { + match &mut self.current_token.as_mut().unwrap() { + Token::DocTypeToken { force_quirks, .. 
} => { + *force_quirks = quirky; + } + _ => {} + } + } - let t = Token::StartTagToken { - name: String::from("tag"), - is_self_closing: true, - attributes: Vec::new(), - }; - assert_eq!("starttag[]", t.to_string()); - let t = Token::StartTagToken { - name: String::from("tag"), - is_self_closing: false, - attributes: Vec::new(), - }; - assert_eq!("starttag[]", t.to_string()); + // Adds a new attribute to the current token + fn set_add_attribute_to_current_token(&mut self, name: String, value: String) { + match &mut self.current_token.as_mut().unwrap() { + Token::StartTagToken { attributes, .. } => { + attributes.push( + (name.clone(), value.clone()) + ); + } + _ => {} + } - let t = Token::EndTagToken { - name: String::from("tag"), - }; - assert_eq!("endtag[]", t.to_string()); + self.current_attr_name.clear() + } - let t = Token::DocTypeToken { - name: String::from("html"), - force_quirks: true, - pub_identifier: Option::from(String::from("foo")), - sys_identifier: Option::from(String::from("bar")), - }; - assert_eq!("doctype[]", t.to_string()); + // Sets the given name into the current token + fn set_name_in_current_token(&mut self, new_name: String) { + match &mut self.current_token.as_mut().unwrap() { + Token::StartTagToken { name, .. } => { + *name = new_name.clone(); + }, + Token::EndTagToken { name, .. } => { + *name = new_name.clone(); + }, + _ => panic!("trying to set the name of a non start/end tag token") + } } - #[test] - fn test_tokenizer() { - let mut is = InputStream::new(); - is.read_from_str("This code is © 2023 €", None); + // This function checks to see if there is already an attribute name like the one in current_attr_name. 
+ fn attr_already_exists(&mut self) -> bool { + return self.current_attrs.iter().any(|(name, ..)| name == &self.current_attr_name); + } - let mut tkznr = Tokenizer::new(&mut is); + // Saves the current attribute name and value onto the current_attrs stack, if there is anything to store + fn store_and_clear_current_attribute(&mut self) { + if !self.current_attr_name.is_empty() && ! self.attr_already_exists() { + self.current_attrs.push((self.current_attr_name.clone(), self.current_attr_value.clone())); + } - let t = tkznr.next_token(); - assert_eq!(TokenType::TextToken, t.type_of()); + self.current_attr_name = String::new(); + self.current_attr_value = String::new(); + } - if let Token::TextToken { value } = t { - assert_eq!("This code is © 2023 €", value); + // This method will add current generated attributes to the current (start) token if needed. + fn add_stored_attributes_to_current_token(&mut self) { + if self.current_token.is_none() { + return; + } + if self.current_attrs.is_empty() { + return; } - let t = tkznr.next_token(); - assert_eq!(TokenType::EofToken, t.type_of()); + match self.current_token.as_mut().unwrap() { + Token::EndTagToken { .. } => { + self.parse_error(ParserError::EndTagWithAttributes); + }, + Token::StartTagToken { attributes, .. 
} => { + for attr in &self.current_attrs { + attributes.push(attr.clone()); + } + self.current_attrs = vec![]; + } + _ => {}, + } } -} +} \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs new file mode 100755 index 000000000..58d532349 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,2 @@ +#[allow(dead_code)] +pub mod html5_parser; diff --git a/src/test_test.rs b/src/test_test.rs deleted file mode 100644 index 63161c676..000000000 --- a/src/test_test.rs +++ /dev/null @@ -1,113 +0,0 @@ -pub struct InputStream { -} - -impl InputStream { - pub fn new() -> Self { - InputStream {} - } -} - -// ======================================================================================= - -pub struct Token; - -impl Token { - fn to_string(&self) -> String { - return String::from("token"); - } -} - -// ======================================================================================= - -pub struct Tokenizer<'a> { - pub stream: &'a mut InputStream, - pub emitter: &'a mut dyn Emitter, -} - -impl<'a> Tokenizer<'a> { - pub fn new(input: &'a mut InputStream, emitter: &'a mut dyn Emitter) -> Self { - return Tokenizer { - stream: input, - emitter, - } - } - - pub fn next_token(&mut self) - { - let t = Token; - self.emitter.emit(t) - } -} - -// ======================================================================================= - -pub struct HtmlParser<'a> { - pub tokenizer: &'a mut Tokenizer<'a>, -} - -impl<'a> HtmlParser<'a> { - pub fn new(tokenizer: &'a mut Tokenizer<'a>) -> Self { - HtmlParser{ - tokenizer - } - } - - pub fn get_tokenizer(&mut self) -> &mut Tokenizer<'a> { - return self.tokenizer; - } -} - -// ======================================================================================= - -pub trait Emitter { - fn emit(&mut self, t: Token); -} - -pub struct StrEmitter { - pub output: String -} - -impl StrEmitter { - pub fn new() -> Self { - StrEmitter { - output: String::new(), - } - } - - fn get_output(&self) -> &String { - return &self.output; - } -} - 
-impl Emitter for StrEmitter { - fn emit(&mut self, t: Token) { - self.output.push_str(&*t.to_string()); - } -} - -pub struct AppEmitter; - -impl AppEmitter { - pub fn new() -> Self { - AppEmitter - } -} - -impl Emitter for AppEmitter { - fn emit(&mut self, t: Token) { - println!("O [{}]", t.to_string()); - } -} - -// ======================================================================================= - -pub fn main() { - let mut is = InputStream::new(); - let mut e = AppEmitter::new(); - let mut t = Tokenizer::new(&mut is, &mut e); - - let mut p = HtmlParser::new(&mut t); - - p.get_tokenizer().next_token(); - // println!("Output: {}", e.get_output()) -}