From d7e8e31003b245f1a7b52c76b263dfdf196f7961 Mon Sep 17 00:00:00 2001
From: Joshua Thijssen <jthijssen@noxlogic.nl>
Date: Mon, 2 Oct 2023 18:32:23 +0200
Subject: [PATCH] Cleanup

---
 src/bin/parser_test.rs                     | 83 +++++++++++++--------
 src/html5_parser/parser/adoption_agency.rs | 87 ++++++++++++++--------
 src/html5_parser/parser/mod.rs             | 65 +++++++++-------
 3 files changed, 146 insertions(+), 89 deletions(-)
diff --git a/src/bin/parser_test.rs b/src/bin/parser_test.rs
index 74a484849..e7f87c1a7 100755
--- a/src/bin/parser_test.rs
+++ b/src/bin/parser_test.rs
@@ -9,24 +9,35 @@ use std::path::PathBuf;
 use std::{env, fs, io};
 
 pub struct TestResults {
-    tests: usize,           // Number of tests (as defined in the suite)
-    assertions: usize, // Number of assertions (different combinations of input/output per test)
-    succeeded: usize,  // How many succeeded assertions
-    failed: usize,     // How many failed assertions
-    failed_position: usize, // How many failed assertions where position is not correct
+    /// Number of tests (as defined in the suite)
+    tests: usize,
+    /// Number of assertions (different combinations of input/output per test)
+    assertions: usize,
+    /// How many succeeded assertions
+    succeeded: usize,
+    /// How many failed assertions
+    failed: usize,
+    /// How many failed assertions where position is not correct
+    failed_position: usize,
 }
 
 struct Test {
-    file_path: String,              // Filename of the test
-    line: usize,                    // Line number of the test
-    data: String,                   // input stream
-    errors: Vec<Error>,             // errors
-    document: Vec<String>,          // document tree
-    document_fragment: Vec<String>, // fragment
+    /// Filename of the test
+    file_path: String,
+    /// Line number of the test
+    line: usize,
+    /// input stream
+    data: String,
+    /// errors
+    errors: Vec<Error>,
+    /// document tree
+    document: Vec<String>,
+    /// fragment
+    document_fragment: Vec<String>,
 }
 
 fn main() -> io::Result<()> {
-    let default_dir = "./html5lib-tests";
+    let default_dir = "./tests/data/html5lib-tests";
     let dir = env::args().nth(1).unwrap_or(default_dir.to_string());
 
     let mut results = TestResults {
@@ -41,10 +52,12 @@ fn main() -> io::Result<()> {
         let entry = entry?;
         let path = entry.path();
 
+        // Only run the tests1.dat file for now
         if !path.ends_with("tests1.dat") {
             continue;
         }
 
+        // Skip dirs and non-dat files
         if !path.is_file() || path.extension().unwrap() != "dat" {
             continue;
         }
@@ -54,10 +67,7 @@ fn main() -> io::Result<()> {
 
         let mut test_idx = 1;
         for test in tests {
-            if test_idx == 23 {
-                run_tree_test(test_idx, &test, &mut results);
-            }
-
+            run_tree_test(test_idx, &test, &mut results);
             test_idx += 1;
         }
     }
@@ -66,6 +76,7 @@ fn main() -> io::Result<()> {
     Ok(())
 }
 
+/// Read given tests file and extract all test data
 fn read_tests(file_path: PathBuf) -> io::Result<Vec<Test>> {
     let file = File::open(file_path.clone())?;
     let reader = BufReader::new(file);
@@ -149,6 +160,7 @@ fn run_tree_test(test_idx: usize, test: &Test, results: &mut TestResults) {
 
     let old_failed = results.failed;
 
+    // Do the actual parsing
     let mut is = InputStream::new();
     is.read_from_str(test.data.as_str(), None);
 
@@ -169,18 +181,23 @@ fn run_tree_test(test_idx: usize, test: &Test, results: &mut TestResults) {
             test.errors.len(),
             parse_errors.len()
         );
-    //     for want_err in &test.errors {
-    //         println!("     * Want: '{}' at {}:{}", want_err.code, want_err.line, want_err.col);
-    //     }
-    //     for got_err in &parse_errors {
-    //         println!("     * Got: '{}' at {}:{}", got_err.message, got_err.line, got_err.col);
-    //     }
-    //     results.assertions += 1;
-    //     results.failed += 1;
+
+        for want_err in &test.errors {
+            println!("     * Want: '{}' at {}:{}", want_err.code, want_err.line, want_err.col);
+        }
+        for got_err in &parse_errors {
+            println!("     * Got: '{}' at {}:{}", got_err.message, got_err.line, got_err.col);
+        }
+        results.assertions += 1;
+        results.failed += 1;
     } else {
         println!("✅ Found {} errors", parse_errors.len());
     }
-    //
+
+    // For now, we skip the tests that checks for errors as most of the errors do not match
+    // with the actual tests, as these errors as specific from html5lib. Either we reuse them
+    // or have some kind of mapping to our own errors if we decide to use our custom errors.
+
     // // Check each error messages
     // let mut idx = 0;
     // for error in &test.errors {
@@ -217,6 +234,7 @@ fn run_tree_test(test_idx: usize, test: &Test, results: &mut TestResults) {
     //     idx += 1;
     // }
 
+    // Display additional data if there a failure is found
     if old_failed != results.failed {
         println!("----------------------------------------");
         println!("📄 Input stream: ");
@@ -230,7 +248,8 @@ fn run_tree_test(test_idx: usize, test: &Test, results: &mut TestResults) {
             println!("{}", line);
         }
 
-        std::process::exit(1);
+        // // End at the first failure
+        // std::process::exit(1);
     }
 
     println!("----------------------------------------");
@@ -238,9 +257,12 @@ fn run_tree_test(test_idx: usize, test: &Test, results: &mut TestResults) {
 
 #[derive(PartialEq)]
 enum ErrorResult {
-    Success,         // Found the correct error
-    Failure,         // Didn't find the error (not even with incorrect position)
-    PositionFailure, // Found the error, but on an incorrect position
+    /// Found the correct error
+    Success,
+    /// Didn't find the error (not even with incorrect position)
+    Failure,
+    /// Found the error, but on an incorrect position
+    PositionFailure,
 }
 
 #[derive(PartialEq)]
@@ -251,6 +273,9 @@ pub struct Error {
 }
 
 fn match_document_tree(document: &Document, expected: &Vec<String>) -> bool {
+    // We need a better tree match system. Right now we match the tree based on the (debug) output
+    // of the tree. Instead, we should generate a document-tree from the expected output and compare
+    // it against the current generated tree.
     match_node(0, -1, -1, document, expected).is_some()
 }
 
diff --git a/src/html5_parser/parser/adoption_agency.rs b/src/html5_parser/parser/adoption_agency.rs
index d5be33949..6e31bd67f 100755
--- a/src/html5_parser/parser/adoption_agency.rs
+++ b/src/html5_parser/parser/adoption_agency.rs
@@ -29,7 +29,7 @@ impl<'a> Html5Parser<'a> {
                 .any(|elem| elem == &ActiveElement::NodeId(current_node_id))
         {
             self.open_elements.pop();
-            return AdoptionResult::Completed
+            return AdoptionResult::Completed;
         }
 
         // Step 3
@@ -39,7 +39,7 @@ impl<'a> Html5Parser<'a> {
         loop {
             // Step 4.1
             if outer_loop_counter >= ADOPTION_AGENCY_OUTER_LOOP_DEPTH {
-                return AdoptionResult::Completed
+                return AdoptionResult::Completed;
             }
 
             // Step 4.2
@@ -48,12 +48,19 @@ impl<'a> Html5Parser<'a> {
             // Step 4.3
             let formatting_element_idx = self.find_formatting_element(subject);
             if formatting_element_idx.is_none() {
-                return AdoptionResult::ProcessAsAnyOther
+                return AdoptionResult::ProcessAsAnyOther;
             }
 
-            let formatting_element_idx = formatting_element_idx.expect("formatting element not found");
-            let formatting_element_id = self.active_formatting_elements[formatting_element_idx].node_id().expect("formatting element not found");
-            let formatting_element_node= self.document.get_node_by_id(formatting_element_id).expect("formatting element not found").clone();
+            let formatting_element_idx =
+                formatting_element_idx.expect("formatting element not found");
+            let formatting_element_id = self.active_formatting_elements[formatting_element_idx]
+                .node_id()
+                .expect("formatting element not found");
+            let formatting_element_node = self
+                .document
+                .get_node_by_id(formatting_element_id)
+                .expect("formatting element not found")
+                .clone();
 
             // Step 4.4
             if !open_elements_has_id!(self, formatting_element_id) {
@@ -61,12 +68,11 @@ impl<'a> Html5Parser<'a> {
                 self.active_formatting_elements
                     .remove(formatting_element_idx);
 
-                return AdoptionResult::Completed
+                return AdoptionResult::Completed;
             }
 
             // Step 4.5
-            if !self.is_in_scope(&formatting_element_node.name, Scope::Regular)
-            {
+            if !self.is_in_scope(&formatting_element_node.name, Scope::Regular) {
                 self.parse_error("formatting element not in scope");
                 return AdoptionResult::Completed;
             }
@@ -93,20 +99,34 @@ impl<'a> Html5Parser<'a> {
                 }
 
                 // Remove the formatting element from the list of active formatting elements
-                if let Some(pos) = self.active_formatting_elements.iter().position(|elem| elem == &ActiveElement::NodeId(formatting_element_id)) {
+                if let Some(pos) = self
+                    .active_formatting_elements
+                    .iter()
+                    .position(|elem| elem == &ActiveElement::NodeId(formatting_element_id))
+                {
                     self.active_formatting_elements.remove(pos);
                 }
 
-                return AdoptionResult::Completed
+                return AdoptionResult::Completed;
             }
 
             let furthest_block_idx = furthest_block_idx.expect("furthest block not found");
 
-            let node_id = *self.open_elements.get(furthest_block_idx).expect("node not found");
-            let furthest_block = self.document.get_node_by_id(node_id).expect("node not found").clone();
+            let node_id = *self
+                .open_elements
+                .get(furthest_block_idx)
+                .expect("node not found");
+            let furthest_block = self
+                .document
+                .get_node_by_id(node_id)
+                .expect("node not found")
+                .clone();
 
             // Step 4.9
-            let common_ancestor_id = *self.open_elements.get(formatting_element_idx + 1).expect("node not found");
+            let common_ancestor_id = *self
+                .open_elements
+                .get(formatting_element_idx + 1)
+                .expect("node not found");
 
             // Step 4.10
             let mut bookmark = formatting_element_idx;
@@ -184,14 +204,12 @@ impl<'a> Html5Parser<'a> {
 
             // Step 4.15
             let new_element = match formatting_element_node.data {
-                NodeData::Element { ref attributes, .. } => {
-                     Node::new_element(
-                        formatting_element_node.name.as_str(),
-                        attributes.clone(),
-                        HTML_NAMESPACE,
-                    )
-                }
-                _ => panic!("formatting element is not an element")
+                NodeData::Element { ref attributes, .. } => Node::new_element(
+                    formatting_element_node.name.as_str(),
+                    attributes.clone(),
+                    HTML_NAMESPACE,
+                ),
+                _ => panic!("formatting element is not an element"),
             };
 
             // Step 4.16
@@ -211,7 +229,8 @@ impl<'a> Html5Parser<'a> {
             // Step 4.19
             // Remove formatting element from the stack of open elements, and insert the new element into the stack of open elements immediately below the position of furthest block in that stack.
             self.open_elements.remove(formatting_element_idx);
-            self.open_elements.insert(furthest_block_idx - 1, new_element_id);
+            self.open_elements
+                .insert(furthest_block_idx - 1, new_element_id);
         }
     }
 
@@ -251,7 +270,10 @@ impl<'a> Html5Parser<'a> {
         // Iterate
         for idx in (index_of_formatting_element..self.open_elements.len()).rev() {
             let element_id = self.open_elements[idx];
-            let element = self.document.get_node_by_id(element_id).expect("element not found");
+            let element = self
+                .document
+                .get_node_by_id(element_id)
+                .expect("element not found");
 
             if element.is_special() {
                 return Some(idx);
@@ -261,7 +283,6 @@ impl<'a> Html5Parser<'a> {
         None
     }
 
-
     // Find the formatting element with the given subject between the end of the list and the first marker (or start when there is no marker)
     fn find_formatting_element(&self, subject: &str) -> Option<usize> {
         if self.active_formatting_elements.is_empty() {
@@ -273,15 +294,15 @@ impl<'a> Html5Parser<'a> {
                 ActiveElement::Marker => {
                     // Marker found, do not continue
                     break;
-                },
+                }
                 ActiveElement::NodeId(node_id) => {
                     // Check if the given node is an element with the given subject
-                    let node = self.document.get_node_by_id(node_id).expect("node not found").clone();
-                    if let NodeData::Element {
-                        ref name,
-                        ..
-                    } = node.data
-                    {
+                    let node = self
+                        .document
+                        .get_node_by_id(node_id)
+                        .expect("node not found")
+                        .clone();
+                    if let NodeData::Element { ref name, .. } = node.data {
                         if name == subject {
                             return Some(idx);
                         }
@@ -292,4 +313,4 @@ impl<'a> Html5Parser<'a> {
 
         None
     }
-}
\ No newline at end of file
+}
diff --git a/src/html5_parser/parser/mod.rs b/src/html5_parser/parser/mod.rs
index a896bcd64..5af002265 100644
--- a/src/html5_parser/parser/mod.rs
+++ b/src/html5_parser/parser/mod.rs
@@ -7,6 +7,7 @@ mod quirks;
 use crate::html5_parser::error_logger::{ErrorLogger, ParseError, ParserError};
 use crate::html5_parser::input_stream::InputStream;
 use crate::html5_parser::node::{Node, NodeData, HTML_NAMESPACE, MATHML_NAMESPACE, SVG_NAMESPACE};
+use crate::html5_parser::parser::adoption_agency::AdoptionResult;
 use crate::html5_parser::parser::attr_replacements::{
     MATHML_ADJUSTMENTS, SVG_ADJUSTMENTS, XML_ADJUSTMENTS,
 };
@@ -17,9 +18,8 @@ use crate::html5_parser::tokenizer::token::Token;
 use crate::html5_parser::tokenizer::{Tokenizer, CHAR_NUL};
 use std::cell::RefCell;
 use std::collections::HashMap;
-use std::rc::Rc;
 use std::io::prelude::*;
-use crate::html5_parser::parser::adoption_agency::AdoptionResult;
+use std::rc::Rc;
 
 // Insertion modes as defined in 13.2.4.1
 #[derive(Debug, Copy, Clone, PartialEq)]
@@ -190,8 +190,12 @@ macro_rules! open_elements_has {
 
 macro_rules! open_elements_has_id {
     ($self:expr, $id:expr) => {
-        $self.open_elements.iter().rev().any(|node_id| *node_id == $id)
-    }
+        $self
+            .open_elements
+            .iter()
+            .rev()
+            .any(|node_id| *node_id == $id)
+    };
 }
 
 // Returns the current node: the last node in the open elements list
@@ -1612,8 +1616,7 @@ impl<'a> Html5Parser<'a> {
                 }
             }
 
-
-            self.display_debug_info();
+            // self.display_debug_info();
         }
 
         (
@@ -1840,7 +1843,10 @@ impl<'a> Html5Parser<'a> {
     // Checks if the given element is in given scope
     fn is_in_scope(&self, tag: &str, scope: Scope) -> bool {
         for &node_id in self.open_elements.iter().rev() {
-            let node = self.document.get_node_by_id(node_id).expect("node not found");
+            let node = self
+                .document
+                .get_node_by_id(node_id)
+                .expect("node not found");
 
             if node.name == tag {
                 return true;
@@ -2316,7 +2322,7 @@ impl<'a> Html5Parser<'a> {
                 if let Some(node_id) = self.active_formatting_elements_has_until_marker("a") {
                     self.parse_error("a tag in active formatting elements");
                     match self.run_adoption_agency(&self.current_token.clone()) {
-                        AdoptionResult::Completed => {},
+                        AdoptionResult::Completed => {}
                         AdoptionResult::ProcessAsAnyOther => {
                             any_other_end_tag = true;
                         }
@@ -2361,13 +2367,13 @@ impl<'a> Html5Parser<'a> {
                 if self.is_in_scope("nobr", Scope::Regular) {
                     self.parse_error("nobr tag in scope");
                     match self.run_adoption_agency(&self.current_token.clone()) {
-                        AdoptionResult::Completed => {},
+                        AdoptionResult::Completed => {}
                         AdoptionResult::ProcessAsAnyOther => {
                             any_other_end_tag = true;
                         }
                     }
 
-                    if ! any_other_end_tag {
+                    if !any_other_end_tag {
                         // @todo: do we run this even when we run the adoption agency with out processAsAnyOther?
                         self.reconstruct_formatting();
                     }
@@ -2393,7 +2399,7 @@ impl<'a> Html5Parser<'a> {
                     || name == "u" =>
             {
                 match self.run_adoption_agency(&self.current_token.clone()) {
-                    AdoptionResult::Completed => {},
+                    AdoptionResult::Completed => {}
                     AdoptionResult::ProcessAsAnyOther => {
                         any_other_end_tag = true;
                     }
@@ -2704,7 +2710,11 @@ impl<'a> Html5Parser<'a> {
 
             for idx in (0..self.open_elements.len()).rev() {
                 let node_id = self.open_elements[idx];
-                let node = self.document.get_node_by_id(node_id).expect("node not found").clone();
+                let node = self
+                    .document
+                    .get_node_by_id(node_id)
+                    .expect("node not found")
+                    .clone();
 
                 if node.name == token_name {
                     self.generate_all_implied_end_tags(Some(node.name.as_str()), false);
@@ -3352,14 +3362,13 @@ impl<'a> Html5Parser<'a> {
         adjusted_insertion_location
     }
 
-    #[cfg(debug_assertions)]
     fn display_debug_info(&self) {
         println!("-----------------------------------------\n");
         println!("current token   : {}", self.current_token);
         println!("insertion mode  : {:?}", self.insertion_mode);
         print!("Open elements   : [ ");
         for node_id in &self.open_elements {
-            let node = self.document.get_node_by_id(*node_id).unwrap();
+               let node = self.document.get_node_by_id(*node_id).unwrap();
             print!("({}) {}, ", node_id, node.name);
         }
         println!("]");
@@ -3381,24 +3390,24 @@ impl<'a> Html5Parser<'a> {
         println!("Output:");
         println!("{}", self.document);
 
-        std::io::stdout().flush().ok().expect("Could not flush stdout");
+        std::io::stdout()
+            .flush()
+            .ok()
+            .expect("Could not flush stdout");
     }
 }
 
-
 #[cfg(test)]
 mod test {
-    use crate::html5_parser::input_stream::Encoding;
     use super::*;
+    use crate::html5_parser::input_stream::Encoding;
 
     macro_rules! node_create {
-        ($self:expr, $name:expr) => {
-            {
-                let node = Node::new_element($name, HashMap::new(), HTML_NAMESPACE);
-                let node_id = $self.document.add_node(node, 0);
-                $self.open_elements.push(node_id);
-            }
-        };
+        ($self:expr, $name:expr) => {{
+            let node = Node::new_element($name, HashMap::new(), HTML_NAMESPACE);
+            let node_id = $self.document.add_node(node, 0);
+            $self.open_elements.push(node_id);
+        }};
     }
 
     #[test]
@@ -3480,7 +3489,6 @@ mod test {
         assert_eq!(parser.is_in_scope("xmp", Scope::Button), false);
         assert_eq!(parser.is_in_scope("xmp", Scope::Table), false);
         assert_eq!(parser.is_in_scope("xmp", Scope::Select), false);
-
     }
 
     #[test]
@@ -3622,11 +3630,14 @@ mod test {
     #[test]
     fn test_reconstruct_formatting() {
         let mut stream = InputStream::new();
-        stream.read_from_str("<p><b>bold<i>bold and italic</b>italic</i></p>", Some(Encoding::UTF8));
+        stream.read_from_str(
+            "<p><b>bold<i>bold and italic</b>italic</i></p>",
+            Some(Encoding::UTF8),
+        );
 
         let mut parser = Html5Parser::new(&mut stream);
         parser.parse();
 
         println!("{}", parser.document);
     }
-}
\ No newline at end of file
+}