From 23b962f4cd919339b9e54e96c0aaba97cdf04d29 Mon Sep 17 00:00:00 2001 From: kiyoshika Date: Mon, 6 Nov 2023 20:32:46 -0500 Subject: [PATCH] add document querying --- src/html5/parser.rs | 1 + src/html5/parser/document.rs | 496 +++++++++++++++++++++++++++++++++++ src/html5/parser/query.rs | 130 +++++++++ src/types.rs | 3 + 4 files changed, 630 insertions(+) create mode 100644 src/html5/parser/query.rs diff --git a/src/html5/parser.rs b/src/html5/parser.rs index 3dc516f1f..9a9cd2e4c 100644 --- a/src/html5/parser.rs +++ b/src/html5/parser.rs @@ -1,5 +1,6 @@ mod attr_replacements; pub mod document; +pub mod query; mod quirks; pub mod tree_builder; diff --git a/src/html5/parser/document.rs b/src/html5/parser/document.rs index 41859a062..0c580334b 100755 --- a/src/html5/parser/document.rs +++ b/src/html5/parser/document.rs @@ -4,6 +4,8 @@ use crate::html5::node::data::doctype::DocTypeData; use crate::html5::node::data::{comment::CommentData, text::TextData}; use crate::html5::node::HTML_NAMESPACE; use crate::html5::node::{Node, NodeData, NodeId}; +use crate::html5::parser::query::SearchType; +use crate::html5::parser::query::{Condition, Query}; use crate::html5::parser::quirks::QuirksMode; use crate::html5::parser::tree_builder::TreeBuilder; use crate::html5::util::is_valid_id_attribute_value; @@ -296,6 +298,25 @@ impl Document { self.arena.get_node_mut(*node_id) } + /// Retrieves the next sibling NodeId (to the right) of the reference_node or None. 
+ pub fn get_next_sibling(&self, reference_node: NodeId) -> Option<NodeId> { + let node = self.get_node_by_id(reference_node)?; + let parent = self.get_node_by_id(node.parent?)?; + + let idx = parent + .children + .iter() + .position(|&child_id| child_id == reference_node) + .unwrap(); + + let next_idx = idx + 1; + if parent.children.len() > next_idx { + return Some(parent.children[next_idx]); + } + + None + } + pub fn add_new_node(&mut self, node: Node) -> NodeId { // if a node contains attributes when adding to the tree, // be sure to handle the special attributes "id" and "class" @@ -418,6 +439,21 @@ impl Document { pub fn has_cyclic_reference(&self, node_id: NodeId, parent_id: NodeId) -> bool { has_child_recursive(&self.arena, node_id, parent_id) } + + /// Check if a given node's children contain a certain tag name + pub fn contains_child_tag(&self, node_id: NodeId, tag: &str) -> bool { + if let Some(node) = self.get_node_by_id(node_id) { + for child_id in &node.children { + if let Some(child) = self.get_node_by_id(*child_id) { + if child.name == tag { + return true; + } + } + } + } + + false + } } /// Returns true when the parent node has the child node as a child, or if any of the children of @@ -692,6 +728,102 @@ impl DocumentHandle { Ok(()) } + + fn matches_query_condition( + &self, + doc_read: &Document, + current_node: &Node, + condition: &Condition, + ) -> bool { + match condition { + Condition::EqualsTag(tag) => current_node.name == *tag, + Condition::EqualsId(id) => { + let node_data = &current_node.data; + if let NodeData::Element(element) = node_data { + if let Some(id_attr) = element.attributes.get("id") { + return *id_attr == *id; + } + } + + false + } + Condition::ContainsClass(class) => { + let node_data = &current_node.data; + if let NodeData::Element(element) = node_data { + return element.classes.contains(class.as_str()); + } + + false + } + Condition::ContainsAttribute(attribute) => { + let node_data = &current_node.data; + if let NodeData::Element(element) = 
node_data { + return element.attributes.contains_key(attribute); + } + + false + } + Condition::ContainsChildTag(child_tag) => { + doc_read.contains_child_tag(current_node.id, child_tag) + } + Condition::HasParentTag(parent_tag) => { + if let Some(parent_id) = current_node.parent { + // making an assumption here that the parent node is actually valid + let parent = doc_read.get_node_by_id(parent_id).unwrap(); + return parent.name == *parent_tag; + } + + false + } + } + } + + /// Perform a single query against the document. + /// If query search type is uninitialized, returns an error. + /// Otherwise, returns a vector of NodeIds that match the predicate in tree order (preorder depth-first.) + pub fn query(&self, query: &Query) -> Result<Vec<NodeId>> { + if query.search_type == SearchType::Uninitialized { + return Err(Error::Query("Query predicate is uninitialized".to_owned())); + } + + let mut found_ids = Vec::new(); + + let mut node_stack: Vec<NodeId> = Vec::new(); + let root_id = self.get().get_root().id; + node_stack.push(root_id); + + let doc_read = self.get(); + + while let Some(current_node_id) = node_stack.pop() { + let current_node = doc_read.get_node_by_id(current_node_id).unwrap(); + + if let Some(sibling_id) = doc_read.get_next_sibling(current_node_id) { + node_stack.push(sibling_id); + } + + if !current_node.children.is_empty() { + node_stack.push(current_node.children[0]); + } + + let mut predicate_result: bool = true; + + for condition in &query.conditions { + if !self.matches_query_condition(&doc_read, current_node, condition) { + predicate_result = false; + break; + } + } + + if predicate_result { + found_ids.push(current_node_id); + if query.search_type == SearchType::FindFirst { + return Ok(found_ids); + } + } + } + + Ok(found_ids) + } } impl TreeBuilder for DocumentHandle { @@ -770,6 +902,7 @@ impl DocumentBuilder { mod tests { use crate::html5::node::{NodeTrait, NodeType, HTML_NAMESPACE}; use crate::html5::parser::document::{DocumentBuilder, DocumentTaskQueue}; + 
use crate::html5::parser::query::Query; use crate::html5::parser::tree_builder::TreeBuilder; use crate::html5::parser::{Node, NodeData, NodeId}; use std::collections::HashMap; @@ -1192,4 +1325,367 @@ mod tests { assert!(element.classes.contains("two")); assert!(element.classes.contains("three")); } + + #[test] + fn uninitialized_query() { + let doc = DocumentBuilder::new_document(); + + let query = Query::new(); + let found_ids = doc.query(&query); + if let Err(err) = found_ids { + assert_eq!( + err.to_string(), + "query error: Query predicate is uninitialized" + ); + } else { + panic!() + } + } + + #[test] + fn single_query_equals_tag_find_first() { + //
+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let p_id = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id, None, HTML_NAMESPACE); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + + let _ = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + + let query = Query::new().equals_tag("p").find_first(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 1); + assert_eq!(found_ids, [p_id]); + } + + #[test] + fn single_query_equals_tag_find_all() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let p_id = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + let p_id_2 = doc.create_element("p", div_id, None, HTML_NAMESPACE); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let p_id_3 = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + + let p_id_4 = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + + let query = Query::new().equals_tag("p").find_all(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 4); + assert_eq!(found_ids, [p_id, p_id_2, p_id_3, p_id_4]); + } + + #[test] + fn single_query_equals_id() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + let p_id_2 = doc.create_element("p", div_id, None, HTML_NAMESPACE); + let res = doc.insert_attribute("id", "myid", p_id_2); + assert!(res.is_ok()); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + + let _ = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + + let query = Query::new().equals_id("myid").find_first(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 1); + assert_eq!(found_ids, [p_id_2]); + } + + #[test] + fn single_query_contains_class_find_first() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let p_id = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + let mut res = doc.insert_attribute("class", "one two", p_id); + assert!(res.is_ok()); + let p_id_2 = doc.create_element("p", div_id, None, HTML_NAMESPACE); + res = doc.insert_attribute("class", "one", p_id_2); + assert!(res.is_ok()); + res = doc.insert_attribute("id", "myid", p_id_2); + assert!(res.is_ok()); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let p_id_3 = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + res = doc.insert_attribute("class", "two three", p_id_3); + assert!(res.is_ok()); + + let p_id_4 = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + res = doc.insert_attribute("class", "three", p_id_4); + assert!(res.is_ok()); + + let query = Query::new().contains_class("two").find_first(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 1); + assert_eq!(found_ids, [p_id]); + } + + #[test] + fn single_query_contains_class_find_all() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let p_id = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + let mut res = doc.insert_attribute("class", "one two", p_id); + assert!(res.is_ok()); + let p_id_2 = doc.create_element("p", div_id, None, HTML_NAMESPACE); + res = doc.insert_attribute("class", "one", p_id_2); + assert!(res.is_ok()); + res = doc.insert_attribute("id", "myid", p_id_2); + assert!(res.is_ok()); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let p_id_3 = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + res = doc.insert_attribute("class", "two three", p_id_3); + assert!(res.is_ok()); + + let p_id_4 = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + res = doc.insert_attribute("class", "three", p_id_4); + assert!(res.is_ok()); + + let query = Query::new().contains_class("two").find_all(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 2); + assert_eq!(found_ids, [p_id, p_id_3]); + } + + #[test] + fn single_query_contains_attribute_find_first() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let mut res = doc.insert_attribute("id", "myid", div_id_2); + assert!(res.is_ok()); + res = doc.insert_attribute("style", "somestyle", div_id_2); + assert!(res.is_ok()); + let p_id = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + res = doc.insert_attribute("title", "key", p_id); + assert!(res.is_ok()); + let _ = doc.create_element("p", div_id, None, HTML_NAMESPACE); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + res = doc.insert_attribute("style", "otherstyle", div_id_3); + assert!(res.is_ok()); + res = doc.insert_attribute("id", "otherid", div_id_3); + assert!(res.is_ok()); + let _ = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + + let p_id_4 = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + res = doc.insert_attribute("title", "yo", p_id_4); + assert!(res.is_ok()); + res = doc.insert_attribute("style", "cat", p_id_4); + assert!(res.is_ok()); + + let query = Query::new().contains_attribute("style").find_first(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 1); + assert_eq!(found_ids, [div_id_2]); + } + + #[test] + fn single_query_contains_attribute_find_all() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let mut res = doc.insert_attribute("id", "myid", div_id_2); + assert!(res.is_ok()); + res = doc.insert_attribute("style", "somestyle", div_id_2); + assert!(res.is_ok()); + let p_id = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + res = doc.insert_attribute("title", "key", p_id); + assert!(res.is_ok()); + let _ = doc.create_element("p", div_id, None, HTML_NAMESPACE); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + res = doc.insert_attribute("style", "otherstyle", div_id_3); + assert!(res.is_ok()); + res = doc.insert_attribute("id", "otherid", div_id_3); + assert!(res.is_ok()); + let _ = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + + let p_id_4 = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + res = doc.insert_attribute("title", "yo", p_id_4); + assert!(res.is_ok()); + res = doc.insert_attribute("style", "cat", p_id_4); + assert!(res.is_ok()); + + let query = Query::new().contains_attribute("style").find_all(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 3); + assert_eq!(found_ids, [div_id_2, div_id_3, p_id_4]); + } + + #[test] + fn single_query_contains_child_find_first() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id, None, HTML_NAMESPACE); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + + let _ = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + + let query = Query::new().contains_child_tag("p").find_first(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 1); + assert_eq!(found_ids, [NodeId::root()]); + } + + #[test] + fn single_query_contains_child_find_all() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id, None, HTML_NAMESPACE); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + + let _ = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + + let query = Query::new().contains_child_tag("p").find_all(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 4); + assert_eq!(found_ids, [NodeId::root(), div_id, div_id_2, div_id_3]); + } + + #[test] + fn single_query_has_parent_find_first() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id, None, HTML_NAMESPACE); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let _ = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + + let _ = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + + let query = Query::new().has_parent_tag("div").find_first(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 1); + assert_eq!(found_ids, [div_id_2]); + } + + #[test] + fn single_query_has_parent_find_all() { + //

+ //
+ //

+ //

+ //

+ //

+ //

+ let mut doc = DocumentBuilder::new_document(); + + let div_id = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let div_id_2 = doc.create_element("div", div_id, None, HTML_NAMESPACE); + let p_id = doc.create_element("p", div_id_2, None, HTML_NAMESPACE); + let p_id_2 = doc.create_element("p", div_id, None, HTML_NAMESPACE); + + let div_id_3 = doc.create_element("div", NodeId::root(), None, HTML_NAMESPACE); + let p_id_3 = doc.create_element("p", div_id_3, None, HTML_NAMESPACE); + + let _ = doc.create_element("p", NodeId::root(), None, HTML_NAMESPACE); + + let query = Query::new().has_parent_tag("div").find_all(); + let found_ids = doc.query(&query).unwrap(); + assert_eq!(found_ids.len(), 4); + assert_eq!(found_ids, [div_id_2, p_id, p_id_2, p_id_3]); + } } diff --git a/src/html5/parser/query.rs b/src/html5/parser/query.rs new file mode 100644 index 000000000..3de9a3c28 --- /dev/null +++ b/src/html5/parser/query.rs @@ -0,0 +1,130 @@ +#[derive(Debug, PartialEq)] +pub enum Condition { + EqualsTag(String), + EqualsId(String), + ContainsClass(String), + ContainsAttribute(String), + ContainsChildTag(String), + HasParentTag(String), +} + +#[derive(Debug, PartialEq)] +pub enum SearchType { + Uninitialized, + FindFirst, + FindAll, +} + +pub struct Query { + pub(crate) conditions: Vec<Condition>, + pub(crate) search_type: SearchType, +} + +impl Query { + pub(crate) fn new() -> Self { + Self { + conditions: Vec::new(), + search_type: SearchType::Uninitialized, + } + } + + pub(crate) fn equals_tag(mut self, tag_name: &str) -> Self { + self.conditions + .push(Condition::EqualsTag(tag_name.to_owned())); + self + } + + pub(crate) fn equals_id(mut self, id: &str) -> Self { + self.conditions.push(Condition::EqualsId(id.to_owned())); + self + } + + pub(crate) fn contains_class(mut self, class: &str) -> Self { + self.conditions + .push(Condition::ContainsClass(class.to_owned())); + self + } + + pub(crate) fn contains_attribute(mut self, attribute: &str) -> Self { + 
self.conditions + .push(Condition::ContainsAttribute(attribute.to_owned())); + self + } + + pub(crate) fn contains_child_tag(mut self, child_tag: &str) -> Self { + self.conditions + .push(Condition::ContainsChildTag(child_tag.to_owned())); + self + } + + pub(crate) fn has_parent_tag(mut self, parent_tag: &str) -> Self { + self.conditions + .push(Condition::HasParentTag(parent_tag.to_owned())); + self + } + + pub(crate) fn find_first(mut self) -> Self { + self.search_type = SearchType::FindFirst; + self + } + + pub(crate) fn find_all(mut self) -> Self { + self.search_type = SearchType::FindAll; + self + } +} + +#[cfg(test)] +mod tests { + use crate::html5::parser::query::{Condition, Query, SearchType}; + + #[test] + fn uninitialized() { + let query = Query::new().equals_tag("div").equals_id("myid"); + assert_eq!(query.search_type, SearchType::Uninitialized); + } + + #[test] + fn find_first() { + let query = Query::new().find_first(); + assert_eq!(query.search_type, SearchType::FindFirst); + } + + #[test] + fn find_all() { + let query = Query::new().find_all(); + assert_eq!(query.search_type, SearchType::FindAll); + } + + #[test] + fn build_conditions() { + let query = Query::new() + .equals_tag("div") + .equals_id("myid") + .contains_class("myclass") + .contains_attribute("myattr") + .contains_child_tag("h1") + .has_parent_tag("html") + .find_first(); + + assert_eq!(query.conditions.len(), 6); + assert_eq!(query.conditions[0], Condition::EqualsTag("div".to_owned())); + assert_eq!(query.conditions[1], Condition::EqualsId("myid".to_owned())); + assert_eq!( + query.conditions[2], + Condition::ContainsClass("myclass".to_owned()) + ); + assert_eq!( + query.conditions[3], + Condition::ContainsAttribute("myattr".to_owned()) + ); + assert_eq!( + query.conditions[4], + Condition::ContainsChildTag("h1".to_owned()) + ); + assert_eq!( + query.conditions[5], + Condition::HasParentTag("html".to_owned()) + ); + } +} diff --git a/src/types.rs b/src/types.rs index 
e889a19bc..639f310dc 100644 --- a/src/types.rs +++ b/src/types.rs @@ -37,6 +37,9 @@ pub enum Error { #[error("document task error: {0}")] DocumentTask(String), + + #[error("query error: {0}")] + Query(String), } /// Result that can be returned which holds either T or an Error