From 2b6ba9e03d8cfcd40f1c26cfe6ad571f3b35a837 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Fri, 9 Feb 2024 14:23:18 +0900 Subject: [PATCH 1/6] Hacky WIP on more parse errors --- src/parser.rs | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index a10de56..adcb6ee 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -2,15 +2,17 @@ use std::io; -use html5ever::driver::{self, Parser}; +use html5ever::driver::{self, ParseOpts, Parser}; use html5ever::tendril::{ByteTendril, TendrilSink}; +use html5ever::tokenizer::TokenizerOpts; +use html5ever::tree_builder::TreeBuilderOpts; use markup5ever_rcdom::{Handle, RcDom}; use tokio::io::{AsyncRead, AsyncReadExt}; async fn parse_internal_async( parser: Parser, mut r: R, -) -> io::Result { +) -> io::Result { let mut tendril_sink = parser.from_utf8(); // This draws on the structure of the sync tendril read_from. @@ -35,7 +37,7 @@ async fn parse_internal_async( } } let dom = tendril_sink.finish(); - Ok(dom.document) + Ok(dom) } pub async fn parse_fragment_async( @@ -44,11 +46,12 @@ pub async fn parse_fragment_async( ) -> io::Result> { let parser = driver::parse_fragment_for_element( RcDom::default(), - Default::default(), + create_error_opts(), context.clone(), None, ); - let document = parse_internal_async(parser, r).await?; + // TODO handle errors here too I guess + let document = parse_internal_async(parser, r).await?.document; let mut new_children = document.children.take()[0].children.take(); for new_child in new_children.iter_mut() { new_child.parent.take(); @@ -57,8 +60,34 @@ pub async fn parse_fragment_async( } pub async fn parse_document_async(r: R) -> io::Result { - let parser = driver::parse_document(RcDom::default(), Default::default()); - parse_internal_async(parser, r).await + let parser = driver::parse_document(RcDom::default(), create_error_opts()); + let dom = parse_internal_async(parser, r).await?; + + if !dom.errors.is_empty() { + for error in dom.errors { + eprintln!("{}", error); + } + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Parse errors encountered" + ), + )) + } + Ok(dom.document) +} + +fn create_error_opts() -> ParseOpts { + ParseOpts { + tokenizer: TokenizerOpts { + exact_errors: true, + ..Default::default() + }, + tree_builder: TreeBuilderOpts { + exact_errors: true, + ..Default::default() + }, + } } #[cfg(test)] From d712891dbeaf58cd450c39b7b8ad61b4ebe5173b Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 15 Feb 2024 14:50:01 +0900 Subject: [PATCH 2/6] This seems to basically work --- src/annotate_attributes.rs | 15 ++- src/boilerplate.rs | 27 ++--- src/interface_index.rs | 24 +++-- src/main.rs | 1 + src/parser.rs | 55 ++++++---- src/rcdom_with_line_numbers.rs | 179 +++++++++++++++++++++++++++++++++ src/represents.rs | 6 +- src/tag_omission.rs | 3 +- 8 files changed, 261 insertions(+), 49 deletions(-) create mode 100644 src/rcdom_with_line_numbers.rs diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs index def2f9d..33a95bf 100644 --- a/src/annotate_attributes.rs +++ b/src/annotate_attributes.rs @@ -313,6 +313,7 @@ mod tests { // reordered in the HTML spec). let document = parse_document_async( r#" +

The a element

Categories @@ -338,7 +339,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Categories
Flow content @@ -369,6 +370,7 @@ mod tests { // i.e., the variant description is used where requested let document = parse_document_async( r#" +

The a element

Content attributes @@ -390,7 +392,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Content attributes
href @@ -415,6 +417,7 @@ mod tests { // Checks that the special rules for using : instead of an em dash work. let document = parse_document_async( r#" +

The a element

Content attributes @@ -431,7 +434,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Content attributes
Also, the name attribute has special semantics on this element: Anchor name @@ -450,6 +453,7 @@ mod tests { // Checks that the special rules for joining any special semantics with a ; work. let document = parse_document_async( r#" +

The a element

Content attributes @@ -467,7 +471,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Content attributes
Also, the name attribute has special semantics on this element: Anchor name; Name of the anchor @@ -488,6 +492,7 @@ mod tests { // repeating the description. let document = parse_document_async( r#" +

The img element

Content attributes @@ -509,7 +514,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The img element

+

The img element

Content attributes
width diff --git a/src/boilerplate.rs b/src/boilerplate.rs index faab19f..5a4a620 100644 --- a/src/boilerplate.rs +++ b/src/boilerplate.rs @@ -170,14 +170,16 @@ mod tests { "enEnglish", ) .await?; - let document = - parse_document_async("".as_bytes()).await?; + let document = parse_document_async( + "
".as_bytes(), + ) + .await?; let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "
enEnglish
"); + "
enEnglish
"); Ok(()) } @@ -189,15 +191,16 @@ mod tests { "data:text/html,Hello, world!", ) .await?; - let document = - parse_document_async("\">hello".as_bytes()) - .await?; + let document = parse_document_async( + "\">hello".as_bytes(), + ) + .await?; let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "hello"); + "hello"); Ok(()) } @@ -208,23 +211,23 @@ mod tests { tokio::fs::write(example_dir.path().join("ex2"), "second").await?; tokio::fs::write(example_dir.path().join("ignored"), "bad").await?; let document = - parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) + parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) .await?; let mut proc = Processor::new(Path::new("."), example_dir.path()); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "
first
second

EXAMPLE ignored

" ); + "
first
second

EXAMPLE ignored

" ); Ok(()) } #[tokio::test] async fn test_errors_unsafe_paths() -> io::Result<()> { let bad_path_examples = [ - "", - "
\">
", - "
EXAMPLE ../foo
", + "", + "
\">
", + "
EXAMPLE ../foo
", ]; for example in bad_path_examples { let document = parse_document_async(example.as_bytes()).await?; diff --git a/src/interface_index.rs b/src/interface_index.rs index 7039fd9..8e8e6e7 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -188,6 +188,7 @@ mod tests { async fn test_two_interfaces_in_one_block() -> io::Result<()> { let document = parse_document_async( r#" +

 interface HTMLMarqueeElement { ... }
 interface HTMLBlinkElement { ... }
@@ -204,7 +205,7 @@ INSERT INTERFACES HERE
         assert_eq!(
             serialize_for_test(&[document]),
             r#"
-

+

 interface HTMLMarqueeElement { ... }
 interface HTMLBlinkElement { ... }
 
@@ -217,6 +218,7 @@ interface HTMLBlinkElement { ... } async fn test_two_interfaces_in_separate_blocks() -> io::Result<()> { let document = parse_document_async( r#" +

 interface HTMLMarqueeElement { ... }
 
@@ -235,7 +237,7 @@ INSERT INTERFACES HERE assert_eq!( serialize_for_test(&[document]), r#" -

+

 interface HTMLMarqueeElement { ... }
 

@@ -250,6 +252,7 @@ interface HTMLBlinkElement { ... }
     async fn interface_with_partial() -> io::Result<()> {
         let document = parse_document_async(
             r#"
+
 

 interface HTMLMarqueeElement { ... }
 
@@ -268,7 +271,7 @@ INSERT INTERFACES HERE assert_eq!( serialize_for_test(&[document]), r##" -

+

 interface HTMLMarqueeElement { ... }
 

@@ -283,6 +286,7 @@ partial interface HTMLMarqueeElement io::Result<()> {
         let document = parse_document_async(
             r#"
+
 

 interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
@@ -300,7 +304,7 @@ INSERT INTERFACES HERE
         assert_eq!(
             serialize_for_test(&[document]),
             r##"
-

+

 interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
@@ -314,6 +318,7 @@ partial interface HTMLMarqueeElement io::Result<()> {
         let document = parse_document_async(
             r#"
+
 

 partial interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
@@ -330,7 +335,7 @@ INSERT INTERFACES HERE
         assert_eq!(
             serialize_for_test(&[document]),
             r##"
-

+

 partial interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
 
@@ -343,6 +348,7 @@ partial interface HTMLMarqueeElement io::Result<()> { let document = parse_document_async( r#" + INSERT INTERFACES HERE

 interface HTMLMarqueeElement { ... }
@@ -358,7 +364,7 @@ interface HTMLMarqueeElement { ... }
         assert_eq!(
             serialize_for_test(&[document]),
             r##"
-
  • HTMLMarqueeElement
+
  • HTMLMarqueeElement

 interface HTMLMarqueeElement { ... }
 
@@ -370,7 +376,7 @@ interface HTMLMarqueeElement { ... } #[tokio::test] async fn no_marker() -> io::Result<()> { - let document = parse_document_async("".as_bytes()).await?; + let document = parse_document_async("".as_bytes()).await?; let mut proc = Processor::new(); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); let result = proc.apply(); @@ -381,7 +387,8 @@ interface HTMLMarqueeElement { ... } #[tokio::test] async fn duplicate_marker() -> io::Result<()> { let document = parse_document_async( - "
INSERT INTERFACES HERE
INSERT INTERFACES HERE
".as_bytes(), + "
INSERT INTERFACES HERE
INSERT INTERFACES HERE
" + .as_bytes(), ) .await?; let mut proc = Processor::new(); @@ -395,6 +402,7 @@ interface HTMLMarqueeElement { ... } async fn duplicate_dfn() -> io::Result<()> { let document = parse_document_async( r#" +

 interface HTMLMarqueeElement { ... }
 interface HTMLMarqueeElement { ... }
diff --git a/src/main.rs b/src/main.rs
index 975d24e..3b256c0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -14,6 +14,7 @@ mod dom_utils;
 mod interface_index;
 mod io_utils;
 mod parser;
+mod rcdom_with_line_numbers;
 mod represents;
 mod tag_omission;
 
diff --git a/src/parser.rs b/src/parser.rs
index adcb6ee..0faf27d 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -6,13 +6,15 @@ use html5ever::driver::{self, ParseOpts, Parser};
 use html5ever::tendril::{ByteTendril, TendrilSink};
 use html5ever::tokenizer::TokenizerOpts;
 use html5ever::tree_builder::TreeBuilderOpts;
-use markup5ever_rcdom::{Handle, RcDom};
+use markup5ever_rcdom::Handle;
 use tokio::io::{AsyncRead, AsyncReadExt};
 
+use crate::rcdom_with_line_numbers::RcDomWithLineNumbers;
+
 async fn parse_internal_async(
-    parser: Parser,
+    parser: Parser,
     mut r: R,
-) -> io::Result {
+) -> io::Result {
     let mut tendril_sink = parser.from_utf8();
 
     // This draws on the structure of the sync tendril read_from.
@@ -45,13 +47,16 @@ pub async fn parse_fragment_async(
     context: &Handle,
 ) -> io::Result> {
     let parser = driver::parse_fragment_for_element(
-        RcDom::default(),
+        RcDomWithLineNumbers::default(),
         create_error_opts(),
         context.clone(),
         None,
     );
-    // TODO handle errors here too I guess
-    let document = parse_internal_async(parser, r).await?.document;
+
+    let dom = parse_internal_async(parser, r).await?;
+    dom.create_error_from_parse_errors()?;
+
+    let document = dom.document();
     let mut new_children = document.children.take()[0].children.take();
     for new_child in new_children.iter_mut() {
         new_child.parent.take();
@@ -60,21 +65,11 @@ pub async fn parse_fragment_async(
 }
 
 pub async fn parse_document_async(r: R) -> io::Result {
-    let parser = driver::parse_document(RcDom::default(), create_error_opts());
+    let parser = driver::parse_document(RcDomWithLineNumbers::default(), create_error_opts());
     let dom = parse_internal_async(parser, r).await?;
+    dom.create_error_from_parse_errors()?;
 
-    if !dom.errors.is_empty() {
-        for error in dom.errors {
-            eprintln!("{}", error);
-        }
-        return Err(io::Error::new(
-                        io::ErrorKind::InvalidData,
-                        format!(
-                            "Parse errors encountered"
-                        ),
-                    ))
-    }
-    Ok(dom.document)
+    Ok(dom.document().clone())
 }
 
 fn create_error_opts() -> ParseOpts {
@@ -124,7 +119,7 @@ pub(crate) mod tests {
         // we're in. This is important because of the special rules
         // surrounding, e.g., tables. If you change this to use the body as context,
         // no element at all is emitted.
-        let document = parse_document_async("".as_bytes()).await?;
+        let document = parse_document_async("
".as_bytes()).await?; let body = document.children.borrow()[1].children.borrow()[1].clone(); assert!(body.is_html_element(&local_name!("body"))); let table = body.children.borrow()[0].clone(); @@ -133,4 +128,24 @@ pub(crate) mod tests { assert_eq!(serialize_for_test(&children), ""); Ok(()) } + + #[tokio::test] + async fn test_document_parse_errors() -> io::Result<()> { + let result = + parse_document_async("Hello world".as_bytes()) + .await; + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } + + #[tokio::test] + async fn test_fragment_parse_errors() -> io::Result<()> { + let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let result = + parse_fragment_async("Hello world".as_bytes(), &body).await; + assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + Ok(()) + } } diff --git a/src/rcdom_with_line_numbers.rs b/src/rcdom_with_line_numbers.rs new file mode 100644 index 0000000..0464c7f --- /dev/null +++ b/src/rcdom_with_line_numbers.rs @@ -0,0 +1,179 @@ +// This provides a wrapper around RcDom which tracks line numbers in the errors. + +use html5ever::interface::TreeSink; +use html5ever::{ + tendril::StrTendril, + tree_builder::{ElementFlags, NextParserState, NodeOrText, QuirksMode}, + Attribute, ExpandedName, QualName, +}; +use markup5ever_rcdom::{Handle, RcDom}; +use std::borrow::Cow; +use std::io; + +pub struct RcDomWithLineNumbers { + dom: RcDom, + current_line: u64, +} + +impl RcDomWithLineNumbers { + // Expose out the document and errors from the inner RcDom + pub fn document(&self) -> &Handle { + &self.dom.document + } + + pub fn create_error_from_parse_errors(&self) -> io::Result<()> { + if !self.dom.errors.is_empty() { + let error_messages = self + .dom + .errors + .iter() + .map(|e| e.to_string()) + .collect::>() + .join("\n"); + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Parse errors encountered:\n\n{}", error_messages), + )); + } else { + Ok(()) + } + } +} + +impl Default for RcDomWithLineNumbers { + fn default() -> Self { + Self { + dom: RcDom::default(), + current_line: 1, + } + } +} + +impl TreeSink for RcDomWithLineNumbers { + type Output = RcDomWithLineNumbers; + type Handle = ::Handle; + + // Override the parse_error method to add line numbers to the error messages. + fn parse_error(&mut self, msg: Cow<'static, str>) { + let msg_with_line = format!("Line {}: {}", self.current_line, msg); + self.dom.parse_error(Cow::Owned(msg_with_line)); + } + + // Override to track the current line number. + fn set_current_line(&mut self, line: u64) { + self.current_line = line; + } + + // Override to return RcDomWithLineNumbers instead of RcDom. + fn finish(self) -> Self::Output { + self + } + + // Forward all other methods to RcDom. + + fn get_document(&mut self) -> Self::Handle { + self.dom.get_document() + } + + fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> ExpandedName<'a> { + self.dom.elem_name(target) + } + + fn create_element( + &mut self, + name: QualName, + attrs: Vec, + flags: ElementFlags, + ) -> Self::Handle { + self.dom.create_element(name, attrs, flags) + } + + fn create_comment(&mut self, text: StrTendril) -> Self::Handle { + self.dom.create_comment(text) + } + + fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle { + self.dom.create_pi(target, data) + } + + fn append(&mut self, parent: &Self::Handle, child: NodeOrText) { + self.dom.append(parent, child) + } + + fn append_based_on_parent_node( + &mut self, + element: &Self::Handle, + prev_element: &Self::Handle, + child: NodeOrText, + ) { + self.dom + .append_based_on_parent_node(element, prev_element, child) + } + + fn append_doctype_to_document( + &mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ) { + self.dom + .append_doctype_to_document(name, public_id, system_id) + } + + fn mark_script_already_started(&mut self, node: &Self::Handle) { + self.dom.mark_script_already_started(node) + } + + fn pop(&mut self, node: &Self::Handle) { + self.dom.pop(node) + } + + fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle { + self.dom.get_template_contents(target) + } + + fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool { + self.dom.same_node(x, y) + } + + fn set_quirks_mode(&mut self, mode: QuirksMode) { + self.dom.set_quirks_mode(mode) + } + + fn append_before_sibling( + &mut self, + sibling: &Self::Handle, + new_node: NodeOrText, + ) { + self.dom.append_before_sibling(sibling, new_node) + } + + fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec) { + self.dom.add_attrs_if_missing(target, attrs) + } + + fn associate_with_form( + &mut self, + target: &Self::Handle, + form: &Self::Handle, + nodes: (&Self::Handle, Option<&Self::Handle>), + ) { + self.dom.associate_with_form(target, form, nodes) + } + + fn remove_from_parent(&mut self, target: &Self::Handle) { + self.dom.remove_from_parent(target) + } + + fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle) { + self.dom.reparent_children(node, new_parent) + } + + fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool { + self.dom.is_mathml_annotation_xml_integration_point(handle) + } + + fn complete_script(&mut self, node: &Self::Handle) -> NextParserState { + self.dom.complete_script(node) + } +} diff --git a/src/represents.rs b/src/represents.rs index ebb0474..e357f41 100644 --- a/src/represents.rs +++ b/src/represents.rs @@ -128,13 +128,13 @@ mod tests { #[tokio::test] async fn test_represents() -> io::Result<()> { // Uses can occur either before or after. - let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; let mut proc = Processor::new(); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply()?; assert_eq!( serialize_for_test(&[document]), - "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" + "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" ); Ok(()) } @@ -142,7 +142,7 @@ mod tests { #[tokio::test] async fn test_represents_undefined() -> io::Result<()> { // Uses can occur either before or after. - let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; let mut proc = Processor::new(); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); let result = proc.apply(); diff --git a/src/tag_omission.rs b/src/tag_omission.rs index edbb80a..8676b19 100644 --- a/src/tag_omission.rs +++ b/src/tag_omission.rs @@ -206,6 +206,7 @@ mod tests { async fn test_simple() -> io::Result<()> { let document = parse_document_async( r#" +

Optional tags

A td element does very tdish things and may be very cellular.

An audio element is quite audible.

@@ -258,7 +259,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

Optional tags

+

Optional tags

A td element does very tdish things and may be very cellular.

An audio element is quite audible.

Another section

From 46ae120ec6fc258a29c427b0b7ed981193c8f3a5 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 15 Feb 2024 14:54:38 +0900 Subject: [PATCH 3/6] Slightly prettier error printing --- src/main.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/main.rs b/src/main.rs index 3b256c0..14ee301 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,6 +20,15 @@ mod tag_omission; #[tokio::main] async fn main() -> io::Result<()> { + // This gives slightly prettier error-printing. + if let Err(e) = run().await { + eprintln!("{}", e); + std::process::exit(1); + } + Ok(()) +} + +async fn run() -> io::Result<()> { // Since we're using Rc in the DOM implementation, we must ensure that tasks // which act on it are confined to this thread. From 6b334fb4c05f1e601a86ed6c5bb1171fee76b10a Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 15 Feb 2024 14:55:15 +0900 Subject: [PATCH 4/6] Clippy and format --- src/interface_index.rs | 4 ++-- src/rcdom_with_line_numbers.rs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/interface_index.rs b/src/interface_index.rs index 8e8e6e7..b9bb217 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -363,12 +363,12 @@ interface HTMLMarqueeElement { ... } proc.apply()?; assert_eq!( serialize_for_test(&[document]), - r##" + r#"
  • HTMLMarqueeElement

 interface HTMLMarqueeElement { ... }
 
- "## + "# .trim() ); Ok(()) diff --git a/src/rcdom_with_line_numbers.rs b/src/rcdom_with_line_numbers.rs index 0464c7f..a5a864a 100644 --- a/src/rcdom_with_line_numbers.rs +++ b/src/rcdom_with_line_numbers.rs @@ -30,10 +30,10 @@ impl RcDomWithLineNumbers { .map(|e| e.to_string()) .collect::>() .join("\n"); - return Err(io::Error::new( + Err(io::Error::new( io::ErrorKind::InvalidData, format!("Parse errors encountered:\n\n{}", error_messages), - )); + )) } else { Ok(()) } From edd0fd311e8241daaec5c26815b79cf28d437d27 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 15 Feb 2024 15:12:19 +0900 Subject: [PATCH 5/6] Use delegate crate to slightly reduce boilerplate But since it's a macro, `cargo fmt` can't get inside of it? --- Cargo.lock | 12 +++ Cargo.toml | 1 + src/rcdom_with_line_numbers.rs | 136 ++++++++++++--------------------- 3 files changed, 62 insertions(+), 87 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3e3aca4..de56600 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,6 +41,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "delegate" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e018fccbeeb50ff26562ece792ed06659b9c2dae79ece77c4456bb10d9bf79b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + [[package]] name = "errno" version = "0.3.1" @@ -111,6 +122,7 @@ checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" name = "html-build" version = "0.0.0" dependencies = [ + "delegate", "html5ever", "markup5ever_rcdom", "regex", diff --git a/Cargo.toml b/Cargo.toml index ba47202..b298fe8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ tokio = { version = "1", features = ["full"] } html5ever = "0.26.0" markup5ever_rcdom = "0.2.0" regex = "1" +delegate = "0.12.0" [dev-dependencies] tempfile = "3" diff --git a/src/rcdom_with_line_numbers.rs b/src/rcdom_with_line_numbers.rs index a5a864a..3f5e6c2 100644 --- a/src/rcdom_with_line_numbers.rs +++ b/src/rcdom_with_line_numbers.rs @@ -1,5 +1,6 @@ // This provides a wrapper around RcDom which tracks line numbers in the errors. +use delegate::delegate; use html5ever::interface::TreeSink; use html5ever::{ tendril::StrTendril, @@ -69,111 +70,72 @@ impl TreeSink for RcDomWithLineNumbers { self } - // Forward all other methods to RcDom. + // Delegate all other methods to RcDom. + delegate! { + to self.dom { + fn get_document(&mut self) -> Self::Handle; - fn get_document(&mut self) -> Self::Handle { - self.dom.get_document() - } + fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> ExpandedName<'a>; - fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> ExpandedName<'a> { - self.dom.elem_name(target) - } + fn create_element( + &mut self, + name: QualName, + attrs: Vec, + flags: ElementFlags, + ) -> Self::Handle; - fn create_element( - &mut self, - name: QualName, - attrs: Vec, - flags: ElementFlags, - ) -> Self::Handle { - self.dom.create_element(name, attrs, flags) - } + fn create_comment(&mut self, text: StrTendril) -> Self::Handle; - fn create_comment(&mut self, text: StrTendril) -> Self::Handle { - self.dom.create_comment(text) - } + fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle; - fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle { - self.dom.create_pi(target, data) - } + fn append(&mut self, parent: &Self::Handle, child: NodeOrText); - fn append(&mut self, parent: &Self::Handle, child: NodeOrText) { - self.dom.append(parent, child) - } + fn append_based_on_parent_node( + &mut self, + element: &Self::Handle, + prev_element: &Self::Handle, + child: NodeOrText, + ); - fn append_based_on_parent_node( - &mut self, - element: &Self::Handle, - prev_element: &Self::Handle, - child: NodeOrText, - ) { - self.dom - .append_based_on_parent_node(element, prev_element, child) - } + fn append_doctype_to_document( + &mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ); - fn append_doctype_to_document( - &mut self, - name: StrTendril, - public_id: StrTendril, - system_id: StrTendril, - ) { - self.dom - .append_doctype_to_document(name, public_id, system_id) - } + fn mark_script_already_started(&mut self, node: &Self::Handle); - fn mark_script_already_started(&mut self, node: &Self::Handle) { - self.dom.mark_script_already_started(node) - } + fn pop(&mut self, node: &Self::Handle); - fn pop(&mut self, node: &Self::Handle) { - self.dom.pop(node) - } + fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle; - fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle { - self.dom.get_template_contents(target) - } + fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool; - fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool { - self.dom.same_node(x, y) - } + fn set_quirks_mode(&mut self, mode: QuirksMode); - fn set_quirks_mode(&mut self, mode: QuirksMode) { - self.dom.set_quirks_mode(mode) - } + fn append_before_sibling( + &mut self, + sibling: &Self::Handle, + new_node: NodeOrText, + ); - fn append_before_sibling( - &mut self, - sibling: &Self::Handle, - new_node: NodeOrText, - ) { - self.dom.append_before_sibling(sibling, new_node) - } - - fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec) { - self.dom.add_attrs_if_missing(target, attrs) - } + fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec); - fn associate_with_form( - &mut self, - target: &Self::Handle, - form: &Self::Handle, - nodes: (&Self::Handle, Option<&Self::Handle>), - ) { - self.dom.associate_with_form(target, form, nodes) - } + fn associate_with_form( + &mut self, + target: &Self::Handle, + form: &Self::Handle, + nodes: (&Self::Handle, Option<&Self::Handle>), + ); - fn remove_from_parent(&mut self, target: &Self::Handle) { - self.dom.remove_from_parent(target) - } + fn remove_from_parent(&mut self, target: &Self::Handle); - fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle) { - self.dom.reparent_children(node, new_parent) - } + fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle); - fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool { - self.dom.is_mathml_annotation_xml_integration_point(handle) - } + fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool; - fn complete_script(&mut self, node: &Self::Handle) -> NextParserState { - self.dom.complete_script(node) + fn complete_script(&mut self, node: &Self::Handle) -> NextParserState; + } } } From d8a1f808b1b3276d11242ead6d36272bf1eaf461 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Thu, 11 Apr 2024 11:31:56 +0900 Subject: [PATCH 6/6] Expand tests --- src/parser.rs | 48 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 0faf27d..9c9feeb 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -130,22 +130,58 @@ pub(crate) mod tests { } #[tokio::test] - async fn test_document_parse_errors() -> io::Result<()> { + async fn test_document_error_line_number() -> io::Result<()> { let result = - parse_document_async("Hello world".as_bytes()) + parse_document_async("Hello\nworld".as_bytes()) .await; - assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("Line 2: ")); + + Ok(()) + } + + #[tokio::test] + async fn test_document_error_exact() -> io::Result<()> { + let result = + parse_document_async("&asdf;".as_bytes()) + .await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("&asdf;")); + + Ok(()) + } + + #[tokio::test] + async fn test_fragment_error_line_number() -> io::Result<()> { + let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let result = + parse_fragment_async("Hello \n\nworld".as_bytes(), &body).await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("Line 3: ")); + Ok(()) } #[tokio::test] - async fn test_fragment_parse_errors() -> io::Result<()> { + async fn test_fragment_error_exact() -> io::Result<()> { let document = parse_document_async("".as_bytes()).await?; let body = document.children.borrow()[1].children.borrow()[1].clone(); assert!(body.is_html_element(&local_name!("body"))); let result = - parse_fragment_async("Hello world".as_bytes(), &body).await; - assert!(matches!(result, Err(e) if e.kind() == io::ErrorKind::InvalidData)); + parse_fragment_async("&asdf;".as_bytes(), &body).await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("&asdf;")); + Ok(()) } }