From 06326f7c997f206c75c5915c73dec3d5db3640fb Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Wed, 24 Apr 2024 22:25:25 -0400 Subject: [PATCH] Output parse errors for the Rust part of the build step This fixes #290, by outputting the parse errors encountered by the Rust build step's parser. Previously they were being stored in the RcDom instance's errors vector, and ignored. Now they are threaded through to the final io::Result, and then output by main(). The hardest part of this was adding line numbers to the errors. Doing this necessitated creating a wrapper for RcDom, called RcDomWithLineNumbers, which implements TreeSink with two methods parse_error() and set_current_line() given custom behavior, while the other many methods just delegate to RcDom's implementation. Additionally, this enables exact_errors as a parser option, which provides slightly more information in a couple of cases related to character references. --- Cargo.lock | 12 +++ Cargo.toml | 1 + src/annotate_attributes.rs | 15 ++-- src/boilerplate.rs | 27 ++++--- src/interface_index.rs | 28 ++++--- src/main.rs | 10 +++ src/parser.rs | 102 +++++++++++++++++++++--- src/rcdom_with_line_numbers.rs | 141 +++++++++++++++++++++++++++++++++ src/represents.rs | 6 +- src/tag_omission.rs | 3 +- 10 files changed, 303 insertions(+), 42 deletions(-) create mode 100644 src/rcdom_with_line_numbers.rs diff --git a/Cargo.lock b/Cargo.lock index 3e3aca4..de56600 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,6 +41,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "delegate" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e018fccbeeb50ff26562ece792ed06659b9c2dae79ece77c4456bb10d9bf79b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + [[package]] name = "errno" version = "0.3.1" @@ -111,6 +122,7 @@ checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" name = "html-build" version = "0.0.0" dependencies = [ + "delegate", "html5ever", "markup5ever_rcdom", "regex", diff --git a/Cargo.toml b/Cargo.toml index ba47202..b298fe8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ tokio = { version = "1", features = ["full"] } html5ever = "0.26.0" markup5ever_rcdom = "0.2.0" regex = "1" +delegate = "0.12.0" [dev-dependencies] tempfile = "3" diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs index def2f9d..33a95bf 100644 --- a/src/annotate_attributes.rs +++ b/src/annotate_attributes.rs @@ -313,6 +313,7 @@ mod tests { // reordered in the HTML spec). let document = parse_document_async( r#" +

The a element

Categories @@ -338,7 +339,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Categories
Flow content @@ -369,6 +370,7 @@ mod tests { // i.e., the variant description is used where requested let document = parse_document_async( r#" +

The a element

Content attributes @@ -390,7 +392,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Content attributes
href @@ -415,6 +417,7 @@ mod tests { // Checks that the special rules for using : instead of an em dash work. let document = parse_document_async( r#" +

The a element

Content attributes @@ -431,7 +434,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Content attributes
Also, the name attribute has special semantics on this element: Anchor name @@ -450,6 +453,7 @@ mod tests { // Checks that the special rules for joining any special semantics with a ; work. let document = parse_document_async( r#" +

The a element

Content attributes @@ -467,7 +471,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Content attributes
Also, the name attribute has special semantics on this element: Anchor name; Name of the anchor @@ -488,6 +492,7 @@ mod tests { // repeating the description. let document = parse_document_async( r#" +

The img element

Content attributes @@ -509,7 +514,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The img element

+

The img element

Content attributes
width diff --git a/src/boilerplate.rs b/src/boilerplate.rs index faab19f..5a4a620 100644 --- a/src/boilerplate.rs +++ b/src/boilerplate.rs @@ -170,14 +170,16 @@ mod tests { "enEnglish", ) .await?; - let document = - parse_document_async("".as_bytes()).await?; + let document = parse_document_async( + "
".as_bytes(), + ) + .await?; let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "
enEnglish
"); + "
enEnglish
"); Ok(()) } @@ -189,15 +191,16 @@ mod tests { "data:text/html,Hello, world!", ) .await?; - let document = - parse_document_async("\">hello".as_bytes()) - .await?; + let document = parse_document_async( + "\">hello".as_bytes(), + ) + .await?; let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "hello"); + "hello"); Ok(()) } @@ -208,23 +211,23 @@ mod tests { tokio::fs::write(example_dir.path().join("ex2"), "second").await?; tokio::fs::write(example_dir.path().join("ignored"), "bad").await?; let document = - parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) + parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) .await?; let mut proc = Processor::new(Path::new("."), example_dir.path()); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "
first
second

EXAMPLE ignored

" ); + "
first
second

EXAMPLE ignored

" ); Ok(()) } #[tokio::test] async fn test_errors_unsafe_paths() -> io::Result<()> { let bad_path_examples = [ - "", - "
\">
", - "
EXAMPLE ../foo
", + "", + "
\">
", + "
EXAMPLE ../foo
", ]; for example in bad_path_examples { let document = parse_document_async(example.as_bytes()).await?; diff --git a/src/interface_index.rs b/src/interface_index.rs index 7039fd9..b9bb217 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -188,6 +188,7 @@ mod tests { async fn test_two_interfaces_in_one_block() -> io::Result<()> { let document = parse_document_async( r#" +

 interface HTMLMarqueeElement { ... }
 interface HTMLBlinkElement { ... }
@@ -204,7 +205,7 @@ INSERT INTERFACES HERE
         assert_eq!(
             serialize_for_test(&[document]),
             r#"
-

+

 interface HTMLMarqueeElement { ... }
 interface HTMLBlinkElement { ... }
 
@@ -217,6 +218,7 @@ interface HTMLBlinkElement { ... } async fn test_two_interfaces_in_separate_blocks() -> io::Result<()> { let document = parse_document_async( r#" +

 interface HTMLMarqueeElement { ... }
 
@@ -235,7 +237,7 @@ INSERT INTERFACES HERE assert_eq!( serialize_for_test(&[document]), r#" -

+

 interface HTMLMarqueeElement { ... }
 

@@ -250,6 +252,7 @@ interface HTMLBlinkElement { ... }
     async fn interface_with_partial() -> io::Result<()> {
         let document = parse_document_async(
             r#"
+
 

 interface HTMLMarqueeElement { ... }
 
@@ -268,7 +271,7 @@ INSERT INTERFACES HERE assert_eq!( serialize_for_test(&[document]), r##" -

+

 interface HTMLMarqueeElement { ... }
 

@@ -283,6 +286,7 @@ partial interface HTMLMarqueeElement io::Result<()> {
         let document = parse_document_async(
             r#"
+
 

 interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
@@ -300,7 +304,7 @@ INSERT INTERFACES HERE
         assert_eq!(
             serialize_for_test(&[document]),
             r##"
-

+

 interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
@@ -314,6 +318,7 @@ partial interface HTMLMarqueeElement io::Result<()> {
         let document = parse_document_async(
             r#"
+
 

 partial interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
@@ -330,7 +335,7 @@ INSERT INTERFACES HERE
         assert_eq!(
             serialize_for_test(&[document]),
             r##"
-

+

 partial interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
 
@@ -343,6 +348,7 @@ partial interface HTMLMarqueeElement io::Result<()> { let document = parse_document_async( r#" + INSERT INTERFACES HERE

 interface HTMLMarqueeElement { ... }
@@ -357,12 +363,12 @@ interface HTMLMarqueeElement { ... }
         proc.apply()?;
         assert_eq!(
             serialize_for_test(&[document]),
-            r##"
-
  • HTMLMarqueeElement
+ r#" +
  • HTMLMarqueeElement

 interface HTMLMarqueeElement { ... }
 
- "## + "# .trim() ); Ok(()) @@ -370,7 +376,7 @@ interface HTMLMarqueeElement { ... } #[tokio::test] async fn no_marker() -> io::Result<()> { - let document = parse_document_async("".as_bytes()).await?; + let document = parse_document_async("".as_bytes()).await?; let mut proc = Processor::new(); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); let result = proc.apply(); @@ -381,7 +387,8 @@ interface HTMLMarqueeElement { ... } #[tokio::test] async fn duplicate_marker() -> io::Result<()> { let document = parse_document_async( - "
INSERT INTERFACES HERE
INSERT INTERFACES HERE
".as_bytes(), + "
INSERT INTERFACES HERE
INSERT INTERFACES HERE
" + .as_bytes(), ) .await?; let mut proc = Processor::new(); @@ -395,6 +402,7 @@ interface HTMLMarqueeElement { ... } async fn duplicate_dfn() -> io::Result<()> { let document = parse_document_async( r#" +

 interface HTMLMarqueeElement { ... }
 interface HTMLMarqueeElement { ... }
diff --git a/src/main.rs b/src/main.rs
index 975d24e..14ee301 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -14,11 +14,21 @@ mod dom_utils;
 mod interface_index;
 mod io_utils;
 mod parser;
+mod rcdom_with_line_numbers;
 mod represents;
 mod tag_omission;
 
 #[tokio::main]
 async fn main() -> io::Result<()> {
+    // This gives slightly prettier error-printing.
+    if let Err(e) = run().await {
+        eprintln!("{}", e);
+        std::process::exit(1);
+    }
+    Ok(())
+}
+
+async fn run() -> io::Result<()> {
     // Since we're using Rc in the DOM implementation, we must ensure that tasks
     // which act on it are confined to this thread.
 
diff --git a/src/parser.rs b/src/parser.rs
index a10de56..9c9feeb 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,15 +2,19 @@
 
 use std::io;
 
-use html5ever::driver::{self, Parser};
+use html5ever::driver::{self, ParseOpts, Parser};
 use html5ever::tendril::{ByteTendril, TendrilSink};
-use markup5ever_rcdom::{Handle, RcDom};
+use html5ever::tokenizer::TokenizerOpts;
+use html5ever::tree_builder::TreeBuilderOpts;
+use markup5ever_rcdom::Handle;
 use tokio::io::{AsyncRead, AsyncReadExt};
 
+use crate::rcdom_with_line_numbers::RcDomWithLineNumbers;
+
 async fn parse_internal_async(
-    parser: Parser,
+    parser: Parser,
     mut r: R,
-) -> io::Result {
+) -> io::Result {
     let mut tendril_sink = parser.from_utf8();
 
     // This draws on the structure of the sync tendril read_from.
@@ -35,7 +39,7 @@ async fn parse_internal_async(
         }
     }
     let dom = tendril_sink.finish();
-    Ok(dom.document)
+    Ok(dom)
 }
 
 pub async fn parse_fragment_async(
@@ -43,12 +47,16 @@ pub async fn parse_fragment_async(
     context: &Handle,
 ) -> io::Result> {
     let parser = driver::parse_fragment_for_element(
-        RcDom::default(),
-        Default::default(),
+        RcDomWithLineNumbers::default(),
+        create_error_opts(),
         context.clone(),
         None,
     );
-    let document = parse_internal_async(parser, r).await?;
+
+    let dom = parse_internal_async(parser, r).await?;
+    dom.create_error_from_parse_errors()?;
+
+    let document = dom.document();
     let mut new_children = document.children.take()[0].children.take();
     for new_child in new_children.iter_mut() {
         new_child.parent.take();
@@ -57,8 +65,24 @@ pub async fn parse_fragment_async(
 }
 
 pub async fn parse_document_async(r: R) -> io::Result {
-    let parser = driver::parse_document(RcDom::default(), Default::default());
-    parse_internal_async(parser, r).await
+    let parser = driver::parse_document(RcDomWithLineNumbers::default(), create_error_opts());
+    let dom = parse_internal_async(parser, r).await?;
+    dom.create_error_from_parse_errors()?;
+
+    Ok(dom.document().clone())
+}
+
+fn create_error_opts() -> ParseOpts {
+    ParseOpts {
+        tokenizer: TokenizerOpts {
+            exact_errors: true,
+            ..Default::default()
+        },
+        tree_builder: TreeBuilderOpts {
+            exact_errors: true,
+            ..Default::default()
+        },
+    }
 }
 
 #[cfg(test)]
@@ -95,7 +119,7 @@ pub(crate) mod tests {
         // we're in. This is important because of the special rules
         // surrounding, e.g., tables. If you change this to use the body as context,
         // no element at all is emitted.
-        let document = parse_document_async("".as_bytes()).await?;
+        let document = parse_document_async("
".as_bytes()).await?; let body = document.children.borrow()[1].children.borrow()[1].clone(); assert!(body.is_html_element(&local_name!("body"))); let table = body.children.borrow()[0].clone(); @@ -104,4 +128,60 @@ pub(crate) mod tests { assert_eq!(serialize_for_test(&children), ""); Ok(()) } + + #[tokio::test] + async fn test_document_error_line_number() -> io::Result<()> { + let result = + parse_document_async("Hello\nworld".as_bytes()) + .await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("Line 2: ")); + + Ok(()) + } + + #[tokio::test] + async fn test_document_error_exact() -> io::Result<()> { + let result = + parse_document_async("&asdf;".as_bytes()) + .await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("&asdf;")); + + Ok(()) + } + + #[tokio::test] + async fn test_fragment_error_line_number() -> io::Result<()> { + let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let result = + parse_fragment_async("Hello \n\nworld".as_bytes(), &body).await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("Line 3: ")); + + Ok(()) + } + + #[tokio::test] + async fn test_fragment_error_exact() -> io::Result<()> { + let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let result = + parse_fragment_async("&asdf;".as_bytes(), &body).await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("&asdf;")); + + Ok(()) + } } diff --git a/src/rcdom_with_line_numbers.rs b/src/rcdom_with_line_numbers.rs new file mode 100644 index 0000000..3f5e6c2 --- /dev/null +++ b/src/rcdom_with_line_numbers.rs @@ -0,0 +1,141 @@ +// This provides a wrapper around RcDom which tracks line numbers in the errors. + +use delegate::delegate; +use html5ever::interface::TreeSink; +use html5ever::{ + tendril::StrTendril, + tree_builder::{ElementFlags, NextParserState, NodeOrText, QuirksMode}, + Attribute, ExpandedName, QualName, +}; +use markup5ever_rcdom::{Handle, RcDom}; +use std::borrow::Cow; +use std::io; + +pub struct RcDomWithLineNumbers { + dom: RcDom, + current_line: u64, +} + +impl RcDomWithLineNumbers { + // Expose out the document and errors from the inner RcDom + pub fn document(&self) -> &Handle { + &self.dom.document + } + + pub fn create_error_from_parse_errors(&self) -> io::Result<()> { + if !self.dom.errors.is_empty() { + let error_messages = self + .dom + .errors + .iter() + .map(|e| e.to_string()) + .collect::>() + .join("\n"); + Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Parse errors encountered:\n\n{}", error_messages), + )) + } else { + Ok(()) + } + } +} + +impl Default for RcDomWithLineNumbers { + fn default() -> Self { + Self { + dom: RcDom::default(), + current_line: 1, + } + } +} + +impl TreeSink for RcDomWithLineNumbers { + type Output = RcDomWithLineNumbers; + type Handle = ::Handle; + + // Override the parse_error method to add line numbers to the error messages. + fn parse_error(&mut self, msg: Cow<'static, str>) { + let msg_with_line = format!("Line {}: {}", self.current_line, msg); + self.dom.parse_error(Cow::Owned(msg_with_line)); + } + + // Override to track the current line number. + fn set_current_line(&mut self, line: u64) { + self.current_line = line; + } + + // Override to return RcDomWithLineNumbers instead of RcDom. + fn finish(self) -> Self::Output { + self + } + + // Delegate all other methods to RcDom. + delegate! { + to self.dom { + fn get_document(&mut self) -> Self::Handle; + + fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> ExpandedName<'a>; + + fn create_element( + &mut self, + name: QualName, + attrs: Vec, + flags: ElementFlags, + ) -> Self::Handle; + + fn create_comment(&mut self, text: StrTendril) -> Self::Handle; + + fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle; + + fn append(&mut self, parent: &Self::Handle, child: NodeOrText); + + fn append_based_on_parent_node( + &mut self, + element: &Self::Handle, + prev_element: &Self::Handle, + child: NodeOrText, + ); + + fn append_doctype_to_document( + &mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ); + + fn mark_script_already_started(&mut self, node: &Self::Handle); + + fn pop(&mut self, node: &Self::Handle); + + fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle; + + fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool; + + fn set_quirks_mode(&mut self, mode: QuirksMode); + + fn append_before_sibling( + &mut self, + sibling: &Self::Handle, + new_node: NodeOrText, + ); + + fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec); + + fn associate_with_form( + &mut self, + target: &Self::Handle, + form: &Self::Handle, + nodes: (&Self::Handle, Option<&Self::Handle>), + ); + + fn remove_from_parent(&mut self, target: &Self::Handle); + + fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle); + + fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool; + + fn complete_script(&mut self, node: &Self::Handle) -> NextParserState; + } + } +} diff --git a/src/represents.rs b/src/represents.rs index ebb0474..e357f41 100644 --- a/src/represents.rs +++ b/src/represents.rs @@ -128,13 +128,13 @@ mod tests { #[tokio::test] async fn test_represents() -> io::Result<()> { // Uses can occur either before or after. - let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; let mut proc = Processor::new(); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply()?; assert_eq!( serialize_for_test(&[document]), - "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" + "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" ); Ok(()) } @@ -142,7 +142,7 @@ mod tests { #[tokio::test] async fn test_represents_undefined() -> io::Result<()> { // Uses can occur either before or after. - let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; let mut proc = Processor::new(); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); let result = proc.apply(); diff --git a/src/tag_omission.rs b/src/tag_omission.rs index edbb80a..8676b19 100644 --- a/src/tag_omission.rs +++ b/src/tag_omission.rs @@ -206,6 +206,7 @@ mod tests { async fn test_simple() -> io::Result<()> { let document = parse_document_async( r#" +

Optional tags

A td element does very tdish things and may be very cellular.

An audio element is quite audible.

@@ -258,7 +259,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

Optional tags

+

Optional tags

A td element does very tdish things and may be very cellular.

An audio element is quite audible.

Another section