diff --git a/Cargo.lock b/Cargo.lock index 3e3aca4..de56600 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,6 +41,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "delegate" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e018fccbeeb50ff26562ece792ed06659b9c2dae79ece77c4456bb10d9bf79b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + [[package]] name = "errno" version = "0.3.1" @@ -111,6 +122,7 @@ checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" name = "html-build" version = "0.0.0" dependencies = [ + "delegate", "html5ever", "markup5ever_rcdom", "regex", diff --git a/Cargo.toml b/Cargo.toml index ba47202..b298fe8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ tokio = { version = "1", features = ["full"] } html5ever = "0.26.0" markup5ever_rcdom = "0.2.0" regex = "1" +delegate = "0.12.0" [dev-dependencies] tempfile = "3" diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs index def2f9d..33a95bf 100644 --- a/src/annotate_attributes.rs +++ b/src/annotate_attributes.rs @@ -313,6 +313,7 @@ mod tests { // reordered in the HTML spec). let document = parse_document_async( r#" +

The a element

Categories @@ -338,7 +339,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Categories
Flow content @@ -369,6 +370,7 @@ mod tests { // i.e., the variant description is used where requested let document = parse_document_async( r#" +

The a element

Content attributes @@ -390,7 +392,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Content attributes
href @@ -415,6 +417,7 @@ mod tests { // Checks that the special rules for using : instead of an em dash work. let document = parse_document_async( r#" +

The a element

Content attributes @@ -431,7 +434,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Content attributes
Also, the name attribute has special semantics on this element: Anchor name @@ -450,6 +453,7 @@ mod tests { // Checks that the special rules for joining any special semantics with a ; work. let document = parse_document_async( r#" +

The a element

Content attributes @@ -467,7 +471,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The a element

+

The a element

Content attributes
Also, the name attribute has special semantics on this element: Anchor name; Name of the anchor @@ -488,6 +492,7 @@ mod tests { // repeating the description. let document = parse_document_async( r#" +

The img element

Content attributes @@ -509,7 +514,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

The img element

+

The img element

Content attributes
width diff --git a/src/boilerplate.rs b/src/boilerplate.rs index faab19f..5a4a620 100644 --- a/src/boilerplate.rs +++ b/src/boilerplate.rs @@ -170,14 +170,16 @@ mod tests { "enEnglish", ) .await?; - let document = - parse_document_async("".as_bytes()).await?; + let document = parse_document_async( + "
".as_bytes(), + ) + .await?; let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "
enEnglish
"); + "
enEnglish
"); Ok(()) } @@ -189,15 +191,16 @@ mod tests { "data:text/html,Hello, world!", ) .await?; - let document = - parse_document_async("\">hello".as_bytes()) - .await?; + let document = parse_document_async( + "\">hello".as_bytes(), + ) + .await?; let mut proc = Processor::new(boilerplate_dir.path(), Path::new(".")); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "hello"); + "hello"); Ok(()) } @@ -208,23 +211,23 @@ mod tests { tokio::fs::write(example_dir.path().join("ex2"), "second").await?; tokio::fs::write(example_dir.path().join("ignored"), "bad").await?; let document = - parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) + parse_document_async("
EXAMPLE ex1
\nEXAMPLE ex2  

EXAMPLE ignored

".as_bytes()) .await?; let mut proc = Processor::new(Path::new("."), example_dir.path()); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "
first
second

EXAMPLE ignored

" ); + "
first
second

EXAMPLE ignored

" ); Ok(()) } #[tokio::test] async fn test_errors_unsafe_paths() -> io::Result<()> { let bad_path_examples = [ - "", - "
\">
", - "
EXAMPLE ../foo
", + "", + "
\">
", + "
EXAMPLE ../foo
", ]; for example in bad_path_examples { let document = parse_document_async(example.as_bytes()).await?; diff --git a/src/interface_index.rs b/src/interface_index.rs index 7039fd9..b9bb217 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -188,6 +188,7 @@ mod tests { async fn test_two_interfaces_in_one_block() -> io::Result<()> { let document = parse_document_async( r#" +

 interface HTMLMarqueeElement { ... }
 interface HTMLBlinkElement { ... }
@@ -204,7 +205,7 @@ INSERT INTERFACES HERE
         assert_eq!(
             serialize_for_test(&[document]),
             r#"
-

+

 interface HTMLMarqueeElement { ... }
 interface HTMLBlinkElement { ... }
 
@@ -217,6 +218,7 @@ interface HTMLBlinkElement { ... } async fn test_two_interfaces_in_separate_blocks() -> io::Result<()> { let document = parse_document_async( r#" +

 interface HTMLMarqueeElement { ... }
 
@@ -235,7 +237,7 @@ INSERT INTERFACES HERE assert_eq!( serialize_for_test(&[document]), r#" -

+

 interface HTMLMarqueeElement { ... }
 

@@ -250,6 +252,7 @@ interface HTMLBlinkElement { ... }
     async fn interface_with_partial() -> io::Result<()> {
         let document = parse_document_async(
             r#"
+
 

 interface HTMLMarqueeElement { ... }
 
@@ -268,7 +271,7 @@ INSERT INTERFACES HERE assert_eq!( serialize_for_test(&[document]), r##" -

+

 interface HTMLMarqueeElement { ... }
 

@@ -283,6 +286,7 @@ partial interface HTMLMarqueeElement io::Result<()> {
         let document = parse_document_async(
             r#"
+
 

 interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
@@ -300,7 +304,7 @@ INSERT INTERFACES HERE
         assert_eq!(
             serialize_for_test(&[document]),
             r##"
-

+

 interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
@@ -314,6 +318,7 @@ partial interface HTMLMarqueeElement io::Result<()> {
         let document = parse_document_async(
             r#"
+
 

 partial interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
@@ -330,7 +335,7 @@ INSERT INTERFACES HERE
         assert_eq!(
             serialize_for_test(&[document]),
             r##"
-

+

 partial interface HTMLMarqueeElement { ... }
 partial interface HTMLMarqueeElement { ... }
 
@@ -343,6 +348,7 @@ partial interface HTMLMarqueeElement io::Result<()> { let document = parse_document_async( r#" + INSERT INTERFACES HERE

 interface HTMLMarqueeElement { ... }
@@ -357,12 +363,12 @@ interface HTMLMarqueeElement { ... }
         proc.apply()?;
         assert_eq!(
             serialize_for_test(&[document]),
-            r##"
-
  • HTMLMarqueeElement
+ r#" +
  • HTMLMarqueeElement

 interface HTMLMarqueeElement { ... }
 
- "## + "# .trim() ); Ok(()) @@ -370,7 +376,7 @@ interface HTMLMarqueeElement { ... } #[tokio::test] async fn no_marker() -> io::Result<()> { - let document = parse_document_async("".as_bytes()).await?; + let document = parse_document_async("".as_bytes()).await?; let mut proc = Processor::new(); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); let result = proc.apply(); @@ -381,7 +387,8 @@ interface HTMLMarqueeElement { ... } #[tokio::test] async fn duplicate_marker() -> io::Result<()> { let document = parse_document_async( - "
INSERT INTERFACES HERE
INSERT INTERFACES HERE
".as_bytes(), + "
INSERT INTERFACES HERE
INSERT INTERFACES HERE
" + .as_bytes(), ) .await?; let mut proc = Processor::new(); @@ -395,6 +402,7 @@ interface HTMLMarqueeElement { ... } async fn duplicate_dfn() -> io::Result<()> { let document = parse_document_async( r#" +

 interface HTMLMarqueeElement { ... }
 interface HTMLMarqueeElement { ... }
diff --git a/src/main.rs b/src/main.rs
index 975d24e..14ee301 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -14,11 +14,21 @@ mod dom_utils;
 mod interface_index;
 mod io_utils;
 mod parser;
+mod rcdom_with_line_numbers;
 mod represents;
 mod tag_omission;
 
 #[tokio::main]
 async fn main() -> io::Result<()> {
+    // This gives slightly prettier error-printing.
+    if let Err(e) = run().await {
+        eprintln!("{}", e);
+        std::process::exit(1);
+    }
+    Ok(())
+}
+
+async fn run() -> io::Result<()> {
     // Since we're using Rc in the DOM implementation, we must ensure that tasks
     // which act on it are confined to this thread.
 
diff --git a/src/parser.rs b/src/parser.rs
index a10de56..9c9feeb 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,15 +2,19 @@
 
 use std::io;
 
-use html5ever::driver::{self, Parser};
+use html5ever::driver::{self, ParseOpts, Parser};
 use html5ever::tendril::{ByteTendril, TendrilSink};
-use markup5ever_rcdom::{Handle, RcDom};
+use html5ever::tokenizer::TokenizerOpts;
+use html5ever::tree_builder::TreeBuilderOpts;
+use markup5ever_rcdom::Handle;
 use tokio::io::{AsyncRead, AsyncReadExt};
 
+use crate::rcdom_with_line_numbers::RcDomWithLineNumbers;
+
 async fn parse_internal_async(
-    parser: Parser,
+    parser: Parser,
     mut r: R,
-) -> io::Result {
+) -> io::Result {
     let mut tendril_sink = parser.from_utf8();
 
     // This draws on the structure of the sync tendril read_from.
@@ -35,7 +39,7 @@ async fn parse_internal_async(
         }
     }
     let dom = tendril_sink.finish();
-    Ok(dom.document)
+    Ok(dom)
 }
 
 pub async fn parse_fragment_async(
@@ -43,12 +47,16 @@ pub async fn parse_fragment_async(
     context: &Handle,
 ) -> io::Result> {
     let parser = driver::parse_fragment_for_element(
-        RcDom::default(),
-        Default::default(),
+        RcDomWithLineNumbers::default(),
+        create_error_opts(),
         context.clone(),
         None,
     );
-    let document = parse_internal_async(parser, r).await?;
+
+    let dom = parse_internal_async(parser, r).await?;
+    dom.create_error_from_parse_errors()?;
+
+    let document = dom.document();
     let mut new_children = document.children.take()[0].children.take();
     for new_child in new_children.iter_mut() {
         new_child.parent.take();
@@ -57,8 +65,24 @@ pub async fn parse_fragment_async(
 }
 
 pub async fn parse_document_async(r: R) -> io::Result {
-    let parser = driver::parse_document(RcDom::default(), Default::default());
-    parse_internal_async(parser, r).await
+    let parser = driver::parse_document(RcDomWithLineNumbers::default(), create_error_opts());
+    let dom = parse_internal_async(parser, r).await?;
+    dom.create_error_from_parse_errors()?;
+
+    Ok(dom.document().clone())
+}
+
+fn create_error_opts() -> ParseOpts {
+    ParseOpts {
+        tokenizer: TokenizerOpts {
+            exact_errors: true,
+            ..Default::default()
+        },
+        tree_builder: TreeBuilderOpts {
+            exact_errors: true,
+            ..Default::default()
+        },
+    }
 }
 
 #[cfg(test)]
@@ -95,7 +119,7 @@ pub(crate) mod tests {
         // we're in. This is important because of the special rules
         // surrounding, e.g., tables. If you change this to use the body as context,
         // no element at all is emitted.
-        let document = parse_document_async("".as_bytes()).await?;
+        let document = parse_document_async("
".as_bytes()).await?; let body = document.children.borrow()[1].children.borrow()[1].clone(); assert!(body.is_html_element(&local_name!("body"))); let table = body.children.borrow()[0].clone(); @@ -104,4 +128,60 @@ pub(crate) mod tests { assert_eq!(serialize_for_test(&children), ""); Ok(()) } + + #[tokio::test] + async fn test_document_error_line_number() -> io::Result<()> { + let result = + parse_document_async("Hello\nworld".as_bytes()) + .await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("Line 2: ")); + + Ok(()) + } + + #[tokio::test] + async fn test_document_error_exact() -> io::Result<()> { + let result = + parse_document_async("&asdf;".as_bytes()) + .await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("&asdf;")); + + Ok(()) + } + + #[tokio::test] + async fn test_fragment_error_line_number() -> io::Result<()> { + let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let result = + parse_fragment_async("Hello \n\nworld".as_bytes(), &body).await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("Line 3: ")); + + Ok(()) + } + + #[tokio::test] + async fn test_fragment_error_exact() -> io::Result<()> { + let document = parse_document_async("".as_bytes()).await?; + let body = document.children.borrow()[1].children.borrow()[1].clone(); + assert!(body.is_html_element(&local_name!("body"))); + let result = + parse_fragment_async("&asdf;".as_bytes(), &body).await; + + let error = result.unwrap_err(); + assert_eq!(error.kind(), io::ErrorKind::InvalidData); + assert!(error.to_string().contains("&asdf;")); + + Ok(()) + } } diff --git a/src/rcdom_with_line_numbers.rs b/src/rcdom_with_line_numbers.rs new file mode 100644 index 0000000..3f5e6c2 --- /dev/null +++ b/src/rcdom_with_line_numbers.rs @@ -0,0 +1,141 @@ +// This provides a wrapper around RcDom which tracks line numbers in the errors. + +use delegate::delegate; +use html5ever::interface::TreeSink; +use html5ever::{ + tendril::StrTendril, + tree_builder::{ElementFlags, NextParserState, NodeOrText, QuirksMode}, + Attribute, ExpandedName, QualName, +}; +use markup5ever_rcdom::{Handle, RcDom}; +use std::borrow::Cow; +use std::io; + +pub struct RcDomWithLineNumbers { + dom: RcDom, + current_line: u64, +} + +impl RcDomWithLineNumbers { + // Expose out the document and errors from the inner RcDom + pub fn document(&self) -> &Handle { + &self.dom.document + } + + pub fn create_error_from_parse_errors(&self) -> io::Result<()> { + if !self.dom.errors.is_empty() { + let error_messages = self + .dom + .errors + .iter() + .map(|e| e.to_string()) + .collect::>() + .join("\n"); + Err(io::Error::new( + io::ErrorKind::InvalidData, + format!("Parse errors encountered:\n\n{}", error_messages), + )) + } else { + Ok(()) + } + } +} + +impl Default for RcDomWithLineNumbers { + fn default() -> Self { + Self { + dom: RcDom::default(), + current_line: 1, + } + } +} + +impl TreeSink for RcDomWithLineNumbers { + type Output = RcDomWithLineNumbers; + type Handle = ::Handle; + + // Override the parse_error method to add line numbers to the error messages. + fn parse_error(&mut self, msg: Cow<'static, str>) { + let msg_with_line = format!("Line {}: {}", self.current_line, msg); + self.dom.parse_error(Cow::Owned(msg_with_line)); + } + + // Override to track the current line number. + fn set_current_line(&mut self, line: u64) { + self.current_line = line; + } + + // Override to return RcDomWithLineNumbers instead of RcDom. + fn finish(self) -> Self::Output { + self + } + + // Delegate all other methods to RcDom. + delegate! { + to self.dom { + fn get_document(&mut self) -> Self::Handle; + + fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> ExpandedName<'a>; + + fn create_element( + &mut self, + name: QualName, + attrs: Vec, + flags: ElementFlags, + ) -> Self::Handle; + + fn create_comment(&mut self, text: StrTendril) -> Self::Handle; + + fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle; + + fn append(&mut self, parent: &Self::Handle, child: NodeOrText); + + fn append_based_on_parent_node( + &mut self, + element: &Self::Handle, + prev_element: &Self::Handle, + child: NodeOrText, + ); + + fn append_doctype_to_document( + &mut self, + name: StrTendril, + public_id: StrTendril, + system_id: StrTendril, + ); + + fn mark_script_already_started(&mut self, node: &Self::Handle); + + fn pop(&mut self, node: &Self::Handle); + + fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle; + + fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool; + + fn set_quirks_mode(&mut self, mode: QuirksMode); + + fn append_before_sibling( + &mut self, + sibling: &Self::Handle, + new_node: NodeOrText, + ); + + fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec); + + fn associate_with_form( + &mut self, + target: &Self::Handle, + form: &Self::Handle, + nodes: (&Self::Handle, Option<&Self::Handle>), + ); + + fn remove_from_parent(&mut self, target: &Self::Handle); + + fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle); + + fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool; + + fn complete_script(&mut self, node: &Self::Handle) -> NextParserState; + } + } +} diff --git a/src/represents.rs b/src/represents.rs index ebb0474..e357f41 100644 --- a/src/represents.rs +++ b/src/represents.rs @@ -128,13 +128,13 @@ mod tests { #[tokio::test] async fn test_represents() -> io::Result<()> { // Uses can occur either before or after. - let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; let mut proc = Processor::new(); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply()?; assert_eq!( serialize_for_test(&[document]), - "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" + "

A seat\nat a table.

The chair element represents a seat\nat a table.

A seat\nat a table.

" ); Ok(()) } @@ -142,7 +142,7 @@ mod tests { #[tokio::test] async fn test_represents_undefined() -> io::Result<()> { // Uses can occur either before or after. - let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; + let document = parse_document_async("

The chair element represents a seat\nat a table.

".as_bytes()).await?; let mut proc = Processor::new(); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); let result = proc.apply(); diff --git a/src/tag_omission.rs b/src/tag_omission.rs index edbb80a..8676b19 100644 --- a/src/tag_omission.rs +++ b/src/tag_omission.rs @@ -206,6 +206,7 @@ mod tests { async fn test_simple() -> io::Result<()> { let document = parse_document_async( r#" +

Optional tags

A td element does very tdish things and may be very cellular.

An audio element is quite audible.

@@ -258,7 +259,7 @@ mod tests { assert_eq!( serialize_for_test(&[document]), r#" -

Optional tags

+

Optional tags

A td element does very tdish things and may be very cellular.

An audio element is quite audible.

Another section