diff --git a/Cargo.lock b/Cargo.lock index 3e3aca4..de56600 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,6 +41,17 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "delegate" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e018fccbeeb50ff26562ece792ed06659b9c2dae79ece77c4456bb10d9bf79b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.18", +] + [[package]] name = "errno" version = "0.3.1" @@ -111,6 +122,7 @@ checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" name = "html-build" version = "0.0.0" dependencies = [ + "delegate", "html5ever", "markup5ever_rcdom", "regex", diff --git a/Cargo.toml b/Cargo.toml index ba47202..b298fe8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,6 +11,7 @@ tokio = { version = "1", features = ["full"] } html5ever = "0.26.0" markup5ever_rcdom = "0.2.0" regex = "1" +delegate = "0.12.0" [dev-dependencies] tempfile = "3" diff --git a/src/annotate_attributes.rs b/src/annotate_attributes.rs index def2f9d..33a95bf 100644 --- a/src/annotate_attributes.rs +++ b/src/annotate_attributes.rs @@ -313,6 +313,7 @@ mod tests { // reordered in the HTML spec). let document = parse_document_async( r#" +
href
@@ -415,6 +417,7 @@ mod tests {
// Checks that the special rules for using : instead of an em dash work.
let document = parse_document_async(
r#"
+
name
attribute has special semantics on this element: Anchor name
@@ -450,6 +453,7 @@ mod tests {
// Checks that the special rules for joining any special semantics with a ; work.
let document = parse_document_async(
r#"
+
name
attribute has special semantics on this element: Anchor name; Name of the anchor
@@ -488,6 +492,7 @@ mod tests {
// repeating the description.
let document = parse_document_async(
r#"
+
width
diff --git a/src/boilerplate.rs b/src/boilerplate.rs
index faab19f..5a4a620 100644
--- a/src/boilerplate.rs
+++ b/src/boilerplate.rs
@@ -170,14 +170,16 @@ mod tests {
"en | English |
en | English |
EXAMPLE ex1
\nEXAMPLE ex2
EXAMPLE ignored
".as_bytes()) + parse_document_async("EXAMPLE ex1
\nEXAMPLE ex2
EXAMPLE ignored
".as_bytes()) .await?; let mut proc = Processor::new(Path::new("."), example_dir.path()); dom_utils::scan_dom(&document, &mut |h| proc.visit(h)); proc.apply().await?; assert_eq!( serialize_for_test(&[document]), - "first
second
EXAMPLE ignored
" ); + "first
second
EXAMPLE ignored
" ); Ok(()) } #[tokio::test] async fn test_errors_unsafe_paths() -> io::Result<()> { let bad_path_examples = [ - "", - "\">", - "
EXAMPLE ../foo", + "", + "
\">", + "
EXAMPLE ../foo", ]; for example in bad_path_examples { let document = parse_document_async(example.as_bytes()).await?; diff --git a/src/interface_index.rs b/src/interface_index.rs index 7039fd9..b9bb217 100644 --- a/src/interface_index.rs +++ b/src/interface_index.rs @@ -188,6 +188,7 @@ mod tests { async fn test_two_interfaces_in_one_block() -> io::Result<()> { let document = parse_document_async( r#" +
interface HTMLMarqueeElement { ... }
interface HTMLBlinkElement { ... }
@@ -204,7 +205,7 @@ INSERT INTERFACES HERE
assert_eq!(
serialize_for_test(&[document]),
r#"
-
+
interface HTMLMarqueeElement { ... }
interface HTMLBlinkElement { ... }
@@ -217,6 +218,7 @@ interface HTMLBlinkElement { ... }
async fn test_two_interfaces_in_separate_blocks() -> io::Result<()> {
let document = parse_document_async(
r#"
+
interface HTMLMarqueeElement { ... }
@@ -235,7 +237,7 @@ INSERT INTERFACES HERE
assert_eq!(
serialize_for_test(&[document]),
r#"
-
+
interface HTMLMarqueeElement { ... }
@@ -250,6 +252,7 @@ interface HTMLBlinkElement { ... }
async fn interface_with_partial() -> io::Result<()> {
let document = parse_document_async(
r#"
+
interface HTMLMarqueeElement { ... }
@@ -268,7 +271,7 @@ INSERT INTERFACES HERE
assert_eq!(
serialize_for_test(&[document]),
r##"
-
+
interface HTMLMarqueeElement { ... }
@@ -283,6 +286,7 @@ partial interface HTMLMarqueeElement io::Result<()> {
let document = parse_document_async(
r#"
+
interface HTMLMarqueeElement { ... }
partial interface HTMLMarqueeElement { ... }
@@ -300,7 +304,7 @@ INSERT INTERFACES HERE
assert_eq!(
serialize_for_test(&[document]),
r##"
-
+
interface HTMLMarqueeElement { ... }
partial interface HTMLMarqueeElement { ... }
partial interface HTMLMarqueeElement { ... }
@@ -314,6 +318,7 @@ partial interface HTMLMarqueeElement io::Result<()> {
let document = parse_document_async(
r#"
+
partial interface HTMLMarqueeElement { ... }
partial interface HTMLMarqueeElement { ... }
@@ -330,7 +335,7 @@ INSERT INTERFACES HERE
assert_eq!(
serialize_for_test(&[document]),
r##"
-
+
partial interface HTMLMarqueeElement { ... }
partial interface HTMLMarqueeElement { ... }
@@ -343,6 +348,7 @@ partial interface HTMLMarqueeElement io::Result<()> {
let document = parse_document_async(
r#"
+
INSERT INTERFACES HERE
interface HTMLMarqueeElement { ... }
@@ -357,12 +363,12 @@ interface HTMLMarqueeElement { ... }
proc.apply()?;
assert_eq!(
serialize_for_test(&[document]),
- r##"
-HTMLMarqueeElement
+ r#"
+HTMLMarqueeElement
interface HTMLMarqueeElement { ... }
- "##
+ "#
.trim()
);
Ok(())
@@ -370,7 +376,7 @@ interface HTMLMarqueeElement { ... }
#[tokio::test]
async fn no_marker() -> io::Result<()> {
- let document = parse_document_async("".as_bytes()).await?;
+ let document = parse_document_async("".as_bytes()).await?;
let mut proc = Processor::new();
dom_utils::scan_dom(&document, &mut |h| proc.visit(h));
let result = proc.apply();
@@ -381,7 +387,8 @@ interface HTMLMarqueeElement { ... }
#[tokio::test]
async fn duplicate_marker() -> io::Result<()> {
let document = parse_document_async(
- "INSERT INTERFACES HEREINSERT INTERFACES HERE".as_bytes(),
+ "INSERT INTERFACES HEREINSERT INTERFACES HERE"
+ .as_bytes(),
)
.await?;
let mut proc = Processor::new();
@@ -395,6 +402,7 @@ interface HTMLMarqueeElement { ... }
async fn duplicate_dfn() -> io::Result<()> {
let document = parse_document_async(
r#"
+
interface HTMLMarqueeElement { ... }
interface HTMLMarqueeElement { ... }
diff --git a/src/main.rs b/src/main.rs
index 975d24e..14ee301 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -14,11 +14,21 @@ mod dom_utils;
mod interface_index;
mod io_utils;
mod parser;
+mod rcdom_with_line_numbers;
mod represents;
mod tag_omission;
#[tokio::main]
async fn main() -> io::Result<()> {
+ // Prints any error returned by run() to stderr using its Display form and exits with status 1; this gives slightly prettier error-printing.
+ if let Err(e) = run().await {
+ eprintln!("{}", e);
+ std::process::exit(1);
+ }
+ Ok(())
+}
+
+async fn run() -> io::Result<()> {
// Since we're using Rc in the DOM implementation, we must ensure that tasks
// which act on it are confined to this thread.
diff --git a/src/parser.rs b/src/parser.rs
index a10de56..9c9feeb 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -2,15 +2,19 @@
use std::io;
-use html5ever::driver::{self, Parser};
+use html5ever::driver::{self, ParseOpts, Parser};
use html5ever::tendril::{ByteTendril, TendrilSink};
-use markup5ever_rcdom::{Handle, RcDom};
+use html5ever::tokenizer::TokenizerOpts;
+use html5ever::tree_builder::TreeBuilderOpts;
+use markup5ever_rcdom::Handle;
use tokio::io::{AsyncRead, AsyncReadExt};
+use crate::rcdom_with_line_numbers::RcDomWithLineNumbers;
+
async fn parse_internal_async(
- parser: Parser,
+ parser: Parser,
mut r: R,
-) -> io::Result {
+) -> io::Result {
let mut tendril_sink = parser.from_utf8();
// This draws on the structure of the sync tendril read_from.
@@ -35,7 +39,7 @@ async fn parse_internal_async(
}
}
let dom = tendril_sink.finish();
- Ok(dom.document)
+ Ok(dom)
}
pub async fn parse_fragment_async(
@@ -43,12 +47,16 @@ pub async fn parse_fragment_async(
context: &Handle,
) -> io::Result> {
let parser = driver::parse_fragment_for_element(
- RcDom::default(),
- Default::default(),
+ RcDomWithLineNumbers::default(),
+ create_error_opts(),
context.clone(),
None,
);
- let document = parse_internal_async(parser, r).await?;
+
+ let dom = parse_internal_async(parser, r).await?;
+ dom.create_error_from_parse_errors()?;
+
+ let document = dom.document();
let mut new_children = document.children.take()[0].children.take();
for new_child in new_children.iter_mut() {
new_child.parent.take();
@@ -57,8 +65,24 @@ pub async fn parse_fragment_async(
}
pub async fn parse_document_async(r: R) -> io::Result {
- let parser = driver::parse_document(RcDom::default(), Default::default());
- parse_internal_async(parser, r).await
+ let parser = driver::parse_document(RcDomWithLineNumbers::default(), create_error_opts()); // line-tracking sink plus the shared parse options
+ let dom = parse_internal_async(parser, r).await?;
+ dom.create_error_from_parse_errors()?; // any recorded parse error aborts here as an InvalidData io::Error
+
+ Ok(dom.document().clone()) // document() only lends a reference, so clone the handle to return it
+}
+
+fn create_error_opts() -> ParseOpts { // Parse options used by both the document and the fragment parser in this file.
+ ParseOpts {
+ tokenizer: TokenizerOpts {
+ exact_errors: true, // presumably makes tokenizer errors more detailed; confirm against the html5ever TokenizerOpts docs
+ ..Default::default()
+ },
+ tree_builder: TreeBuilderOpts {
+ exact_errors: true, // same switch for the tree builder; confirm against the html5ever TreeBuilderOpts docs
+ ..Default::default()
+ },
+ }
 }
#[cfg(test)]
@@ -95,7 +119,7 @@ pub(crate) mod tests {
// we're in. This is important because of the special rules
// surrounding, e.g., tables. If you change this to use the body as context,
// no element at all is emitted.
- let document = parse_document_async("".as_bytes()).await?;
+ let document = parse_document_async("
".as_bytes()).await?;
let body = document.children.borrow()[1].children.borrow()[1].clone();
assert!(body.is_html_element(&local_name!("body")));
let table = body.children.borrow()[0].clone();
@@ -104,4 +128,60 @@ pub(crate) mod tests {
assert_eq!(serialize_for_test(&children), "");
Ok(())
}
+
+ #[tokio::test]
+ async fn test_document_error_line_number() -> io::Result<()> { // A failed document parse must report the line of the error.
+ let result =
+ parse_document_async("Hello\nworld".as_bytes()) // NOTE(review): this literal looks like it lost its markup in this copy of the patch; confirm against the repository
+ .await;
+
+ let error = result.unwrap_err();
+ assert_eq!(error.kind(), io::ErrorKind::InvalidData);
+ assert!(error.to_string().contains("Line 2: ")); // the "Line N: " prefix is added by RcDomWithLineNumbers::parse_error
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ async fn test_document_error_exact() -> io::Result<()> { // A failed document parse must mention the offending input text.
+ let result =
+ parse_document_async("&asdf;".as_bytes()) // NOTE(review): surrounding markup may have been lost in this copy of the patch; confirm against the repository
+ .await;
+
+ let error = result.unwrap_err();
+ assert_eq!(error.kind(), io::ErrorKind::InvalidData);
+ assert!(error.to_string().contains("&asdf;")); // presumably only present because create_error_opts enables exact_errors; verify
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ async fn test_fragment_error_line_number() -> io::Result<()> { // A failed fragment parse must report the line of the error.
+ let document = parse_document_async("".as_bytes()).await?; // NOTE(review): this literal looks like it lost its markup in this copy of the patch; confirm against the repository
+ let body = document.children.borrow()[1].children.borrow()[1].clone(); // second child of the document's second child; asserted below to be <body>
+ assert!(body.is_html_element(&local_name!("body")));
+ let result =
+ parse_fragment_async("Hello \n\nworld".as_bytes(), &body).await; // the fragment is parsed with <body> as its context element
+
+ let error = result.unwrap_err();
+ assert_eq!(error.kind(), io::ErrorKind::InvalidData);
+ assert!(error.to_string().contains("Line 3: ")); // the "Line N: " prefix is added by RcDomWithLineNumbers::parse_error
+
+ Ok(())
+ }
+
+ #[tokio::test]
+ async fn test_fragment_error_exact() -> io::Result<()> { // A failed fragment parse must mention the offending input text.
+ let document = parse_document_async("".as_bytes()).await?; // NOTE(review): this literal looks like it lost its markup in this copy of the patch; confirm against the repository
+ let body = document.children.borrow()[1].children.borrow()[1].clone(); // second child of the document's second child; asserted below to be <body>
+ assert!(body.is_html_element(&local_name!("body")));
+ let result =
+ parse_fragment_async("&asdf;".as_bytes(), &body).await; // the fragment is parsed with <body> as its context element
+
+ let error = result.unwrap_err();
+ assert_eq!(error.kind(), io::ErrorKind::InvalidData);
+ assert!(error.to_string().contains("&asdf;")); // presumably only present because create_error_opts enables exact_errors; verify
+
+ Ok(())
+ }
}
diff --git a/src/rcdom_with_line_numbers.rs b/src/rcdom_with_line_numbers.rs
new file mode 100644
index 0000000..3f5e6c2
--- /dev/null
+++ b/src/rcdom_with_line_numbers.rs
@@ -0,0 +1,141 @@
+// This provides a wrapper around RcDom which tracks line numbers in the errors.
+
+use delegate::delegate;
+use html5ever::interface::TreeSink;
+use html5ever::{
+ tendril::StrTendril,
+ tree_builder::{ElementFlags, NextParserState, NodeOrText, QuirksMode},
+ Attribute, ExpandedName, QualName,
+};
+use markup5ever_rcdom::{Handle, RcDom};
+use std::borrow::Cow;
+use std::io;
+
+pub struct RcDomWithLineNumbers { // RcDom wrapper that prefixes every recorded parse error with a line number.
+ dom: RcDom, // wrapped DOM; receives all delegated TreeSink calls and stores the error messages
+ current_line: u64, // last value passed to set_current_line; Default starts it at 1
+}
+
+impl RcDomWithLineNumbers {
+    /// Returns the document root handle of the wrapped `RcDom`.
+    pub fn document(&self) -> &Handle {
+        &self.dom.document
+    }
+
+    /// Returns an `InvalidData` error listing every recorded parse error, or `Ok(())` if there are none.
+    pub fn create_error_from_parse_errors(&self) -> io::Result<()> {
+        if self.dom.errors.is_empty() {
+            return Ok(());
+        }
+        let error_messages = self
+            .dom
+            .errors
+            .iter()
+            .map(|e| e.to_string())
+            .collect::<Vec<String>>()
+            .join("\n");
+        Err(io::Error::new(
+            io::ErrorKind::InvalidData,
+            format!("Parse errors encountered:\n\n{}", error_messages),
+        ))
+    }
+}
+
+impl Default for RcDomWithLineNumbers {
+ fn default() -> Self { // Wraps a default RcDom.
+ Self {
+ dom: RcDom::default(),
+ current_line: 1, // errors reported before any set_current_line call are attributed to line 1
+ }
+ }
+}
+
+impl TreeSink for RcDomWithLineNumbers {
+    type Output = RcDomWithLineNumbers;
+    type Handle = <RcDom as TreeSink>::Handle;
+
+    // Prefix each parse error with the most recently reported line, then store it in the inner RcDom.
+    fn parse_error(&mut self, msg: Cow<'static, str>) {
+        let msg_with_line = format!("Line {}: {}", self.current_line, msg);
+        self.dom.parse_error(Cow::Owned(msg_with_line));
+    }
+
+    // Remember the line number reported by the parser so parse_error can use it.
+    fn set_current_line(&mut self, line: u64) {
+        self.current_line = line;
+    }
+
+    // Return the whole wrapper instead of the inner RcDom, so callers keep access to its helpers.
+    fn finish(self) -> Self::Output {
+        self
+    }
+
+    // Every other TreeSink method is forwarded unchanged to the inner RcDom.
+    delegate! {
+        to self.dom {
+            fn get_document(&mut self) -> Self::Handle;
+
+            fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> ExpandedName<'a>;
+
+            fn create_element(
+                &mut self,
+                name: QualName,
+                attrs: Vec<Attribute>,
+                flags: ElementFlags,
+            ) -> Self::Handle;
+
+            fn create_comment(&mut self, text: StrTendril) -> Self::Handle;
+
+            fn create_pi(&mut self, target: StrTendril, data: StrTendril) -> Self::Handle;
+
+            fn append(&mut self, parent: &Self::Handle, child: NodeOrText<Self::Handle>);
+
+            fn append_based_on_parent_node(
+                &mut self,
+                element: &Self::Handle,
+                prev_element: &Self::Handle,
+                child: NodeOrText<Self::Handle>,
+            );
+
+            fn append_doctype_to_document(
+                &mut self,
+                name: StrTendril,
+                public_id: StrTendril,
+                system_id: StrTendril,
+            );
+
+            fn mark_script_already_started(&mut self, node: &Self::Handle);
+
+            fn pop(&mut self, node: &Self::Handle);
+
+            fn get_template_contents(&mut self, target: &Self::Handle) -> Self::Handle;
+
+            fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool;
+
+            fn set_quirks_mode(&mut self, mode: QuirksMode);
+
+            fn append_before_sibling(
+                &mut self,
+                sibling: &Self::Handle,
+                new_node: NodeOrText<Self::Handle>,
+            );
+
+            fn add_attrs_if_missing(&mut self, target: &Self::Handle, attrs: Vec<Attribute>);
+
+            fn associate_with_form(
+                &mut self,
+                target: &Self::Handle,
+                form: &Self::Handle,
+                nodes: (&Self::Handle, Option<&Self::Handle>),
+            );
+
+            fn remove_from_parent(&mut self, target: &Self::Handle);
+
+            fn reparent_children(&mut self, node: &Self::Handle, new_parent: &Self::Handle);
+
+            fn is_mathml_annotation_xml_integration_point(&self, handle: &Self::Handle) -> bool;
+
+            fn complete_script(&mut self, node: &Self::Handle) -> NextParserState;
+        }
+    }
+}
diff --git a/src/represents.rs b/src/represents.rs
index ebb0474..e357f41 100644
--- a/src/represents.rs
+++ b/src/represents.rs
@@ -128,13 +128,13 @@ mod tests {
#[tokio::test]
async fn test_represents() -> io::Result<()> {
// Uses can occur either before or after.
- let document = parse_document_async("The chair
element represents a seat\nat a table
.
".as_bytes()).await?;
+ let document = parse_document_async("
The chair
element represents a seat\nat a table
.
".as_bytes()).await?;
let mut proc = Processor::new();
dom_utils::scan_dom(&document, &mut |h| proc.visit(h));
proc.apply()?;
assert_eq!(
serialize_for_test(&[document]),
- "
A seat\nat a table
.
The chair
element represents a seat\nat a table
.
A seat\nat a table
.
"
+ "A seat\nat a table
.
The chair
element represents a seat\nat a table
.
A seat\nat a table
.
"
);
Ok(())
}
@@ -142,7 +142,7 @@ mod tests {
#[tokio::test]
async fn test_represents_undefined() -> io::Result<()> {
// Uses can occur either before or after.
- let document = parse_document_async("The chair
element represents a seat\nat a table
.
".as_bytes()).await?;
+ let document = parse_document_async("
The chair
element represents a seat\nat a table
.
".as_bytes()).await?;
let mut proc = Processor::new();
dom_utils::scan_dom(&document, &mut |h| proc.visit(h));
let result = proc.apply();
diff --git a/src/tag_omission.rs b/src/tag_omission.rs
index edbb80a..8676b19 100644
--- a/src/tag_omission.rs
+++ b/src/tag_omission.rs
@@ -206,6 +206,7 @@ mod tests {
async fn test_simple() -> io::Result<()> {
let document = parse_document_async(
r#"
+
Optional tags
A td
element does very tdish things and may be very cellular.
An audio
element is quite audible.
@@ -258,7 +259,7 @@ mod tests {
assert_eq!(
serialize_for_test(&[document]),
r#"
-Optional tags
+Optional tags
A td
element does very tdish things and may be very cellular.
An audio
element is quite audible.
Another section