From 2b6ba9e03d8cfcd40f1c26cfe6ad571f3b35a837 Mon Sep 17 00:00:00 2001 From: Domenic Denicola Date: Fri, 9 Feb 2024 14:23:18 +0900 Subject: [PATCH] Hacky WIP on more parse errors --- src/parser.rs | 43 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 36 insertions(+), 7 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index a10de56..adcb6ee 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -2,15 +2,17 @@ use std::io; -use html5ever::driver::{self, Parser}; +use html5ever::driver::{self, ParseOpts, Parser}; use html5ever::tendril::{ByteTendril, TendrilSink}; +use html5ever::tokenizer::TokenizerOpts; +use html5ever::tree_builder::TreeBuilderOpts; use markup5ever_rcdom::{Handle, RcDom}; use tokio::io::{AsyncRead, AsyncReadExt}; async fn parse_internal_async( parser: Parser, mut r: R, -) -> io::Result { +) -> io::Result { let mut tendril_sink = parser.from_utf8(); // This draws on the structure of the sync tendril read_from. @@ -35,7 +37,7 @@ async fn parse_internal_async( } } let dom = tendril_sink.finish(); - Ok(dom.document) + Ok(dom) } pub async fn parse_fragment_async( @@ -44,11 +46,12 @@ pub async fn parse_fragment_async( ) -> io::Result> { let parser = driver::parse_fragment_for_element( RcDom::default(), - Default::default(), + create_error_opts(), context.clone(), None, ); - let document = parse_internal_async(parser, r).await?; + // TODO handle errors here too I guess + let document = parse_internal_async(parser, r).await?.document; let mut new_children = document.children.take()[0].children.take(); for new_child in new_children.iter_mut() { new_child.parent.take(); @@ -57,8 +60,34 @@ pub async fn parse_fragment_async( } pub async fn parse_document_async(r: R) -> io::Result { - let parser = driver::parse_document(RcDom::default(), Default::default()); - parse_internal_async(parser, r).await + let parser = driver::parse_document(RcDom::default(), create_error_opts()); + let dom = parse_internal_async(parser, r).await?; + + if !dom.errors.is_empty() { + for error in dom.errors { + eprintln!("{}", error); + } + return Err(io::Error::new( + io::ErrorKind::InvalidData, + format!( + "Parse errors encountered" + ), + )) + } + Ok(dom.document) +} + +fn create_error_opts() -> ParseOpts { + ParseOpts { + tokenizer: TokenizerOpts { + exact_errors: true, + ..Default::default() + }, + tree_builder: TreeBuilderOpts { + exact_errors: true, + ..Default::default() + }, + } } #[cfg(test)]