From 0fa2a8b2c1e42167fb9b3fdf646b346515de5a35 Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 09:30:34 +0200 Subject: [PATCH 1/7] Change the check! macro to more flexibly define buffers --- src/reader.rs | 251 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 153 insertions(+), 98 deletions(-) diff --git a/src/reader.rs b/src/reader.rs index bb496932..79fb4dff 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1530,6 +1530,43 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] { } } +/// This is just a helper implementation for using `&mut ()` as buffer while reading from an +/// `&[u8]` to unify how the `check!` macro below works. +impl<'a, 'b> XmlSource<'a, &'b mut ()> for &'a [u8] { + fn read_bytes_until( + &mut self, + byte: u8, + _buf: &mut (), + position: &mut usize, + ) -> Result> { + self.read_bytes_until(byte, (), position) + } + + fn read_bang_element( + &mut self, + _buf: &mut (), + position: &mut usize, + ) -> Result> { + self.read_bang_element((), position) + } + + fn read_element(&mut self, _buf: &mut (), position: &mut usize) -> Result> { + self.read_element((), position) + } + + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + >::skip_whitespace(self, position) + } + + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + >::skip_one(self, byte, position) + } + + fn peek_one(&mut self) -> Result> { + >::peek_one(self) + } +} + /// Possible elements started with ` Option<&'static Encoding> { #[cfg(test)] mod test { macro_rules! check { - ($buf:expr) => { + ($(let mut $buf:ident = $init:expr;)?) => { mod read_bytes_until { use crate::reader::XmlSource; // Use Bytes for printing bytes as strings for ASCII range @@ -1826,14 +1863,14 @@ mod test { /// Checks that search in the empty buffer returns `None` #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; let mut input = b"".as_ref(); // ^= 0 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), None @@ -1845,14 +1882,14 @@ mod test { /// as a result and set `position` to `len()` #[test] fn non_existent() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"abcdef".as_ref(); // ^= 6 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"abcdef")) @@ -1865,14 +1902,14 @@ mod test { /// after match (`1`) #[test] fn at_the_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"*abcdef".as_ref(); // ^= 1 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"")) @@ -1885,14 +1922,14 @@ mod test { /// symbol after match #[test] fn inside() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"abc*def".as_ref(); // ^= 4 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"abc")) @@ -1905,14 +1942,14 @@ mod test { /// symbol after match (`len()`) #[test] fn in_the_end() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"abcdef*".as_ref(); // ^= 7 assert_eq!( input - .read_bytes_until(b'*', buf, &mut position) + .read_bytes_until(b'*', $(&mut $buf, )? &mut position) .unwrap() .map(Bytes), Some(Bytes(b"abcdef")) @@ -1934,12 +1971,12 @@ mod test { #[test] #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; let mut input = b"![]]>other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1954,12 +1991,12 @@ mod test { /// is not found, parsing ends with an error #[test] fn not_closed() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"![CDATA[other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1973,14 +2010,14 @@ mod test { /// Checks that CDATA element without content inside parsed successfully #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"![CDATA[]]>other content".as_ref(); // ^= 11 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::CData, Bytes(b"![CDATA["))) @@ -1993,14 +2030,14 @@ mod test { /// a CDATA end sequence do not interrupt CDATA parsing #[test] fn with_content() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref(); // ^= 28 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content"))) @@ -2034,12 +2071,12 @@ mod test { #[test] #[ignore = "start comment sequence fully checked outside of `read_bang_element`"] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; let mut input = b"!- -->other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2052,12 +2089,12 @@ mod test { #[test] fn not_properly_end() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!->other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2070,12 +2107,12 @@ mod test { #[test] fn not_closed1() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!--other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2088,12 +2125,12 @@ mod test { #[test] fn not_closed2() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!-->other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2106,12 +2143,12 @@ mod test { #[test] fn not_closed3() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!--->other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2124,14 +2161,14 @@ mod test { #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; let mut input = b"!---->other content".as_ref(); // ^= 6 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::Comment, Bytes(b"!----"))) @@ -2141,14 +2178,14 @@ mod test { #[test] fn with_content() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!--->comment<--->other content".as_ref(); // ^= 17 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::Comment, Bytes(b"!--->comment<---"))) @@ -2167,12 +2204,12 @@ mod test { #[test] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!D other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2185,12 +2222,12 @@ mod test { #[test] fn without_space() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!DOCTYPEother content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2203,14 +2240,14 @@ mod test { #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!DOCTYPE>other content".as_ref(); // ^= 9 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::DocType, Bytes(b"!DOCTYPE"))) @@ -2220,12 +2257,12 @@ mod test { #[test] fn not_closed() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; let mut input = b"!DOCTYPE other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2245,12 +2282,12 @@ mod test { #[test] fn not_properly_start() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!d other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2263,12 +2300,12 @@ mod test { #[test] fn without_space() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!doctypeother content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2281,14 +2318,14 @@ mod test { #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!doctype>other content".as_ref(); // ^= 9 assert_eq!( input - .read_bang_element(buf, &mut position) + .read_bang_element($(&mut $buf, )? &mut position) .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::DocType, Bytes(b"!doctype"))) @@ -2298,12 +2335,12 @@ mod test { #[test] fn not_closed() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"!doctype other content".as_ref(); // ^= 0 - match input.read_bang_element(buf, &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2325,12 +2362,12 @@ mod test { /// Checks that nothing was read from empty buffer #[test] fn empty() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; let mut input = b"".as_ref(); // ^= 0 - assert_eq!(input.read_element(buf, &mut position).unwrap().map(Bytes), None); + assert_eq!(input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), None); assert_eq!(position, 0); } @@ -2341,13 +2378,13 @@ mod test { #[test] fn empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b">".as_ref(); // ^= 1 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b"")) ); assert_eq!(position, 1); @@ -2355,13 +2392,13 @@ mod test { #[test] fn normal() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"tag>".as_ref(); // ^= 4 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b"tag")) ); assert_eq!(position, 4); @@ -2369,13 +2406,13 @@ mod test { #[test] fn empty_ns_empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b":>".as_ref(); // ^= 2 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":")) ); assert_eq!(position, 2); @@ -2383,13 +2420,13 @@ mod test { #[test] fn empty_ns() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b":tag>".as_ref(); // ^= 5 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":tag")) ); assert_eq!(position, 5); @@ -2397,13 +2434,13 @@ mod test { #[test] fn with_attributes() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); // ^= 38 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)) ); assert_eq!(position, 38); @@ -2417,13 +2454,13 @@ mod test { #[test] fn empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"/>".as_ref(); // ^= 2 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b"/")) ); assert_eq!(position, 2); @@ -2431,13 +2468,13 @@ mod test { #[test] fn normal() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b"tag/>".as_ref(); // ^= 5 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b"tag/")) ); assert_eq!(position, 5); @@ -2445,13 +2482,13 @@ mod test { #[test] fn empty_ns_empty_tag() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b":/>".as_ref(); // ^= 3 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":/")) ); assert_eq!(position, 3); @@ -2459,13 +2496,13 @@ mod test { #[test] fn empty_ns() { - let buf = $buf; + $(let mut $buf = $init;)? let mut position = 0; let mut input = b":tag/>".as_ref(); // ^= 6 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(b":tag/")) ); assert_eq!(position, 6); @@ -2473,13 +2510,13 @@ mod test { #[test] fn with_attributes() { - let buf = $buf; + $(let mut $buf = $init;)? 
let mut position = 0; let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref(); // ^= 41 assert_eq!( - input.read_element(buf, &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), Some(Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)) ); assert_eq!(position, 41); @@ -2494,8 +2531,9 @@ mod test { fn cdata() { let doc = "![]]>"; let mut reader = crate::Reader::from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -2509,8 +2547,9 @@ mod test { fn comment() { let doc = "!- -->"; let mut reader = crate::Reader::from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -2524,8 +2563,9 @@ mod test { fn doctype_uppercase() { let doc = "!D>"; let mut reader = crate::Reader::from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2539,8 +2579,9 @@ mod test { fn doctype_lowercase() { let doc = "!d>"; let mut reader = crate::Reader::from_str(doc); + $(let mut $buf = $init;)? - match reader.read_until_close($buf) { + match reader.read_until_close($(&mut $buf)?) { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -2560,9 +2601,10 @@ mod test { #[test] fn start_text() { let mut reader = Reader::from_str("bom"); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::StartText(BytesText::from_escaped_str("bom").into()) ); } @@ -2570,9 +2612,10 @@ mod test { #[test] fn declaration() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? 
assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Decl(BytesDecl::from_start(BytesStart::borrowed(b"xml ", 3))) ); } @@ -2580,9 +2623,10 @@ mod test { #[test] fn doctype() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::DocType(BytesText::from_escaped_str("x")) ); } @@ -2590,9 +2634,10 @@ mod test { #[test] fn processing_instruction() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::PI(BytesText::from_escaped_str("xml-stylesheet")) ); } @@ -2600,9 +2645,10 @@ mod test { #[test] fn start() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Start(BytesStart::borrowed_name(b"tag")) ); } @@ -2613,9 +2659,10 @@ mod test { // Because we expect invalid XML, do not check that // the end name paired with the start name reader.check_end_names(false); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::End(BytesEnd::borrowed(b"tag")) ); } @@ -2623,9 +2670,10 @@ mod test { #[test] fn empty() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Empty(BytesStart::borrowed_name(b"tag")) ); } @@ -2634,24 +2682,26 @@ mod test { #[test] fn text() { let mut reader = Reader::from_str("text"); + $(let mut $buf = $init;)? 
assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Empty(BytesStart::borrowed_name(b"tag")) ); assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Text(BytesText::from_escaped_str("text")) ); } #[test] fn cdata() { - let mut reader = Reader::from_str(""); + let mut reader =Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::CData(BytesCData::from_str("")) ); } @@ -2659,9 +2709,10 @@ mod test { #[test] fn comment() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Comment(BytesText::from_escaped_str("")) ); } @@ -2669,9 +2720,10 @@ mod test { #[test] fn eof() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($buf).unwrap(), + reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof ); } @@ -2692,30 +2744,32 @@ mod test { #[test] fn bom_detected() { let mut reader = Reader::from_bytes(b"\xFF\xFE"); + $(let mut $buf = $init;)? assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), UTF_16LE); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), WINDOWS_1251); - assert_eq!(reader.read_event_impl($buf).unwrap(), Event::Eof); + assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); } /// Checks that encoding is changed by XML declaration, but only once #[test] fn xml_declaration() { let mut reader = Reader::from_bytes(b""); + $(let mut $buf = $init;)? 
assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), UTF_16LE); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), UTF_16LE); - assert_eq!(reader.read_event_impl($buf).unwrap(), Event::Eof); + assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); } } @@ -2724,12 +2778,13 @@ mod test { #[test] fn str_always_has_utf8() { let mut reader = Reader::from_str(""); + $(let mut $buf = $init;)? assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($buf).unwrap(); + reader.read_event_impl($(&mut $buf)?).unwrap(); assert_eq!(reader.decoder().encoding(), UTF_8); - assert_eq!(reader.read_event_impl($buf).unwrap(), Event::Eof); + assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); } } }; @@ -2737,11 +2792,11 @@ mod test { /// Tests for reader that generates events that borrow from the provided buffer mod buffered { - check!(&mut Vec::new()); + check!(let mut buf = Vec::new();); } /// Tests for reader that generates events that borrow from the input mod borrowed { - check!(()); + check!(let mut buf = ();); } } From 01ff58dd3492ebc7ae46224ba708802785009752 Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 10:37:50 +0200 Subject: [PATCH 2/7] Split reader into IoReader and SliceReader This also changes the test cases in the `reader::test::check` macro to allow for reader-specific tests. 
--- benches/macrobenches.rs | 3 +- src/de/mod.rs | 6 +- src/lib.rs | 2 +- src/reader.rs | 1206 +++++------------------------------- src/reader/io_reader.rs | 662 ++++++++++++++++++++ src/reader/slice_reader.rs | 523 ++++++++++++++++ 6 files changed, 1345 insertions(+), 1057 deletions(-) create mode 100644 src/reader/io_reader.rs create mode 100644 src/reader/slice_reader.rs diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 3358f3a4..4cb02ffe 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -20,8 +20,9 @@ static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml"); // TODO: use fully normalized attribute values fn parse_document(doc: &[u8]) -> XmlResult<()> { let mut r = Reader::from_reader(doc); + let mut buf = Vec::new(); loop { - match r.read_event()? { + match r.read_event_into(&mut buf)? { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { criterion::black_box(attr?.decode_and_unescape_value(&r)?); diff --git a/src/de/mod.rs b/src/de/mod.rs index e564e041..5b0b44d7 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -695,7 +695,7 @@ impl<'de> Deserializer<'de, SliceReader<'de>> { /// Create new deserializer that will borrow data from the specified borrowing reader #[inline] - fn from_borrowing_reader(mut reader: Reader<&'de [u8]>) -> Self { + fn from_borrowing_reader(mut reader: Reader>) -> Self { reader .expand_empty_elements(true) .check_end_names(true) @@ -930,7 +930,7 @@ pub trait XmlRead<'i> { /// You cannot create it, it is created automatically when you call /// [`Deserializer::from_reader`] pub struct IoReader { - reader: Reader, + reader: Reader>, buf: Vec, } @@ -975,7 +975,7 @@ impl<'i, R: BufRead> XmlRead<'i> for IoReader { /// You cannot create it, it is created automatically when you call /// [`Deserializer::from_str`] or [`Deserializer::from_slice`] pub struct SliceReader<'de> { - reader: Reader<&'de [u8]>, + reader: Reader>, } impl<'de> XmlRead<'de> for SliceReader<'de> { diff 
--git a/src/lib.rs b/src/lib.rs index 70a6c31d..19723764 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -156,5 +156,5 @@ mod writer; #[cfg(feature = "serialize")] pub use crate::errors::serialize::DeError; pub use crate::errors::{Error, Result}; -pub use crate::reader::{Decoder, Reader}; +pub use crate::reader::{Decoder, IoReader, Reader, SliceReader}; pub use crate::writer::{ElementWriter, Writer}; diff --git a/src/reader.rs b/src/reader.rs index 79fb4dff..c80ae082 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1,8 +1,8 @@ //! A module to handle `Reader` use std::borrow::Cow; -use std::io::{self, BufRead, BufReader}; -use std::{fs::File, path::Path, str::from_utf8}; +use std::ops::{Deref, DerefMut}; +use std::str::from_utf8; #[cfg(feature = "encoding")] use encoding_rs::{Encoding, UTF_16BE, UTF_16LE, UTF_8}; @@ -13,6 +13,12 @@ use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; use memchr; +mod io_reader; +mod slice_reader; + +pub use self::io_reader::IoReader; +pub use self::slice_reader::SliceReader; + /// Possible reader states. The state transition diagram (`true` and `false` shows /// value of [`Reader::expand_empty_elements()`] option): /// @@ -103,6 +109,15 @@ impl EncodingRef { } } +/// A trait for the underlying abstraction handling the actual reading part for the [`Reader`]. +pub trait InnerReader: Deref + DerefMut { + /// The real type of the inner reader. + type Reader; + + /// Consumes this abstraction returning the underlying reader. + fn into_inner(self) -> Self::Reader; +} + //////////////////////////////////////////////////////////////////////////////////////////////////// /// A low level encoding-agnostic XML event reader. @@ -200,7 +215,7 @@ pub struct Reader { /// Builder methods impl Reader { /// Creates a `Reader` that reads from a given reader.
- pub fn from_reader(reader: R) -> Self { + fn from_reader_internal(reader: R) -> Self { Self { reader, opened_buffer: Vec::new(), @@ -323,7 +338,7 @@ impl Reader { } /// Getters -impl Reader { +impl> Reader { /// Consumes `Reader` returning the underlying reader /// /// Can be used to compute line and column of a parsing error position @@ -333,7 +348,7 @@ impl Reader { /// ``` /// # use pretty_assertions::assert_eq; /// use std::{str, io::Cursor}; - /// use quick_xml::Reader; + /// use quick_xml::{IoReader, Reader}; /// use quick_xml::events::Event; /// /// let xml = r#" @@ -343,7 +358,7 @@ impl Reader { /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes())); /// let mut buf = Vec::new(); /// - /// fn into_line_and_column(reader: Reader>) -> (usize, usize) { + /// fn into_line_and_column(reader: Reader>>) -> (usize, usize) { /// let end_pos = reader.buffer_position(); /// let mut cursor = reader.into_inner(); /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned()) @@ -378,7 +393,7 @@ impl Reader { /// } /// ``` pub fn into_inner(self) -> R { - self.reader + self.reader.into_inner() } /// Gets a reference to the underlying reader. @@ -390,7 +405,10 @@ impl Reader { pub fn get_mut(&mut self) -> &mut R { &mut self.reader } +} +/// Getters that are not specific to any inner reader implementation +impl Reader { /// Gets the current byte position in the input data. /// /// Useful when debugging errors. @@ -461,424 +479,8 @@ impl Reader { } } -/// Read methods -impl Reader { - /// Reads the next `Event`. - /// - /// This is the main entry point for reading XML `Event`s. - /// - /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow` - /// internally). - /// - /// Having the possibility to control the internal buffers gives you some additional benefits - /// such as: - /// - /// - Reduce the number of allocations by reusing the same buffer. 
For constrained systems, - /// you can call `buf.clear()` once you are done with processing the event (typically at the - /// end of your loop). - /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`). - /// - /// # Examples - /// - /// ``` - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let xml = r#" - /// Test - /// Test 2 - /// "#; - /// let mut reader = Reader::from_str(xml); - /// reader.trim_text(true); - /// let mut count = 0; - /// let mut buf = Vec::new(); - /// let mut txt = Vec::new(); - /// loop { - /// match reader.read_event_into(&mut buf) { - /// Ok(Event::Start(ref e)) => count += 1, - /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), - /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), - /// Ok(Event::Eof) => break, - /// _ => (), - /// } - /// buf.clear(); - /// } - /// println!("Found {} start events", count); - /// println!("Text events: {:?}", txt); - /// ``` - #[inline] - pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec) -> Result> { - self.read_event_impl(buf) - } - - /// Reads the next event and resolves its namespace (if applicable). 
- /// - /// # Examples - /// - /// ``` - /// use std::str::from_utf8; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// use quick_xml::name::ResolveResult::*; - /// - /// let xml = r#" - /// Test - /// Test 2 - /// "#; - /// let mut reader = Reader::from_str(xml); - /// reader.trim_text(true); - /// let mut count = 0; - /// let mut buf = Vec::new(); - /// let mut ns_buf = Vec::new(); - /// let mut txt = Vec::new(); - /// loop { - /// match reader.read_namespaced_event(&mut buf, &mut ns_buf) { - /// Ok((Bound(ns), Event::Start(e))) => { - /// count += 1; - /// match (ns.as_ref(), e.local_name().as_ref()) { - /// (b"www.xxxx", b"tag1") => (), - /// (b"www.yyyy", b"tag2") => (), - /// (ns, n) => panic!("Namespace and local name mismatch"), - /// } - /// println!("Resolved namespace: {:?}", ns); - /// } - /// Ok((Unbound, Event::Start(_))) => { - /// panic!("Element not in any namespace") - /// }, - /// Ok((Unknown(p), Event::Start(_))) => { - /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) - /// } - /// Ok((_, Event::Text(e))) => { - /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) - /// }, - /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), - /// Ok((_, Event::Eof)) => break, - /// _ => (), - /// } - /// buf.clear(); - /// } - /// println!("Found {} start events", count); - /// println!("Text events: {:?}", txt); - /// ``` - pub fn read_namespaced_event<'b, 'ns>( - &mut self, - buf: &'b mut Vec, - namespace_buffer: &'ns mut Vec, - ) -> Result<(ResolveResult<'ns>, Event<'b>)> { - if self.pending_pop { - self.ns_resolver.pop(namespace_buffer); - } - self.pending_pop = false; - match self.read_event_into(buf) { - Ok(Event::Eof) => Ok((ResolveResult::Unbound, Event::Eof)), - Ok(Event::Start(e)) => { - self.ns_resolver.push(&e, namespace_buffer); - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::Start(e), - )) - } - Ok(Event::Empty(e)) => { - // For empty elements 
we need to 'artificially' keep the namespace scope on the - // stack until the next `next()` call occurs. - // Otherwise the caller has no chance to use `resolve` in the context of the - // namespace declarations that are 'in scope' for the empty element alone. - // Ex: - self.ns_resolver.push(&e, namespace_buffer); - // notify next `read_namespaced_event()` invocation that it needs to pop this - // namespace scope - self.pending_pop = true; - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::Empty(e), - )) - } - Ok(Event::End(e)) => { - // notify next `read_namespaced_event()` invocation that it needs to pop this - // namespace scope - self.pending_pop = true; - Ok(( - self.ns_resolver.find(e.name(), namespace_buffer), - Event::End(e), - )) - } - Ok(e) => Ok((ResolveResult::Unbound, e)), - Err(e) => Err(e), - } - } - - /// Reads until end element is found using provided buffer as intermediate - /// storage for events content. This function is supposed to be called after - /// you already read a [`Start`] event. - /// - /// Manages nested cases where parent and child elements have the same name. - /// - /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] - /// will be returned. In particularly, that error will be returned if you call - /// this method without consuming the corresponding [`Start`] event first. - /// - /// If your reader created from a string slice or byte array slice, it is - /// better to use [`read_to_end()`] method, because it will not copy bytes - /// into intermediate buffer. - /// - /// The provided `buf` buffer will be filled only by one event content at time. - /// Before reading of each event the buffer will be cleared. If you know an - /// appropriate size of each event, you can preallocate the buffer to reduce - /// number of reallocations. - /// - /// The `end` parameter should contain name of the end element _in the reader - /// encoding_. 
It is good practice to always get that parameter using - /// [`BytesStart::to_end()`] method. - /// - /// The correctness of the skipped events does not checked, if you disabled - /// the [`check_end_names`] option. - /// - /// # Namespaces - /// - /// While the [`Reader`] does not support namespace resolution, namespaces - /// does not change the algorithm for comparing names. Although the names - /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the - /// same namespace, are semantically equivalent, `` cannot close - /// ``, because according to [the specification] - /// - /// > The end of every element that begins with a **start-tag** MUST be marked - /// > by an **end-tag** containing a name that echoes the element's type as - /// > given in the **start-tag** - /// - /// # Examples - /// - /// This example shows, how you can skip XML content after you read the - /// start event. - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::events::{BytesStart, Event}; - /// use quick_xml::Reader; - /// - /// let mut reader = Reader::from_str(r#" - /// - /// - /// - /// - /// - /// - /// - /// - /// "#); - /// reader.trim_text(true); - /// let mut buf = Vec::new(); - /// - /// let start = BytesStart::borrowed_name(b"outer"); - /// let end = start.to_end().into_owned(); - /// - /// // First, we read a start event... - /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start)); - /// - /// //...then, we could skip all events to the corresponding end event. - /// // This call will correctly handle nested elements. - /// // Note, however, that this method does not handle namespaces. 
- /// reader.read_to_end_into(end.name(), &mut buf).unwrap(); - /// - /// // At the end we should get an Eof event, because we ate the whole XML - /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); - /// ``` - /// - /// [`Start`]: Event::Start - /// [`End`]: Event::End - /// [`read_to_end()`]: Self::read_to_end - /// [`check_end_names`]: Self::check_end_names - /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag - pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result<()> { - let mut depth = 0; - loop { - buf.clear(); - match self.read_event_into(buf) { - Err(e) => return Err(e), - - Ok(Event::Start(e)) if e.name() == end => depth += 1, - Ok(Event::End(e)) if e.name() == end => { - if depth == 0 { - return Ok(()); - } - depth -= 1; - } - Ok(Event::Eof) => { - let name = self.decoder().decode(end.as_ref()); - return Err(Error::UnexpectedEof(format!("", name))); - } - _ => (), - } - } - } - - /// Reads optional text between start and end tags. - /// - /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a - /// `String`. If the next event is an [`End`] event, returns the empty string. In all other - /// cases, returns an error. - /// - /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 - /// if none is specified). 
- /// - /// # Examples - /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::Reader; - /// use quick_xml::events::Event; - /// - /// let mut xml = Reader::from_reader(b" - /// <b> - /// - /// " as &[u8]); - /// xml.trim_text(true); - /// - /// let expected = ["", ""]; - /// for &content in expected.iter() { - /// match xml.read_event_into(&mut Vec::new()) { - /// Ok(Event::Start(ref e)) => { - /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content); - /// }, - /// e => panic!("Expecting Start event, found {:?}", e), - /// } - /// } - /// ``` - /// - /// [`Text`]: Event::Text - /// [`End`]: Event::End - pub fn read_text_into(&mut self, end: QName, buf: &mut Vec) -> Result { - let s = match self.read_event_into(buf) { - Err(e) => return Err(e), - - Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), - Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), - Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), - _ => return Err(Error::TextNotFound), - }; - self.read_to_end_into(end, buf)?; - Ok(s) - } -} - -/// Private methods +/// Common parsing code for all reader implementations. impl Reader { - /// Read text into the given buffer, and return an event that borrows from - /// either that buffer or from the input itself, based on the type of the - /// reader. - fn read_event_impl<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - let event = match self.tag_state { - TagState::Init => self.read_until_open(buf, true), - TagState::Closed => self.read_until_open(buf, false), - TagState::Opened => self.read_until_close(buf), - TagState::Empty => self.close_expanded_empty(), - TagState::Exit => return Ok(Event::Eof), - }; - match event { - Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, - _ => {} - } - event - } - - /// Read until '<' is found and moves reader to an `Opened` state. 
- /// - /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise - fn read_until_open<'i, B>(&mut self, buf: B, first: bool) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Opened; - - if self.trim_text_start { - self.reader.skip_whitespace(&mut self.buf_position)?; - } - - // If we already at the `<` symbol, do not try to return an empty Text event - if self.reader.skip_one(b'<', &mut self.buf_position)? { - return self.read_event_impl(buf); - } - - match self - .reader - .read_bytes_until(b'<', buf, &mut self.buf_position) - { - Ok(Some(bytes)) => { - #[cfg(feature = "encoding")] - if first && self.encoding.can_be_refined() { - if let Some(encoding) = detect_encoding(bytes) { - self.encoding = EncodingRef::BomDetected(encoding); - } - } - - let content = if self.trim_text_end { - // Skip the ending '< - let len = bytes - .iter() - .rposition(|&b| !is_whitespace(b)) - .map_or_else(|| bytes.len(), |p| p + 1); - &bytes[..len] - } else { - bytes - }; - - Ok(if first { - Event::StartText(BytesText::from_escaped(content).into()) - } else { - Event::Text(BytesText::from_escaped(content)) - }) - } - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), - } - } - - /// Private function to read until `>` is found. This function expects that - /// it was called just after encounter a `<` symbol. 
- fn read_until_close<'i, B>(&mut self, buf: B) -> Result> - where - R: XmlSource<'i, B>, - { - self.tag_state = TagState::Closed; - - match self.reader.peek_one() { - // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_end(bytes), - Err(e) => Err(e), - }, - // ` match self - .reader - .read_bytes_until(b'>', buf, &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_question_mark(bytes), - Err(e) => Err(e), - }, - // `<...` - opening or self-closed tag - Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_start(bytes), - Err(e) => Err(e), - }, - Ok(None) => Ok(Event::Eof), - Err(e) => Err(e), - } - } - /// reads `BytesElement` starting with a `!`, /// return `Comment`, `CData` or `DocType` event fn read_bang<'b>(&mut self, bang_type: BangType, buf: &'b [u8]) -> Result> { @@ -1015,556 +617,49 @@ impl Reader { Ok(Event::Start(BytesStart::borrowed(buf, name_end))) } } -} - -impl Reader> { - /// Creates an XML reader from a file path. - pub fn from_file>(path: P) -> Result { - let file = File::open(path).map_err(Error::Io)?; - let reader = BufReader::new(file); - Ok(Self::from_reader(reader)) - } -} - -impl<'a> Reader<&'a [u8]> { - /// Creates an XML reader from a string slice. - pub fn from_str(s: &'a str) -> Self { - // Rust strings are guaranteed to be UTF-8, so lock the encoding - #[cfg(feature = "encoding")] - { - let mut reader = Self::from_reader(s.as_bytes()); - reader.encoding = EncodingRef::Explicit(UTF_8); - reader - } - - #[cfg(not(feature = "encoding"))] - Self::from_reader(s.as_bytes()) - } - /// Creates an XML reader from a slice of bytes. 
- pub fn from_bytes(s: &'a [u8]) -> Self { - Self::from_reader(s) - } - - /// Read an event that borrows from the input rather than a buffer. - #[inline] - pub fn read_event(&mut self) -> Result> { - self.read_event_impl(()) - } - - /// Reads until end element is found. This function is supposed to be called - /// after you already read a [`Start`] event. - /// - /// Manages nested cases where parent and child elements have the same name. - /// - /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] - /// will be returned. In particularly, that error will be returned if you call - /// this method without consuming the corresponding [`Start`] event first. - /// - /// The `end` parameter should contain name of the end element _in the reader - /// encoding_. It is good practice to always get that parameter using - /// [`BytesStart::to_end()`] method. - /// - /// The correctness of the skipped events does not checked, if you disabled - /// the [`check_end_names`] option. - /// - /// # Namespaces - /// - /// While the [`Reader`] does not support namespace resolution, namespaces - /// does not change the algorithm for comparing names. Although the names - /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the - /// same namespace, are semantically equivalent, `` cannot close - /// ``, because according to [the specification] - /// - /// > The end of every element that begins with a **start-tag** MUST be marked - /// > by an **end-tag** containing a name that echoes the element's type as - /// > given in the **start-tag** - /// - /// # Examples - /// - /// This example shows, how you can skip XML content after you read the - /// start event. 
- /// - /// ``` - /// # use pretty_assertions::assert_eq; - /// use quick_xml::events::{BytesStart, Event}; - /// use quick_xml::Reader; - /// - /// let mut reader = Reader::from_str(r#" - /// - /// - /// - /// - /// - /// - /// - /// - /// "#); - /// reader.trim_text(true); - /// - /// let start = BytesStart::borrowed_name(b"outer"); - /// let end = start.to_end().into_owned(); - /// - /// // First, we read a start event... - /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); - /// - /// //...then, we could skip all events to the corresponding end event. - /// // This call will correctly handle nested elements. - /// // Note, however, that this method does not handle namespaces. - /// reader.read_to_end(end.name()).unwrap(); - /// - /// // At the end we should get an Eof event, because we ate the whole XML - /// assert_eq!(reader.read_event().unwrap(), Event::Eof); - /// ``` - /// - /// [`Start`]: Event::Start - /// [`End`]: Event::End - /// [`check_end_names`]: Self::check_end_names - /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag - pub fn read_to_end(&mut self, end: QName) -> Result<()> { - let mut depth = 0; - loop { - match self.read_event() { - Err(e) => return Err(e), - - Ok(Event::Start(e)) if e.name() == end => depth += 1, - Ok(Event::End(e)) if e.name() == end => { - if depth == 0 { - return Ok(()); - } - depth -= 1; - } - Ok(Event::Eof) => { - let name = self.decoder().decode(end.as_ref()); - return Err(Error::UnexpectedEof(format!("", name))); - } - _ => (), - } - } - } -} - -/// Represents an input for a reader that can return borrowed data. -/// -/// There are two implementors of this trait: generic one that read data from -/// `Self`, copies some part of it into a provided buffer of type `B` and then -/// returns data that borrow from that buffer. -/// -/// The other implementor is for `&[u8]` and instead of copying data returns -/// borrowed data from `Self` instead. 
This implementation allows zero-copy -/// deserialization. -/// -/// # Parameters -/// - `'r`: lifetime of a buffer from which events will borrow -/// - `B`: a type of a buffer that can be used to store data read from `Self` and -/// from which events can borrow -trait XmlSource<'r, B> { - /// Read input until `byte` is found or end of input is reached. - /// - /// Returns a slice of data read up to `byte`, which does not include into result. - /// If input (`Self`) is exhausted, returns `None`. - /// - /// # Example - /// - /// ```ignore - /// let mut position = 0; - /// let mut input = b"abc*def".as_ref(); - /// // ^= 4 - /// - /// assert_eq!( - /// input.read_bytes_until(b'*', (), &mut position).unwrap(), - /// Some(b"abc".as_ref()) - /// ); - /// assert_eq!(position, 4); // position after the symbol matched - /// ``` - /// - /// # Parameters - /// - `byte`: Byte for search - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bytes_until( - &mut self, - byte: u8, - buf: B, - position: &mut usize, - ) -> Result>; - - /// Read input until comment, CDATA or processing instruction is finished. - /// - /// This method expect that `<` already was read. - /// - /// Returns a slice of data read up to end of comment, CDATA or processing - /// instruction (`>`), which does not include into result. - /// - /// If input (`Self`) is exhausted and nothing was read, returns `None`. - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [events]: crate::events::Event - fn read_bang_element( - &mut self, - buf: B, - position: &mut usize, - ) -> Result>; - - /// Read input until XML element is closed by approaching a `>` symbol. 
- /// Returns `Some(buffer)` that contains a data between `<` and `>` or - /// `None` if end-of-input was reached and nothing was read. - /// - /// Derived from `read_until`, but modified to handle XML attributes - /// using a minimal state machine. - /// - /// Attribute values are [defined] as follows: - /// ```plain - /// AttValue := '"' (([^<&"]) | Reference)* '"' - /// | "'" (([^<&']) | Reference)* "'" - /// ``` - /// (`Reference` is something like `"`, but we don't care about - /// escaped characters at this level) - /// - /// # Parameters - /// - `buf`: Buffer that could be filled from an input (`Self`) and - /// from which [events] could borrow their data - /// - `position`: Will be increased by amount of bytes consumed - /// - /// [defined]: https://www.w3.org/TR/xml11/#NT-AttValue - /// [events]: crate::events::Event - fn read_element(&mut self, buf: B, position: &mut usize) -> Result>; - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()>; - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result; - - fn peek_one(&mut self) -> Result>; -} - -/// Implementation of `XmlSource` for any `BufRead` reader using a user-given -/// `Vec` as buffer that will be borrowed by events. 
-impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec> for R { - #[inline] - fn read_bytes_until( - &mut self, - byte: u8, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result> { - let mut read = 0; - let mut done = false; - let start = buf.len(); - while !done { - let used = { - let available = match self.fill_buf() { - Ok(n) if n.is_empty() => break, - Ok(n) => n, - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); - } - }; - - match memchr::memchr(byte, available) { - Some(i) => { - buf.extend_from_slice(&available[..i]); - done = true; - i + 1 - } - None => { - buf.extend_from_slice(available); - available.len() - } - } - }; - self.consume(used); - read += used; - } - *position += read; - - if read == 0 { - Ok(None) - } else { - Ok(Some(&buf[start..])) - } - } - - fn read_bang_element( + fn resolve_namespaced_event_inner<'b, 'ns>( &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. 
- let start = buf.len(); - let mut read = 1; - buf.push(b'!'); - self.consume(1); - - let bang_type = BangType::new(self.peek_one()?)?; - - loop { - match self.fill_buf() { - // Note: Do not update position, so the error points to - // somewhere sane rather than at the EOF - Ok(n) if n.is_empty() => return Err(bang_type.to_err()), - Ok(available) => { - if let Some((consumed, used)) = bang_type.parse(available, read) { - buf.extend_from_slice(consumed); - - self.consume(used); - read += used; - - *position += read; - break; - } else { - buf.extend_from_slice(available); - - let used = available.len(); - self.consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); - } + event: Result>, + namespace_buffer: &'ns mut Vec, + ) -> Result<(ResolveResult<'ns>, Event<'b>)> { + match event { + Ok(Event::Eof) => Ok((ResolveResult::Unbound, Event::Eof)), + Ok(Event::Start(e)) => { + self.ns_resolver.push(&e, namespace_buffer); + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::Start(e), + )) } - } - - if read == 0 { - Ok(None) - } else { - Ok(Some((bang_type, &buf[start..]))) - } - } - - #[inline] - fn read_element( - &mut self, - buf: &'b mut Vec, - position: &mut usize, - ) -> Result> { - let mut state = ReadElementState::Elem; - let mut read = 0; - - let start = buf.len(); - loop { - match self.fill_buf() { - Ok(n) if n.is_empty() => break, - Ok(available) => { - if let Some((consumed, used)) = state.change(available) { - buf.extend_from_slice(consumed); - - self.consume(used); - read += used; - - *position += read; - break; - } else { - buf.extend_from_slice(available); - - let used = available.len(); - self.consume(used); - read += used; - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => { - *position += read; - return Err(Error::Io(e)); - } - }; - } - - if read == 0 { - Ok(None) - } else { - 
Ok(Some(&buf[start..])) - } - } - - /// Consume and discard all the whitespace until the next non-whitespace - /// character or EOF. - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - loop { - break match self.fill_buf() { - Ok(n) => { - let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); - if count > 0 { - self.consume(count); - *position += count; - continue; - } else { - Ok(()) - } - } - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e)), - }; - } - } - - /// Consume and discard one character if it matches the given byte. Return - /// true if it matched. - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - match self.peek_one()? { - Some(b) if b == byte => { - *position += 1; - self.consume(1); - Ok(true) + Ok(Event::Empty(e)) => { + // For empty elements we need to 'artificially' keep the namespace scope on the + // stack until the next `next()` call occurs. + // Otherwise the caller has no chance to use `resolve` in the context of the + // namespace declarations that are 'in scope' for the empty element alone. + // Ex: + self.ns_resolver.push(&e, namespace_buffer); + // notify next `read_namespaced_event()` invocation that it needs to pop this + // namespace scope + self.pending_pop = true; + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::Empty(e), + )) } - _ => Ok(false), - } - } - - /// Return one character without consuming it, so that future `read_*` calls - /// will still include it. On EOF, return None. - fn peek_one(&mut self) -> Result> { - loop { - break match self.fill_buf() { - Ok(n) if n.is_empty() => Ok(None), - Ok(n) => Ok(Some(n[0])), - Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, - Err(e) => Err(Error::Io(e)), - }; - } - } -} - -/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer -/// that will be borrowed by events. 
This implementation provides a zero-copy deserialization -impl<'a> XmlSource<'a, ()> for &'a [u8] { - fn read_bytes_until( - &mut self, - byte: u8, - _buf: (), - position: &mut usize, - ) -> Result> { - if self.is_empty() { - return Ok(None); - } - - Ok(Some(if let Some(i) = memchr::memchr(byte, self) { - *position += i + 1; - let bytes = &self[..i]; - *self = &self[i + 1..]; - bytes - } else { - *position += self.len(); - let bytes = &self[..]; - *self = &[]; - bytes - })) - } - - fn read_bang_element( - &mut self, - _buf: (), - position: &mut usize, - ) -> Result> { - // Peeked one bang ('!') before being called, so it's guaranteed to - // start with it. - debug_assert_eq!(self[0], b'!'); - - let bang_type = BangType::new(self[1..].first().copied())?; - - if let Some((bytes, i)) = bang_type.parse(self, 0) { - *position += i; - *self = &self[i..]; - return Ok(Some((bang_type, bytes))); - } - - // Note: Do not update position, so the error points to - // somewhere sane rather than at the EOF - Err(bang_type.to_err()) - } - - fn read_element(&mut self, _buf: (), position: &mut usize) -> Result> { - if self.is_empty() { - return Ok(None); - } - - let mut state = ReadElementState::Elem; - - if let Some((bytes, i)) = state.change(self) { - *position += i; - *self = &self[i..]; - return Ok(Some(bytes)); - } - - // Note: Do not update position, so the error points to a sane place - // rather than at the EOF. 
- Err(Error::UnexpectedEof("Element".to_string())) - - // FIXME: Figure out why the other one works without UnexpectedEof - } - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - let whitespaces = self - .iter() - .position(|b| !is_whitespace(*b)) - .unwrap_or(self.len()); - *position += whitespaces; - *self = &self[whitespaces..]; - Ok(()) - } - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - if self.first() == Some(&byte) { - *self = &self[1..]; - *position += 1; - Ok(true) - } else { - Ok(false) + Ok(Event::End(e)) => { + // notify next `read_namespaced_event()` invocation that it needs to pop this + // namespace scope + self.pending_pop = true; + Ok(( + self.ns_resolver.find(e.name(), namespace_buffer), + Event::End(e), + )) + } + Ok(e) => Ok((ResolveResult::Unbound, e)), + Err(e) => Err(e), } } - - fn peek_one(&mut self) -> Result> { - Ok(self.first().copied()) - } -} - -/// This is just a helper implementation for using `&mut ()` as buffer while reading from an -/// `&[u8]` to unify how the `check!` macro below works. 
-impl<'a, 'b> XmlSource<'a, &'b mut ()> for &'a [u8] { - fn read_bytes_until( - &mut self, - byte: u8, - _buf: &mut (), - position: &mut usize, - ) -> Result> { - self.read_bytes_until(byte, (), position) - } - - fn read_bang_element( - &mut self, - _buf: &mut (), - position: &mut usize, - ) -> Result> { - self.read_bang_element((), position) - } - - fn read_element(&mut self, _buf: &mut (), position: &mut usize) -> Result> { - self.read_element((), position) - } - - fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { - >::skip_whitespace(self, position) - } - - fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { - >::skip_one(self, byte, position) - } - - fn peek_one(&mut self) -> Result> { - >::peek_one(self) - } } /// Possible elements started with ` { mod read_bytes_until { - use crate::reader::XmlSource; + use super::input_from_bytes; // Use Bytes for printing bytes as strings for ASCII range use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1865,7 +960,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"".as_ref(); + let mut input = input_from_bytes(b"".as_ref()); // ^= 0 assert_eq!( @@ -1884,7 +979,7 @@ mod test { fn non_existent() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"abcdef".as_ref(); + let mut input = input_from_bytes(b"abcdef".as_ref()); // ^= 6 assert_eq!( @@ -1904,7 +999,7 @@ mod test { fn at_the_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"*abcdef".as_ref(); + let mut input = input_from_bytes(b"*abcdef".as_ref()); // ^= 1 assert_eq!( @@ -1924,7 +1019,7 @@ mod test { fn inside() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"abc*def".as_ref(); + let mut input = input_from_bytes(b"abc*def".as_ref()); // ^= 4 assert_eq!( @@ -1944,7 +1039,7 @@ mod test { fn in_the_end() { $(let mut $buf = $init;)? 
let mut position = 0; - let mut input = b"abcdef*".as_ref(); + let mut input = input_from_bytes(b"abcdef*".as_ref()); // ^= 7 assert_eq!( @@ -1959,10 +1054,12 @@ mod test { } mod read_bang_element { + use super::input_from_bytes; /// Checks that reading CDATA content works correctly mod cdata { + use super::input_from_bytes; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -1973,7 +1070,7 @@ mod test { fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"![]]>other content".as_ref(); + let mut input = input_from_bytes(b"![]]>other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -1993,7 +1090,7 @@ mod test { fn not_closed() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"![CDATA[other content".as_ref(); + let mut input = input_from_bytes(b"![CDATA[other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2012,7 +1109,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"![CDATA[]]>other content".as_ref(); + let mut input = input_from_bytes(b"![CDATA[]]>other content".as_ref()); // ^= 11 assert_eq!( @@ -2032,7 +1129,7 @@ mod test { fn with_content() { $(let mut $buf = $init;)? 
let mut position = 0; - let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref(); + let mut input = input_from_bytes(b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref()); // ^= 28 assert_eq!( @@ -2063,8 +1160,9 @@ mod test { /// /// [specification]: https://www.w3.org/TR/xml11/#dt-comment mod comment { + use super::input_from_bytes; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -2073,7 +1171,7 @@ mod test { fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!- -->other content".as_ref(); + let mut input = input_from_bytes(b"!- -->other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2091,7 +1189,7 @@ mod test { fn not_properly_end() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!->other content".as_ref(); + let mut input = input_from_bytes(b"!->other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2109,7 +1207,7 @@ mod test { fn not_closed1() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!--other content".as_ref(); + let mut input = input_from_bytes(b"!--other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2127,7 +1225,7 @@ mod test { fn not_closed2() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!-->other content".as_ref(); + let mut input = input_from_bytes(b"!-->other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2145,7 +1243,7 @@ mod test { fn not_closed3() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!--->other content".as_ref(); + let mut input = input_from_bytes(b"!--->other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? 
&mut position) { @@ -2163,7 +1261,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!---->other content".as_ref(); + let mut input = input_from_bytes(b"!---->other content".as_ref()); // ^= 6 assert_eq!( @@ -2180,7 +1278,7 @@ mod test { fn with_content() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!--->comment<--->other content".as_ref(); + let mut input = input_from_bytes(b"!--->comment<--->other content".as_ref()); // ^= 17 assert_eq!( @@ -2196,9 +1294,11 @@ mod test { /// Checks that reading DOCTYPE definition works correctly mod doctype { + use super::input_from_bytes; mod uppercase { + use super::input_from_bytes; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -2206,7 +1306,7 @@ mod test { fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!D other content".as_ref(); + let mut input = input_from_bytes(b"!D other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2224,7 +1324,7 @@ mod test { fn without_space() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!DOCTYPEother content".as_ref(); + let mut input = input_from_bytes(b"!DOCTYPEother content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2242,7 +1342,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!DOCTYPE>other content".as_ref(); + let mut input = input_from_bytes(b"!DOCTYPE>other content".as_ref()); // ^= 9 assert_eq!( @@ -2259,7 +1359,7 @@ mod test { fn not_closed() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!DOCTYPE other content".as_ref(); + let mut input = input_from_bytes(b"!DOCTYPE other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? 
&mut position) { @@ -2275,8 +1375,9 @@ mod test { } mod lowercase { + use super::input_from_bytes; use crate::errors::Error; - use crate::reader::{BangType, XmlSource}; + use crate::reader::BangType; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -2284,7 +1385,7 @@ mod test { fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!d other content".as_ref(); + let mut input = input_from_bytes(b"!d other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2302,7 +1403,7 @@ mod test { fn without_space() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!doctypeother content".as_ref(); + let mut input = input_from_bytes(b"!doctypeother content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2320,7 +1421,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!doctype>other content".as_ref(); + let mut input = input_from_bytes(b"!doctype>other content".as_ref()); // ^= 9 assert_eq!( @@ -2337,7 +1438,7 @@ mod test { fn not_closed() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"!doctype other content".as_ref(); + let mut input = input_from_bytes(b"!doctype other content".as_ref()); // ^= 0 match input.read_bang_element($(&mut $buf, )? &mut position) { @@ -2355,7 +1456,7 @@ mod test { } mod read_element { - use crate::reader::XmlSource; + use super::input_from_bytes; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -2364,7 +1465,7 @@ mod test { fn empty() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"".as_ref(); + let mut input = input_from_bytes(b"".as_ref()); // ^= 0 assert_eq!(input.read_element($(&mut $buf, )? 
&mut position).unwrap().map(Bytes), None); @@ -2372,7 +1473,7 @@ mod test { } mod open { - use crate::reader::XmlSource; + use super::input_from_bytes; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -2380,7 +1481,7 @@ mod test { fn empty_tag() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b">".as_ref(); + let mut input = input_from_bytes(b">".as_ref()); // ^= 1 assert_eq!( @@ -2394,7 +1495,7 @@ mod test { fn normal() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"tag>".as_ref(); + let mut input = input_from_bytes(b"tag>".as_ref()); // ^= 4 assert_eq!( @@ -2408,7 +1509,7 @@ mod test { fn empty_ns_empty_tag() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":>".as_ref(); + let mut input = input_from_bytes(b":>".as_ref()); // ^= 2 assert_eq!( @@ -2422,7 +1523,7 @@ mod test { fn empty_ns() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":tag>".as_ref(); + let mut input = input_from_bytes(b":tag>".as_ref()); // ^= 5 assert_eq!( @@ -2436,7 +1537,7 @@ mod test { fn with_attributes() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref(); + let mut input = input_from_bytes(br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref()); // ^= 38 assert_eq!( @@ -2448,7 +1549,7 @@ mod test { } mod self_closed { - use crate::reader::XmlSource; + use super::input_from_bytes; use crate::utils::Bytes; use pretty_assertions::assert_eq; @@ -2456,7 +1557,7 @@ mod test { fn empty_tag() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"/>".as_ref(); + let mut input = input_from_bytes(b"/>".as_ref()); // ^= 2 assert_eq!( @@ -2470,7 +1571,7 @@ mod test { fn normal() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b"tag/>".as_ref(); + let mut input = input_from_bytes(b"tag/>".as_ref()); // ^= 5 assert_eq!( @@ -2484,7 +1585,7 @@ mod test { fn empty_ns_empty_tag() { $(let mut $buf = $init;)? 
let mut position = 0; - let mut input = b":/>".as_ref(); + let mut input = input_from_bytes(b":/>".as_ref()); // ^= 3 assert_eq!( @@ -2498,7 +1599,7 @@ mod test { fn empty_ns() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = b":tag/>".as_ref(); + let mut input = input_from_bytes(b":tag/>".as_ref()); // ^= 6 assert_eq!( @@ -2512,7 +1613,7 @@ mod test { fn with_attributes() { $(let mut $buf = $init;)? let mut position = 0; - let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref(); + let mut input = input_from_bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref()); // ^= 41 assert_eq!( @@ -2525,12 +1626,13 @@ mod test { } mod issue_344 { + use super::reader_from_str; use crate::errors::Error; #[test] fn cdata() { let doc = "![]]>"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? match reader.read_until_close($(&mut $buf)?) { @@ -2546,7 +1648,7 @@ mod test { #[test] fn comment() { let doc = "!- -->"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? match reader.read_until_close($(&mut $buf)?) { @@ -2562,7 +1664,7 @@ mod test { #[test] fn doctype_uppercase() { let doc = "!D>"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? match reader.read_until_close($(&mut $buf)?) { @@ -2578,7 +1680,7 @@ mod test { #[test] fn doctype_lowercase() { let doc = "!d>"; - let mut reader = crate::Reader::from_str(doc); + let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? match reader.read_until_close($(&mut $buf)?) 
{ @@ -2594,13 +1696,13 @@ mod test { /// Ensures, that no empty `Text` events are generated mod read_event_impl { + use super::reader_from_str; use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; - use crate::reader::Reader; use pretty_assertions::assert_eq; #[test] fn start_text() { - let mut reader = Reader::from_str("bom"); + let mut reader = reader_from_str("bom"); $(let mut $buf = $init;)? assert_eq!( @@ -2611,7 +1713,7 @@ mod test { #[test] fn declaration() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2622,7 +1724,7 @@ mod test { #[test] fn doctype() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2633,7 +1735,7 @@ mod test { #[test] fn processing_instruction() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2644,7 +1746,7 @@ mod test { #[test] fn start() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2655,7 +1757,7 @@ mod test { #[test] fn end() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); // Because we expect invalid XML, do not check that // the end name paired with the start name reader.check_end_names(false); @@ -2669,7 +1771,7 @@ mod test { #[test] fn empty() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2681,7 +1783,7 @@ mod test { /// Text event cannot be generated without preceding event of another type #[test] fn text() { - let mut reader = Reader::from_str("text"); + let mut reader = reader_from_str("text"); $(let mut $buf = $init;)? 
assert_eq!( @@ -2697,7 +1799,7 @@ mod test { #[test] fn cdata() { - let mut reader =Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2708,7 +1810,7 @@ mod test { #[test] fn comment() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2719,7 +1821,7 @@ mod test { #[test] fn eof() { - let mut reader = Reader::from_str(""); + let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( @@ -2731,19 +1833,19 @@ mod test { #[cfg(feature = "encoding")] mod encoding { + use super::reader_from_bytes; use crate::events::Event; - use crate::reader::Reader; use encoding_rs::{UTF_8, UTF_16LE, WINDOWS_1251}; - use pretty_assertions::assert_eq; mod bytes { + use super::reader_from_bytes; use super::*; use pretty_assertions::assert_eq; /// Checks that encoding is detected by BOM and changed after XML declaration #[test] fn bom_detected() { - let mut reader = Reader::from_bytes(b"\xFF\xFE"); + let mut reader = reader_from_bytes(b"\xFF\xFE"); $(let mut $buf = $init;)? assert_eq!(reader.decoder().encoding(), UTF_8); @@ -2759,7 +1861,7 @@ mod test { /// Checks that encoding is changed by XML declaration, but only once #[test] fn xml_declaration() { - let mut reader = Reader::from_bytes(b""); + let mut reader = reader_from_bytes(b""); $(let mut $buf = $init;)? assert_eq!(reader.decoder().encoding(), UTF_8); @@ -2772,31 +1874,31 @@ mod test { assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); } } - - /// Checks that XML declaration cannot change the encoding from UTF-8 if - /// a `Reader` was created using `from_str` method - #[test] - fn str_always_has_utf8() { - let mut reader = Reader::from_str(""); - $(let mut $buf = $init;)? 
- - assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($(&mut $buf)?).unwrap(); - assert_eq!(reader.decoder().encoding(), UTF_8); - - assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); - } } }; } - /// Tests for reader that generates events that borrow from the provided buffer - mod buffered { - check!(let mut buf = Vec::new();); - } + pub(super) use check; - /// Tests for reader that generates events that borrow from the input - mod borrowed { - check!(let mut buf = ();); + #[cfg(feature = "encoding")] + mod encoding { + use crate::events::Event; + use crate::reader::UTF_8; + use pretty_assertions::assert_eq; + /// Checks that XML declaration cannot change the encoding from UTF-8 if + /// a `Reader` was created using `from_str` method. + /// This is outside the `check` macro as this is only relevant for the + /// `Reader::from_str` method. + #[test] + fn str_always_has_utf8() { + let mut reader = crate::Reader::from_str(""); + let mut buf = Vec::new(); + + assert_eq!(reader.decoder().encoding(), UTF_8); + reader.read_event_into(&mut buf).unwrap(); + assert_eq!(reader.decoder().encoding(), UTF_8); + + assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + } } } diff --git a/src/reader/io_reader.rs b/src/reader/io_reader.rs new file mode 100644 index 00000000..589f573d --- /dev/null +++ b/src/reader/io_reader.rs @@ -0,0 +1,662 @@ +//! This is an implementation of [`Reader`] for reading from a [`Read`] or [`BufRead`] as +//! underlying byte stream. 
+ +use std::fs::File; +use std::io::{self, BufRead, BufReader, Read}; +use std::ops::{Deref, DerefMut}; +use std::path::Path; + +use crate::events::{BytesText, Event}; +use crate::name::{QName, ResolveResult}; +use crate::{Error, Result}; + +#[cfg(feature = "encoding")] +use super::{detect_encoding, EncodingRef}; +use super::{is_whitespace, BangType, InnerReader, ReadElementState, Reader, TagState}; + +/// A struct for handling reading functions based on reading from a [`BufRead`]. +#[derive(Debug, Clone)] +pub struct IoReader(R); + +impl Deref for IoReader { + type Target = R; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for IoReader { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl InnerReader for IoReader { + type Reader = R; + + fn into_inner(self) -> Self::Reader { + self.0 + } +} + +/// Private reading functions. +impl IoReader { + #[inline] + fn read_bytes_until<'buf>( + &mut self, + byte: u8, + buf: &'buf mut Vec, + position: &mut usize, + ) -> Result> { + let mut read = 0; + let mut done = false; + let start = buf.len(); + while !done { + let used = { + let available = match self.fill_buf() { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + + match memchr::memchr(byte, available) { + Some(i) => { + buf.extend_from_slice(&available[..i]); + done = true; + i + 1 + } + None => { + buf.extend_from_slice(available); + available.len() + } + } + }; + self.consume(used); + read += used; + } + *position += read; + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + fn read_bang_element<'buf>( + &mut self, + buf: &'buf mut Vec, + position: &mut usize, + ) -> Result> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. 
+ let start = buf.len(); + let mut read = 1; + buf.push(b'!'); + self.consume(1); + + let bang_type = BangType::new(self.peek_one()?)?; + + loop { + match self.fill_buf() { + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + Ok(n) if n.is_empty() => return Err(bang_type.to_err()), + Ok(available) => { + if let Some((consumed, used)) = bang_type.parse(available, read) { + buf.extend_from_slice(consumed); + + self.consume(used); + read += used; + + *position += read; + break; + } else { + buf.extend_from_slice(available); + + let used = available.len(); + self.consume(used); + read += used; + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + } + } + + if read == 0 { + Ok(None) + } else { + Ok(Some((bang_type, &buf[start..]))) + } + } + + #[inline] + fn read_element<'buf>( + &mut self, + buf: &'buf mut Vec, + position: &mut usize, + ) -> Result> { + let mut state = ReadElementState::Elem; + let mut read = 0; + + let start = buf.len(); + loop { + match self.fill_buf() { + Ok(n) if n.is_empty() => break, + Ok(available) => { + if let Some((consumed, used)) = state.change(available) { + buf.extend_from_slice(consumed); + + self.consume(used); + read += used; + + *position += read; + break; + } else { + buf.extend_from_slice(available); + + let used = available.len(); + self.consume(used); + read += used; + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + } + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + /// Consume and discard all the whitespace until the next non-whitespace + /// character or EOF. 
+ fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + loop { + break match self.fill_buf() { + Ok(n) => { + let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); + if count > 0 { + self.consume(count); + *position += count; + continue; + } else { + Ok(()) + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } + + /// Consume and discard one character if it matches the given byte. Return + /// true if it matched. + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + match self.peek_one()? { + Some(b) if b == byte => { + *position += 1; + self.consume(1); + Ok(true) + } + _ => Ok(false), + } + } + + /// Return one character without consuming it, so that future `read_*` calls + /// will still include it. On EOF, return None. + fn peek_one(&mut self) -> Result> { + loop { + break match self.fill_buf() { + Ok(n) if n.is_empty() => Ok(None), + Ok(n) => Ok(Some(n[0])), + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } +} + +/// Private functions for a [`Reader`] based on an [`IoReader`]. +impl Reader> { + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. + fn read_event_impl<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + let event = match self.tag_state { + TagState::Init => self.read_until_open(buf, true), + TagState::Closed => self.read_until_open(buf, false), + TagState::Opened => self.read_until_close(buf), + TagState::Empty => self.close_expanded_empty(), + TagState::Exit => return Ok(Event::Eof), + }; + match event { + Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, + _ => {} + } + event + } + + /// Read until '<' is found and moves reader to an `Opened` state. 
+ /// + /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + fn read_until_open<'buf>( + &mut self, + buf: &'buf mut Vec, + first: bool, + ) -> Result> { + self.tag_state = TagState::Opened; + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position)?; + } + + // If we already at the `<` symbol, do not try to return an empty Text event + if self.reader.skip_one(b'<', &mut self.buf_position)? { + return self.read_event_impl(buf); + } + + match self + .reader + .read_bytes_until(b'<', buf, &mut self.buf_position) + { + Ok(Some(bytes)) => { + #[cfg(feature = "encoding")] + if first && self.encoding.can_be_refined() { + if let Some(encoding) = detect_encoding(bytes) { + self.encoding = EncodingRef::BomDetected(encoding); + } + } + + let content = if self.trim_text_end { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) + } + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } + + /// Private function to read until `>` is found. This function expects that + /// it was called just after encounter a `<` symbol. 
+ fn read_until_close<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + self.tag_state = TagState::Closed; + + match self.reader.peek_one() { + // ` match self.reader.read_bang_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + }, + // `<...` - opening or self-closed tag + Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_start(bytes), + Err(e) => Err(e), + }, + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } +} + +/// Builder for reading from a file. +impl Reader>> { + /// Creates an XML reader from a file path. + pub fn from_file>(path: P) -> Result { + let file = File::open(path).map_err(Error::Io)?; + let reader = BufReader::new(file); + Ok(Self::from_reader_internal(IoReader(reader))) + } +} + +/// Builder for reading from any [`BufRead`]. +impl Reader> { + /// Creates an XML reader from any type implementing [`BufRead`]. + pub fn from_reader(reader: R) -> Self { + Self::from_reader_internal(IoReader(reader)) + } +} + +/// Builder for reading from any [`Read`]. +impl Reader>> { + /// Creates an XML reader from any type implementing [`Read`]. + pub fn from_unbuffered_reader(reader: R) -> Self { + Self::from_reader_internal(IoReader(BufReader::new(reader))) + } +} + +/// Public reading methods for a [`Reader`] based on an [`IoReader`]. +impl Reader> { + /// Reads the next `Event`. + /// + /// This is the main entry point for reading XML `Event`s. 
+ /// + /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow` + /// internally). + /// + /// Having the possibility to control the internal buffers gives you some additional benefits + /// such as: + /// + /// - Reduce the number of allocations by reusing the same buffer. For constrained systems, + /// you can call `buf.clear()` once you are done with processing the event (typically at the + /// end of your loop). + /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`). + /// + /// # Examples + /// + /// ``` + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// // This explicitly uses `from_reader(xml.as_bytes())` to use a buffered reader instead of + /// // relying on the zero-copy optimizations for reading from byte slices. + /// let mut reader = Reader::from_reader(xml.as_bytes()); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_event_into(&mut buf) { + /// Ok(Event::Start(ref e)) => count += 1, + /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok(Event::Eof) => break, + /// _ => (), + /// } + /// buf.clear(); + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// ``` + #[inline] + pub fn read_event_into<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + self.read_event_impl(buf) + } + + /// Reads until end element is found using provided buffer as intermediate + /// storage for events content. This function is supposed to be called after + /// you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. 
+ /// + /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] + /// will be returned. In particularly, that error will be returned if you call + /// this method without consuming the corresponding [`Start`] event first. + /// + /// If your reader created from a string slice or byte array slice, it is + /// better to use [`read_to_end()`] method, because it will not copy bytes + /// into intermediate buffer. + /// + /// The provided `buf` buffer will be filled only by one event content at time. + /// Before reading of each event the buffer will be cleared. If you know an + /// appropriate size of each event, you can preallocate the buffer to reduce + /// number of reallocations. + /// + /// The `end` parameter should contain name of the end element _in the reader + /// encoding_. It is good practice to always get that parameter using + /// [`BytesStart::to_end()`] method. + /// + /// The correctness of the skipped events does not checked, if you disabled + /// the [`check_end_names`] option. + /// + /// # Namespaces + /// + /// While the [`Reader`] does not support namespace resolution, namespaces + /// does not change the algorithm for comparing names. Although the names + /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the + /// same namespace, are semantically equivalent, `` cannot close + /// ``, because according to [the specification] + /// + /// > The end of every element that begins with a **start-tag** MUST be marked + /// > by an **end-tag** containing a name that echoes the element's type as + /// > given in the **start-tag** + /// + /// # Examples + /// + /// This example shows, how you can skip XML content after you read the + /// start event. 
+ /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::Reader; + /// + /// let mut reader = Reader::from_reader(r#" + /// + /// + /// + /// + /// + /// + /// + /// + /// "#.as_bytes()); + /// reader.trim_text(true); + /// let mut buf = Vec::new(); + /// + /// let start = BytesStart::borrowed_name(b"outer"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start)); + /// + /// //...then, we could skip all events to the corresponding end event. + /// // This call will correctly handle nested elements. + /// // Note, however, that this method does not handle namespaces. + /// reader.read_to_end_into(end.name(), &mut buf).unwrap(); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`End`]: Event::End + /// [`read_to_end()`]: Self::read_to_end + /// [`check_end_names`]: Self::check_end_names + /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag + pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec) -> Result<()> { + let mut depth = 0; + loop { + buf.clear(); + match self.read_event_into(buf) { + Err(e) => return Err(e), + + Ok(Event::Start(e)) if e.name() == end => depth += 1, + Ok(Event::End(e)) if e.name() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Eof) => { + let name = self.decoder().decode(end.as_ref()); + return Err(Error::UnexpectedEof(format!("", name))); + } + _ => (), + } + } + } + + /// Reads optional text between start and end tags. + /// + /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a + /// `String`. If the next event is an [`End`] event, returns the empty string. In all other + /// cases, returns an error. 
+ /// + /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 + /// if none is specified). + /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let mut xml = Reader::from_reader(b" + /// <b> + /// + /// " as &[u8]); + /// xml.trim_text(true); + /// + /// let expected = ["", ""]; + /// for &content in expected.iter() { + /// match xml.read_event_into(&mut Vec::new()) { + /// Ok(Event::Start(ref e)) => { + /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content); + /// }, + /// e => panic!("Expecting Start event, found {:?}", e), + /// } + /// } + /// ``` + /// + /// [`Text`]: Event::Text + /// [`End`]: Event::End + pub fn read_text_into(&mut self, end: QName, buf: &mut Vec) -> Result { + let s = match self.read_event_into(buf) { + Err(e) => return Err(e), + + Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), + Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), + Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), + _ => return Err(Error::TextNotFound), + }; + self.read_to_end_into(end, buf)?; + Ok(s) + } + + /// Reads the next event and resolves its namespace (if applicable). 
+ /// + /// # Examples + /// + /// ``` + /// use std::str::from_utf8; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// use quick_xml::name::ResolveResult::*; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// let mut reader = Reader::from_reader(xml.as_bytes()); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut buf = Vec::new(); + /// let mut ns_buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_namespaced_event(&mut buf, &mut ns_buf) { + /// Ok((Bound(ns), Event::Start(e))) => { + /// count += 1; + /// match (ns.as_ref(), e.local_name().as_ref()) { + /// (b"www.xxxx", b"tag1") => (), + /// (b"www.yyyy", b"tag2") => (), + /// (ns, n) => panic!("Namespace and local name mismatch"), + /// } + /// println!("Resolved namespace: {:?}", ns); + /// } + /// Ok((Unbound, Event::Start(_))) => { + /// panic!("Element not in any namespace") + /// }, + /// Ok((Unknown(p), Event::Start(_))) => { + /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) + /// } + /// Ok((_, Event::Text(e))) => { + /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// }, + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok((_, Event::Eof)) => break, + /// _ => (), + /// } + /// buf.clear(); + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// ``` + pub fn read_namespaced_event<'b, 'ns>( + &mut self, + buf: &'b mut Vec, + namespace_buffer: &'ns mut Vec, + ) -> Result<(ResolveResult<'ns>, Event<'b>)> { + if self.pending_pop { + self.ns_resolver.pop(namespace_buffer); + } + self.pending_pop = false; + let event = self.read_event_into(buf); + self.resolve_namespaced_event_inner(event, namespace_buffer) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::test::check; + + fn input_from_bytes(bytes: &[u8]) -> IoReader<&[u8]> { + IoReader(bytes) + } + + fn reader_from_str(s: 
&str) -> Reader> { + Reader::from_reader_internal(IoReader(s.as_bytes())) + } + + #[allow(dead_code)] + fn reader_from_bytes(s: &[u8]) -> Reader> { + Reader::from_reader_internal(IoReader(s)) + } + + check!(let mut buf = Vec::new();); +} diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs new file mode 100644 index 00000000..fbdec898 --- /dev/null +++ b/src/reader/slice_reader.rs @@ -0,0 +1,523 @@ +//! This is an implementation of [`Reader`] for reading from a `&[u8]` as +//! underlying byte stream. This implementation supports not using an +//! intermediate buffer as the byte slice itself can be used to borrow from. + +use std::ops::{Deref, DerefMut}; + +#[cfg(feature = "encoding")] +use encoding_rs::UTF_8; + +use crate::events::{BytesText, Event}; +use crate::name::{QName, ResolveResult}; +use crate::{Error, Result}; + +#[cfg(feature = "encoding")] +use super::{detect_encoding, EncodingRef}; +use super::{is_whitespace, BangType, InnerReader, ReadElementState, Reader, TagState}; + +/// A struct for handling reading functions based on reading from a byte slice. +#[derive(Debug, Clone, Copy)] +pub struct SliceReader<'buf>(&'buf [u8]); + +impl<'buf> Deref for SliceReader<'buf> { + type Target = &'buf [u8]; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl<'buf> DerefMut for SliceReader<'buf> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl<'buf> InnerReader for SliceReader<'buf> { + type Reader = &'buf [u8]; + + fn into_inner(self) -> Self::Reader { + self.0 + } +} + +/// Private reading functions for a [`SliceReader`]. 
+impl<'buf> SliceReader<'buf> { + fn read_bytes_until( + &mut self, + byte: u8, + _buf: &mut (), + position: &mut usize, + ) -> Result> { + if self.is_empty() { + return Ok(None); + } + + Ok(Some(if let Some(i) = memchr::memchr(byte, self) { + *position += i + 1; + let bytes = &self[..i]; + self.0 = &self[i + 1..]; + bytes + } else { + *position += self.len(); + let bytes = &self[..]; + self.0 = &[]; + bytes + })) + } + + fn read_bang_element( + &mut self, + _buf: &mut (), + position: &mut usize, + ) -> Result> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. + debug_assert_eq!(self[0], b'!'); + + let bang_type = BangType::new(self[1..].first().copied())?; + + if let Some((bytes, i)) = bang_type.parse(self, 0) { + *position += i; + self.0 = &self[i..]; + return Ok(Some((bang_type, bytes))); + } + + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + Err(bang_type.to_err()) + } + + fn read_element(&mut self, _buf: &mut (), position: &mut usize) -> Result> { + if self.is_empty() { + return Ok(None); + } + + let mut state = ReadElementState::Elem; + + if let Some((bytes, i)) = state.change(self) { + *position += i; + self.0 = &self[i..]; + return Ok(Some(bytes)); + } + + // Note: Do not update position, so the error points to a sane place + // rather than at the EOF. 
+ Err(Error::UnexpectedEof("Element".to_string())) + + // FIXME: Figure out why the other one works without UnexpectedEof + } + + fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + let whitespaces = self + .iter() + .position(|b| !is_whitespace(*b)) + .unwrap_or(self.len()); + *position += whitespaces; + self.0 = &self[whitespaces..]; + Ok(()) + } + + fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + if self.first() == Some(&byte) { + self.0 = &self[1..]; + *position += 1; + Ok(true) + } else { + Ok(false) + } + } + + fn peek_one(&mut self) -> Result> { + Ok(self.first().copied()) + } +} + +/// Private functions for a [`Reader`] based on a [`SliceReader`]. +impl<'buf> Reader> { + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. + fn read_event_impl(&mut self, _buf: &mut ()) -> Result> { + let event = match self.tag_state { + TagState::Init => self.read_until_open(&mut (), true), + TagState::Closed => self.read_until_open(&mut (), false), + TagState::Opened => self.read_until_close(&mut ()), + TagState::Empty => self.close_expanded_empty(), + TagState::Exit => return Ok(Event::Eof), + }; + match event { + Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, + _ => {} + } + event + } + + /// Read until '<' is found and moves reader to an `Opened` state. + /// + /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + fn read_until_open(&mut self, _buf: &mut (), first: bool) -> Result> { + self.tag_state = TagState::Opened; + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position)?; + } + + // If we already at the `<` symbol, do not try to return an empty Text event + if self.reader.skip_one(b'<', &mut self.buf_position)? 
{ + return self.read_event_impl(&mut ()); + } + + match self + .reader + .read_bytes_until(b'<', &mut (), &mut self.buf_position) + { + Ok(Some(bytes)) => { + #[cfg(feature = "encoding")] + if first && self.encoding.can_be_refined() { + if let Some(encoding) = detect_encoding(bytes) { + self.encoding = EncodingRef::BomDetected(encoding); + } + } + + let content = if self.trim_text_end { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) + } + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } + + /// Private function to read until `>` is found. This function expects that + /// it was called just after encounter a `<` symbol. + fn read_until_close(&mut self, _buf: &mut ()) -> Result> { + self.tag_state = TagState::Closed; + + match self.reader.peek_one() { + // ` match self + .reader + .read_bang_element(&mut (), &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), + Err(e) => Err(e), + }, + // ` { + match self + .reader + .read_bytes_until(b'>', &mut (), &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + } + } + // ` { + match self + .reader + .read_bytes_until(b'>', &mut (), &mut self.buf_position) + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + } + } + // `<...` - opening or self-closed tag + Ok(Some(_)) => match self.reader.read_element(&mut (), &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_start(bytes), + Err(e) => Err(e), + }, + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } +} + +/// Builder for reading from a slice of bytes. 
+impl<'buf> Reader> { + /// Creates an XML reader from a string slice. + pub fn from_str(s: &'buf str) -> Self { + #[cfg_attr(not(feature = "encoding"), allow(unused_mut))] + let mut reader = Self::from_reader_internal(SliceReader(s.as_bytes())); + + // Rust strings are guaranteed to be UTF-8, so lock the encoding + #[cfg(feature = "encoding")] + { + reader.encoding = EncodingRef::Explicit(UTF_8); + } + + reader + } + + /// Creates an XML reader from a slice of bytes. + pub fn from_bytes(s: &'buf [u8]) -> Self { + Self::from_reader_internal(SliceReader(s)) + } +} + +/// Public reading methods for a [`Reader`] based on an [`SliceReader`]. +impl<'buf> Reader> { + /// Read an event that borrows from the input rather than a buffer. + #[inline] + pub fn read_event(&mut self) -> Result> { + self.read_event_impl(&mut ()) + } + + /// Temporary helper to keep both `read_event` and `read_event_into` available for reading + /// from `&[u8]`. + #[inline] + pub fn read_event_into(&mut self, _buf: &mut Vec) -> Result> { + self.read_event() + } + + /// Reads until end element is found. This function is supposed to be called + /// after you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] + /// will be returned. In particularly, that error will be returned if you call + /// this method without consuming the corresponding [`Start`] event first. + /// + /// The `end` parameter should contain name of the end element _in the reader + /// encoding_. It is good practice to always get that parameter using + /// [`BytesStart::to_end()`] method. + /// + /// The correctness of the skipped events does not checked, if you disabled + /// the [`check_end_names`] option. + /// + /// # Namespaces + /// + /// While the [`Reader`] does not support namespace resolution, namespaces + /// does not change the algorithm for comparing names. 
Although the names + /// `a:name` and `b:name` where both prefixes `a` and `b` resolve to the + /// same namespace, are semantically equivalent, `</b:name>` cannot close + /// `<a:name>`, because according to [the specification] + /// + /// > The end of every element that begins with a **start-tag** MUST be marked + /// > by an **end-tag** containing a name that echoes the element's type as + /// > given in the **start-tag** + /// + /// # Examples + /// + /// This example shows how you can skip XML content after you read the + /// start event. + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::Reader; + /// + /// let mut reader = Reader::from_str(r#" + /// <outer> + /// <inner> + /// <inner></inner> + /// <inner/> + /// <outer></outer> + /// <outer/> + /// </inner> + /// </outer> + /// "#); + /// reader.trim_text(true); + /// + /// let start = BytesStart::borrowed_name(b"outer"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event().unwrap(), Event::Start(start)); + /// + /// //...then, we could skip all events to the corresponding end event. + /// // This call will correctly handle nested elements. + /// // Note, however, that this method does not handle namespaces.
+ /// reader.read_to_end(end.name()).unwrap(); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event().unwrap(), Event::Eof); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`End`]: Event::End + /// [`check_end_names`]: Self::check_end_names + /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag + pub fn read_to_end(&mut self, end: QName) -> Result<()> { + let mut depth = 0; + loop { + match self.read_event() { + Err(e) => return Err(e), + + Ok(Event::Start(e)) if e.name() == end => depth += 1, + Ok(Event::End(e)) if e.name() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Eof) => { + let name = self.decoder().decode(end.as_ref()); + return Err(Error::UnexpectedEof(format!("", name))); + } + _ => (), + } + } + } + + /// Temporary helper to keep both `read_to_end` and `read_to_end_into` available for reading + /// from `&[u8]`. + pub fn read_to_end_into(&mut self, end: QName, _buf: &mut Vec) -> Result<()> { + self.read_to_end(end) + } + + /// Reads optional text between start and end tags. + /// + /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a + /// `String`. If the next event is an [`End`] event, returns the empty string. In all other + /// cases, returns an error. + /// + /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 + /// if none is specified). 
+ /// + /// # Examples + /// + /// ``` + /// # use pretty_assertions::assert_eq; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let mut xml = Reader::from_reader(b" + /// <b> + /// + /// " as &[u8]); + /// xml.trim_text(true); + /// + /// let expected = ["", ""]; + /// for &content in expected.iter() { + /// match xml.read_event_into(&mut Vec::new()) { + /// Ok(Event::Start(ref e)) => { + /// assert_eq!(&xml.read_text_into(e.name(), &mut Vec::new()).unwrap(), content); + /// }, + /// e => panic!("Expecting Start event, found {:?}", e), + /// } + /// } + /// ``` + /// + /// [`Text`]: Event::Text + /// [`End`]: Event::End + pub fn read_text(&mut self, end: QName) -> Result { + let s = match self.read_event() { + Err(e) => return Err(e), + + Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), + Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), + Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), + _ => return Err(Error::TextNotFound), + }; + self.read_to_end(end)?; + Ok(s) + } + + /// Temporary helper to keep both `read_text` and `read_text_into` available for reading + /// from `&[u8]`. + pub fn read_text_into(&mut self, end: QName, _buf: &mut Vec) -> Result { + self.read_text(end) + } + + /// Reads the next event and resolves its namespace (if applicable). 
+ /// + /// # Examples + /// + /// ``` + /// use std::str::from_utf8; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// use quick_xml::name::ResolveResult::*; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// let mut reader = Reader::from_str(xml); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut buf = Vec::new(); + /// let mut ns_buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_namespaced_event(&mut buf, &mut ns_buf) { + /// Ok((Bound(ns), Event::Start(e))) => { + /// count += 1; + /// match (ns.as_ref(), e.local_name().as_ref()) { + /// (b"www.xxxx", b"tag1") => (), + /// (b"www.yyyy", b"tag2") => (), + /// (ns, n) => panic!("Namespace and local name mismatch"), + /// } + /// println!("Resolved namespace: {:?}", ns); + /// } + /// Ok((Unbound, Event::Start(_))) => { + /// panic!("Element not in any namespace") + /// }, + /// Ok((Unknown(p), Event::Start(_))) => { + /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) + /// } + /// Ok((_, Event::Text(e))) => { + /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// }, + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok((_, Event::Eof)) => break, + /// _ => (), + /// } + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// ``` + pub fn read_namespaced_event<'ns>( + &mut self, + _buf: &mut Vec, + namespace_buffer: &'ns mut Vec, + ) -> Result<(ResolveResult<'ns>, Event<'buf>)> { + if self.pending_pop { + self.ns_resolver.pop(namespace_buffer); + } + self.pending_pop = false; + let event = self.read_event(); + self.resolve_namespaced_event_inner(event, namespace_buffer) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::test::check; + + fn input_from_bytes<'buf>(bytes: &'buf [u8]) -> SliceReader<'buf> { + SliceReader(bytes) + } + + fn reader_from_str<'buf>(s: &'buf str) -> Reader> { 
+ Reader::from_str(s) + } + + #[allow(dead_code)] + fn reader_from_bytes<'buf>(s: &'buf [u8]) -> Reader> { + Reader::from_bytes(s) + } + + check!(let mut buf = ();); +} From b09495a0db79e10ebe864494ab1a62ed3d3a56bb Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 11:26:25 +0200 Subject: [PATCH 3/7] Remove buffered access for SliceReader as events always borrow from the input slice --- README.md | 10 ++-- benches/macrobenches.rs | 5 +- benches/microbenches.rs | 70 +++++++++------------------- examples/custom_entities.rs | 3 +- examples/read_texts.rs | 6 +-- src/lib.rs | 10 ++-- src/reader.rs | 9 ++-- src/reader/slice_reader.rs | 93 ++++++++++--------------------------- src/writer.rs | 4 +- tests/namespaces.rs | 32 ++++++------- tests/test.rs | 14 ++---- tests/unit_tests.rs | 77 ++++++++++-------------------- 12 files changed, 112 insertions(+), 221 deletions(-) diff --git a/README.md b/README.md index 4cbcac0d..0299e615 100644 --- a/README.md +++ b/README.md @@ -30,7 +30,11 @@ let xml = r#" "#; -let mut reader = Reader::from_str(xml); +let mut reader = Reader::from_reader(xml.as_bytes()); +// If you want to read from a string or byte slice without buffering, use: +// let mut reader = Reader::from_str(xml); +// In that case, `Vec` is *not* needed for buffering below and you should use +// `read_event` instead of `read_event_into`. reader.trim_text(true); let mut count = 0; @@ -75,9 +79,8 @@ let xml = r#"text"#; let mut reader = Reader::from_str(xml); reader.trim_text(true); let mut writer = Writer::new(Cursor::new(Vec::new())); -let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf) { + match reader.read_event() { Ok(Event::Start(ref e)) if e.name() == b"this_tag" => { // crates a new element ... 
alternatively we could reuse `e` by calling @@ -101,7 +104,6 @@ loop { Ok(e) => assert!(writer.write_event(&e).is_ok()), Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), } - buf.clear(); } let result = writer.into_inner().into_inner(); diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 4cb02ffe..a4e2719e 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -19,10 +19,9 @@ static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml"); // TODO: read the namespaces too // TODO: use fully normalized attribute values fn parse_document(doc: &[u8]) -> XmlResult<()> { - let mut r = Reader::from_reader(doc); - let mut buf = Vec::new(); + let mut r = Reader::from_bytes(doc); loop { - match r.read_event_into(&mut buf)? { + match r.read_event()? { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { criterion::black_box(attr?.decode_and_unescape_value(&r)?); diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 8bbe1a67..ee52b27b 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -29,17 +29,15 @@ fn read_event(c: &mut Criterion) { let mut group = c.benchmark_group("read_event"); group.bench_function("trim_text = false", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_bytes(SAMPLE); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -50,19 +48,17 @@ fn read_event(c: &mut Criterion) { group.bench_function("trim_text = true", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_bytes(SAMPLE); r.check_end_names(false) .check_comments(false) .trim_text(true); let mut count = criterion::black_box(0); - let mut buf = 
Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -79,18 +75,16 @@ fn read_namespaced_event(c: &mut Criterion) { let mut group = c.benchmark_group("read_namespaced_event"); group.bench_function("trim_text = false", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_bytes(SAMPLE); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -101,20 +95,18 @@ fn read_namespaced_event(c: &mut Criterion) { group.bench_function("trim_text = true", |b| { b.iter(|| { - let mut r = Reader::from_reader(SAMPLE); + let mut r = Reader::from_bytes(SAMPLE); r.check_end_names(false) .check_comments(false) .trim_text(true); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); loop { - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((_, Event::Start(_))) | Ok((_, Event::Empty(_))) => count += 1, Ok((_, Event::Eof)) => break, _ => (), } - buf.clear(); } assert_eq!( count, 1550, @@ -130,78 +122,66 @@ fn one_event(c: &mut Criterion) { let mut group = c.benchmark_group("One event"); group.bench_function("StartText", |b| { let src = "Hello world!".repeat(512 / 12).into_bytes(); - let mut buf = Vec::with_capacity(1024); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_bytes(src.as_ref()); let mut nbtxt = criterion::black_box(0); r.check_end_names(false).check_comments(false); - match 
r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::StartText(e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 504); }) }); group.bench_function("Start", |b| { let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes(); - let mut buf = Vec::with_capacity(1024); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_bytes(src.as_ref()); let mut nbtxt = criterion::black_box(0); r.check_end_names(false) .check_comments(false) .trim_text(true); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Start(ref e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 525); }) }); group.bench_function("Comment", |b| { let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes(); - let mut buf = Vec::with_capacity(1024); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_bytes(src.as_ref()); let mut nbtxt = criterion::black_box(0); r.check_end_names(false) .check_comments(false) .trim_text(true); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Comment(e)) => nbtxt += e.decode_and_unescape(&r).unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 520); }) }); group.bench_function("CData", |b| { let src = format!(r#""#, "world".repeat(512 / 5)).into_bytes(); - let mut buf = Vec::with_capacity(1024); b.iter(|| { - let mut r = Reader::from_reader(src.as_ref()); + let mut r = Reader::from_bytes(src.as_ref()); let mut nbtxt = criterion::black_box(0); r.check_end_names(false) .check_comments(false) .trim_text(true); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::CData(ref e)) => nbtxt += e.len(), something_else => panic!("Did not expect {:?}", something_else), }; - buf.clear(); - assert_eq!(nbtxt, 518); }) }); @@ 
-213,12 +193,11 @@ fn attributes(c: &mut Criterion) { let mut group = c.benchmark_group("attributes"); group.bench_function("with_checks = true", |b| { b.iter(|| { - let mut r = Reader::from_reader(PLAYERS); + let mut r = Reader::from_bytes(PLAYERS); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Empty(e)) => { for attr in e.attributes() { let _attr = attr.unwrap(); @@ -228,7 +207,6 @@ fn attributes(c: &mut Criterion) { Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!(count, 1041); }) @@ -236,12 +214,11 @@ fn attributes(c: &mut Criterion) { group.bench_function("with_checks = false", |b| { b.iter(|| { - let mut r = Reader::from_reader(PLAYERS); + let mut r = Reader::from_bytes(PLAYERS); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Empty(e)) => { for attr in e.attributes().with_checks(false) { let _attr = attr.unwrap(); @@ -251,7 +228,6 @@ fn attributes(c: &mut Criterion) { Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!(count, 1041); }) @@ -259,12 +235,11 @@ fn attributes(c: &mut Criterion) { group.bench_function("try_get_attribute", |b| { b.iter(|| { - let mut r = Reader::from_reader(PLAYERS); + let mut r = Reader::from_bytes(PLAYERS); r.check_end_names(false).check_comments(false); let mut count = criterion::black_box(0); - let mut buf = Vec::new(); loop { - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Event::Empty(e)) if e.name() == QName(b"player") => { for name in ["num", "status", "avg"] { if let Some(_attr) = e.try_get_attribute(name).unwrap() { @@ -279,7 +254,6 @@ fn attributes(c: &mut Criterion) { Ok(Event::Eof) => break, _ => (), } - buf.clear(); } assert_eq!(count, 150); }) diff --git 
a/examples/custom_entities.rs b/examples/custom_entities.rs index 4d59d49e..3c31d4d1 100644 --- a/examples/custom_entities.rs +++ b/examples/custom_entities.rs @@ -27,12 +27,11 @@ fn main() -> Result<(), Box> { let mut reader = Reader::from_str(DATA); reader.trim_text(true); - let mut buf = Vec::new(); let mut custom_entities: HashMap = HashMap::new(); let entity_re = Regex::new(r#""#)?; loop { - match reader.read_event_into(&mut buf) { + match reader.read_event() { Ok(Event::DocType(ref e)) => { for cap in entity_re.captures_iter(&e) { custom_entities.insert( diff --git a/examples/read_texts.rs b/examples/read_texts.rs index 40d71e63..70be0b5c 100644 --- a/examples/read_texts.rs +++ b/examples/read_texts.rs @@ -10,14 +10,13 @@ fn main() { reader.trim_text(true); let mut txt = Vec::new(); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf) { + match reader.read_event() { Ok(Event::Start(ref e)) if e.name().as_ref() == b"tag2" => { txt.push( reader - .read_text_into(QName(b"tag2"), &mut Vec::new()) + .read_text(QName(b"tag2")) .expect("Cannot decode text value"), ); println!("{:?}", txt); @@ -26,6 +25,5 @@ fn main() { Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), _ => (), // There are several other `Event`s we do not consider here } - buf.clear(); } } diff --git a/src/lib.rs b/src/lib.rs index 19723764..26436786 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,7 +35,11 @@ //! //! "#; //! -//! let mut reader = Reader::from_str(xml); +//! let mut reader = Reader::from_reader(xml.as_bytes()); +//! // If you want to read from a string or byte slice without buffering, use: +//! // let mut reader = Reader::from_str(xml); +//! // In that case, `Vec` is *not* needed for buffering below and you should use +//! // `read_event` instead of `read_event_into`. //! reader.trim_text(true); //! //! let mut count = 0; @@ -84,9 +88,8 @@ //! let mut reader = Reader::from_str(xml); //! reader.trim_text(true); //! 
let mut writer = Writer::new(Cursor::new(Vec::new())); -//! let mut buf = Vec::new(); //! loop { -//! match reader.read_event_into(&mut buf) { +//! match reader.read_event() { //! Ok(Event::Start(ref e)) if e.name().as_ref() == b"this_tag" => { //! //! // crates a new element ... alternatively we could reuse `e` by calling @@ -111,7 +114,6 @@ //! // Ok(e) => assert!(writer.write(&buf).is_ok()), //! Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), //! } -//! buf.clear(); //! } //! //! let result = writer.into_inner().into_inner(); diff --git a/src/reader.rs b/src/reader.rs index c80ae082..fa9c0cdd 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -138,9 +138,8 @@ pub trait InnerReader: Deref + DerefMut { /// reader.trim_text(true); /// let mut count = 0; /// let mut txt = Vec::new(); -/// let mut buf = Vec::new(); /// loop { -/// match reader.read_event_into(&mut buf) { +/// match reader.read_event() { /// Ok(Event::Start(ref e)) => { /// match e.name().as_ref() { /// b"tag1" => println!("attributes values: {:?}", @@ -155,7 +154,6 @@ pub trait InnerReader: Deref + DerefMut { /// Ok(Event::Eof) => break, /// _ => (), /// } -/// buf.clear(); /// } /// ``` #[derive(Clone)] @@ -1892,13 +1890,12 @@ mod test { #[test] fn str_always_has_utf8() { let mut reader = crate::Reader::from_str(""); - let mut buf = Vec::new(); assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_into(&mut buf).unwrap(); + reader.read_event().unwrap(); assert_eq!(reader.decoder().encoding(), UTF_8); - assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof); + assert_eq!(reader.read_event().unwrap(), Event::Eof); } } } diff --git a/src/reader/slice_reader.rs b/src/reader/slice_reader.rs index fbdec898..c6639cd2 100644 --- a/src/reader/slice_reader.rs +++ b/src/reader/slice_reader.rs @@ -43,12 +43,7 @@ impl<'buf> InnerReader for SliceReader<'buf> { /// Private reading functions for a [`SliceReader`]. 
impl<'buf> SliceReader<'buf> { - fn read_bytes_until( - &mut self, - byte: u8, - _buf: &mut (), - position: &mut usize, - ) -> Result> { + fn read_bytes_until(&mut self, byte: u8, position: &mut usize) -> Result> { if self.is_empty() { return Ok(None); } @@ -68,7 +63,6 @@ impl<'buf> SliceReader<'buf> { fn read_bang_element( &mut self, - _buf: &mut (), position: &mut usize, ) -> Result> { // Peeked one bang ('!') before being called, so it's guaranteed to @@ -88,7 +82,7 @@ impl<'buf> SliceReader<'buf> { Err(bang_type.to_err()) } - fn read_element(&mut self, _buf: &mut (), position: &mut usize) -> Result> { + fn read_element(&mut self, position: &mut usize) -> Result> { if self.is_empty() { return Ok(None); } @@ -138,11 +132,11 @@ impl<'buf> Reader> { /// Read text into the given buffer, and return an event that borrows from /// either that buffer or from the input itself, based on the type of the /// reader. - fn read_event_impl(&mut self, _buf: &mut ()) -> Result> { + fn read_event_impl(&mut self) -> Result> { let event = match self.tag_state { - TagState::Init => self.read_until_open(&mut (), true), - TagState::Closed => self.read_until_open(&mut (), false), - TagState::Opened => self.read_until_close(&mut ()), + TagState::Init => self.read_until_open(true), + TagState::Closed => self.read_until_open(false), + TagState::Opened => self.read_until_close(), TagState::Empty => self.close_expanded_empty(), TagState::Exit => return Ok(Event::Eof), }; @@ -156,7 +150,7 @@ impl<'buf> Reader> { /// Read until '<' is found and moves reader to an `Opened` state. 
/// /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise - fn read_until_open(&mut self, _buf: &mut (), first: bool) -> Result> { + fn read_until_open(&mut self, first: bool) -> Result> { self.tag_state = TagState::Opened; if self.trim_text_start { @@ -165,13 +159,10 @@ impl<'buf> Reader> { // If we already at the `<` symbol, do not try to return an empty Text event if self.reader.skip_one(b'<', &mut self.buf_position)? { - return self.read_event_impl(&mut ()); + return self.read_event_impl(); } - match self - .reader - .read_bytes_until(b'<', &mut (), &mut self.buf_position) - { + match self.reader.read_bytes_until(b'<', &mut self.buf_position) { Ok(Some(bytes)) => { #[cfg(feature = "encoding")] if first && self.encoding.can_be_refined() { @@ -204,43 +195,30 @@ impl<'buf> Reader> { /// Private function to read until `>` is found. This function expects that /// it was called just after encounter a `<` symbol. - fn read_until_close(&mut self, _buf: &mut ()) -> Result> { + fn read_until_close(&mut self) -> Result> { self.tag_state = TagState::Closed; match self.reader.peek_one() { // ` match self - .reader - .read_bang_element(&mut (), &mut self.buf_position) - { + Ok(Some(b'!')) => match self.reader.read_bang_element(&mut self.buf_position) { Ok(None) => Ok(Event::Eof), Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), Err(e) => Err(e), }, // ` { - match self - .reader - .read_bytes_until(b'>', &mut (), &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_end(bytes), - Err(e) => Err(e), - } - } + Ok(Some(b'/')) => match self.reader.read_bytes_until(b'>', &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + }, // ` { - match self - .reader - .read_bytes_until(b'>', &mut (), &mut self.buf_position) - { - Ok(None) => Ok(Event::Eof), - Ok(Some(bytes)) => self.read_question_mark(bytes), - Err(e) => Err(e), - } - } + 
Ok(Some(b'?')) => match self.reader.read_bytes_until(b'>', &mut self.buf_position) { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + }, // `<...` - opening or self-closed tag - Ok(Some(_)) => match self.reader.read_element(&mut (), &mut self.buf_position) { + Ok(Some(_)) => match self.reader.read_element(&mut self.buf_position) { Ok(None) => Ok(Event::Eof), Ok(Some(bytes)) => self.read_start(bytes), Err(e) => Err(e), @@ -278,14 +256,7 @@ impl<'buf> Reader> { /// Read an event that borrows from the input rather than a buffer. #[inline] pub fn read_event(&mut self) -> Result> { - self.read_event_impl(&mut ()) - } - - /// Temporary helper to keep both `read_event` and `read_event_into` available for reading - /// from `&[u8]`. - #[inline] - pub fn read_event_into(&mut self, _buf: &mut Vec) -> Result> { - self.read_event() + self.read_event_impl() } /// Reads until end element is found. This function is supposed to be called @@ -379,12 +350,6 @@ impl<'buf> Reader> { } } - /// Temporary helper to keep both `read_to_end` and `read_to_end_into` available for reading - /// from `&[u8]`. - pub fn read_to_end_into(&mut self, end: QName, _buf: &mut Vec) -> Result<()> { - self.read_to_end(end) - } - /// Reads optional text between start and end tags. /// /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a @@ -433,12 +398,6 @@ impl<'buf> Reader> { Ok(s) } - /// Temporary helper to keep both `read_text` and `read_text_into` available for reading - /// from `&[u8]`. - pub fn read_text_into(&mut self, end: QName, _buf: &mut Vec) -> Result { - self.read_text(end) - } - /// Reads the next event and resolves its namespace (if applicable). 
/// /// # Examples @@ -456,11 +415,10 @@ impl<'buf> Reader> { /// let mut reader = Reader::from_str(xml); /// reader.trim_text(true); /// let mut count = 0; - /// let mut buf = Vec::new(); /// let mut ns_buf = Vec::new(); /// let mut txt = Vec::new(); /// loop { - /// match reader.read_namespaced_event(&mut buf, &mut ns_buf) { + /// match reader.read_namespaced_event(&mut ns_buf) { /// Ok((Bound(ns), Event::Start(e))) => { /// count += 1; /// match (ns.as_ref(), e.local_name().as_ref()) { @@ -489,7 +447,6 @@ impl<'buf> Reader> { /// ``` pub fn read_namespaced_event<'ns>( &mut self, - _buf: &mut Vec, namespace_buffer: &'ns mut Vec, ) -> Result<(ResolveResult<'ns>, Event<'buf>)> { if self.pending_pop { @@ -519,5 +476,5 @@ mod test { Reader::from_bytes(s) } - check!(let mut buf = ();); + check!(); } diff --git a/src/writer.rs b/src/writer.rs index 54579808..a89de58a 100644 --- a/src/writer.rs +++ b/src/writer.rs @@ -20,9 +20,8 @@ use std::io::Write; /// let mut reader = Reader::from_str(xml); /// reader.trim_text(true); /// let mut writer = Writer::new(Cursor::new(Vec::new())); -/// let mut buf = Vec::new(); /// loop { -/// match reader.read_event_into(&mut buf) { +/// match reader.read_event() { /// Ok(Event::Start(ref e)) if e.name().as_ref() == b"this_tag" => { /// /// // crates a new element ... 
alternatively we could reuse `e` by calling @@ -46,7 +45,6 @@ use std::io::Write; /// Ok(e) => assert!(writer.write_event(&e).is_ok()), /// Err(e) => panic!("{}", e), /// } -/// buf.clear(); /// } /// /// let result = writer.into_inner().into_inner(); diff --git a/tests/namespaces.rs b/tests/namespaces.rs index 4729f2c7..911c5328 100644 --- a/tests/namespaces.rs +++ b/tests/namespaces.rs @@ -11,11 +11,10 @@ fn namespace() { let mut r = Reader::from_str("in namespace!"); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Unbound), e => panic!( "expecting outer start element with no namespace, got {:?}", @@ -24,7 +23,7 @@ fn namespace() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner start element with to resolve to 'www1', got {:?}", @@ -32,13 +31,13 @@ fn namespace() { ), } // "in namespace!" - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { //TODO: Check in specification, it is true that namespace should be empty? 
Ok((ns, Text(_))) => assert_eq!(ns, Unbound), e => panic!("expecting text content with no namespace, got {:?}", e), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner end element with to resolve to 'www1', got {:?}", @@ -47,7 +46,7 @@ fn namespace() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Unbound), e => panic!("expecting outer end element with no namespace, got {:?}", e), } @@ -58,11 +57,10 @@ fn default_namespace() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Unbound), e => panic!( "expecting outer start element with no namespace, got {:?}", @@ -71,7 +69,7 @@ fn default_namespace() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner start element with to resolve to 'www1', got {:?}", @@ -79,7 +77,7 @@ fn default_namespace() { ), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting inner end element with to resolve to 'www1', got {:?}", @@ -89,7 +87,7 @@ fn default_namespace() { // very important: a should not be in any namespace. The default namespace only applies to // the sub-document it is defined on. 
- match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Unbound), e => panic!("expecting outer end element with no namespace, got {:?}", e), } @@ -100,11 +98,10 @@ fn default_namespace_reset() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting outer start element with to resolve to 'www1', got {:?}", @@ -113,7 +110,7 @@ fn default_namespace_reset() { } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Start(_))) => assert_eq!(ns, Unbound), e => panic!( "expecting inner start element with no namespace, got {:?}", @@ -121,13 +118,13 @@ fn default_namespace_reset() { ), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Unbound), e => panic!("expecting inner end element with no namespace, got {:?}", e), } // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, End(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "expecting outer end element with to resolve to 'www1', got {:?}", @@ -363,11 +360,10 @@ fn reserved_name() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); let mut ns_buf = Vec::new(); // - match r.read_namespaced_event(&mut buf, &mut ns_buf) { + match r.read_namespaced_event(&mut ns_buf) { Ok((ns, Empty(_))) => assert_eq!(ns, Bound(Namespace(b"www1"))), e => panic!( "Expected empty element bound to namespace 'www1', got {:?}", diff --git a/tests/test.rs b/tests/test.rs index f296d106..dce9aa1e 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -212,20 +212,16 
@@ fn test_trim() { fn test_clone_reader() { let mut reader = Reader::from_str("text"); reader.trim_text(true); - let mut buf = Vec::new(); - assert!(matches!( - reader.read_event_into(&mut buf).unwrap(), - Start(_) - )); + assert!(matches!(reader.read_event().unwrap(), Start(_))); let mut cloned = reader.clone(); - assert!(matches!(reader.read_event_into(&mut buf).unwrap(), Text(_))); - assert!(matches!(reader.read_event_into(&mut buf).unwrap(), End(_))); + assert!(matches!(reader.read_event().unwrap(), Text(_))); + assert!(matches!(reader.read_event().unwrap(), End(_))); - assert!(matches!(cloned.read_event_into(&mut buf).unwrap(), Text(_))); - assert!(matches!(cloned.read_event_into(&mut buf).unwrap(), End(_))); + assert!(matches!(cloned.read_event().unwrap(), Text(_))); + assert!(matches!(cloned.read_event().unwrap(), End(_))); } #[cfg(feature = "serialize")] diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index 001fce2a..b159906a 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -12,8 +12,7 @@ use pretty_assertions::assert_eq; macro_rules! next_eq_name { ($r:expr, $t:tt, $bytes:expr) => { - let mut buf = Vec::new(); - match $r.read_event_into(&mut buf).unwrap() { + match $r.read_event().unwrap() { $t(ref e) if e.name().as_ref() == $bytes => (), e => panic!( "expecting {}({:?}), found {:?}", @@ -22,14 +21,12 @@ macro_rules! next_eq_name { e ), } - buf.clear(); }; } macro_rules! next_eq_content { ($r:expr, $t:tt, $bytes:expr) => { - let mut buf = Vec::new(); - match $r.read_event_into(&mut buf).unwrap() { + match $r.read_event().unwrap() { $t(ref e) if e.as_ref() == $bytes => (), e => panic!( "expecting {}({:?}), found {:?}", @@ -38,7 +35,6 @@ macro_rules! 
next_eq_content { e ), } - buf.clear(); }; } @@ -130,8 +126,7 @@ fn test_comment() { fn test_xml_decl() { let mut r = Reader::from_str(""); r.trim_text(true); - let mut buf = Vec::new(); - match r.read_event_into(&mut buf).unwrap() { + match r.read_event().unwrap() { Decl(ref e) => { match e.version() { Ok(v) => assert_eq!( @@ -207,9 +202,8 @@ fn test_writer() -> Result<()> { let mut reader = Reader::from_str(txt); reader.trim_text(true); let mut writer = Writer::new(Cursor::new(Vec::new())); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf)? { + match reader.read_event()? { Eof => break, e => assert!(writer.write_event(e).is_ok()), } @@ -226,9 +220,8 @@ fn test_writer_borrow() -> Result<()> { let mut reader = Reader::from_str(txt); reader.trim_text(true); let mut writer = Writer::new(Cursor::new(Vec::new())); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf)? { + match reader.read_event()? { Eof => break, e => assert!(writer.write_event(&e).is_ok()), // either `e` or `&e` } @@ -249,9 +242,8 @@ fn test_writer_indent() -> Result<()> { let mut reader = Reader::from_str(txt); reader.trim_text(true); let mut writer = Writer::new_with_indent(Cursor::new(Vec::new()), b' ', 4); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf)? { + match reader.read_event()? { Eof => break, e => assert!(writer.write_event(e).is_ok()), } @@ -275,9 +267,8 @@ fn test_writer_indent_cdata() -> Result<()> { let mut reader = Reader::from_str(txt); reader.trim_text(true); let mut writer = Writer::new_with_indent(Cursor::new(Vec::new()), b' ', 4); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf)? { + match reader.read_event()? 
{ Eof => break, e => assert!(writer.write_event(e).is_ok()), } @@ -301,9 +292,8 @@ fn test_write_empty_element_attrs() -> Result<()> { let mut reader = Reader::from_str(str_from); reader.expand_empty_elements(false); let mut writer = Writer::new(Cursor::new(Vec::new())); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf)? { + match reader.read_event()? { Eof => break, e => assert!(writer.write_event(e).is_ok()), } @@ -323,9 +313,8 @@ fn test_write_attrs() -> Result<()> { let mut reader = Reader::from_str(str_from); reader.trim_text(true); let mut writer = Writer::new(Cursor::new(Vec::new())); - let mut buf = Vec::new(); loop { - let event = match reader.read_event_into(&mut buf)? { + let event = match reader.read_event()? { Eof => break, Start(elem) => { let mut attrs = elem.attributes().collect::>>()?; @@ -430,8 +419,7 @@ fn test_buf_position_err_end_element() { let mut r = Reader::from_str(""); r.trim_text(true).check_end_names(true); - let mut buf = Vec::new(); - match r.read_event_into(&mut buf) { + match r.read_event() { Err(_) if r.buffer_position() == 2 => (), // error at char 2: no opening tag Err(e) => panic!( "expecting buf_pos = 2, found {}, err: {:?}", @@ -450,8 +438,7 @@ fn test_buf_position_err_comment() { next_eq!(r, Start, b"a"); assert_eq!(r.buffer_position(), 3); - let mut buf = Vec::new(); - match r.read_event_into(&mut buf) { + match r.read_event() { // error at char 4: no closing --> tag found Err(e) => assert_eq!( r.buffer_position(), @@ -469,12 +456,10 @@ fn test_buf_position_err_comment_2_buf() { let mut r = Reader::from_str(" tag found Err(e) => assert_eq!( r.buffer_position(), @@ -495,8 +480,7 @@ fn test_buf_position_err_comment_trim_text() { next_eq!(r, Start, b"a"); assert_eq!(r.buffer_position(), 3); - let mut buf = Vec::new(); - match r.read_event_into(&mut buf) { + match r.read_event() { // error at char 7: no closing --> tag found Err(e) => assert_eq!( r.buffer_position(), @@ -514,8 +498,7 @@ fn 
test_escaped_content() { let mut r = Reader::from_str("<test>"); r.trim_text(true); next_eq!(r, Start, b"a"); - let mut buf = Vec::new(); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Text(e)) => { assert_eq!( &*e, @@ -556,9 +539,8 @@ fn test_read_write_roundtrip_results_in_identity() -> Result<()> { let mut reader = Reader::from_str(input); reader.trim_text(false).expand_empty_elements(false); let mut writer = Writer::new(Cursor::new(Vec::new())); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf)? { + match reader.read_event()? { Eof => break, e => assert!(writer.write_event(e).is_ok()), } @@ -583,9 +565,8 @@ fn test_read_write_roundtrip() -> Result<()> { let mut reader = Reader::from_str(input); reader.trim_text(false).expand_empty_elements(false); let mut writer = Writer::new(Cursor::new(Vec::new())); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf)? { + match reader.read_event()? { Eof => break, e => assert!(writer.write_event(e).is_ok()), } @@ -610,9 +591,8 @@ fn test_read_write_roundtrip_escape() -> Result<()> { let mut reader = Reader::from_str(input); reader.trim_text(false).expand_empty_elements(false); let mut writer = Writer::new(Cursor::new(Vec::new())); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf)? { + match reader.read_event()? { Eof => break, Text(e) => { let t = e.escape(); @@ -643,9 +623,8 @@ fn test_read_write_roundtrip_escape_text() -> Result<()> { let mut reader = Reader::from_str(input); reader.trim_text(false).expand_empty_elements(false); let mut writer = Writer::new(Cursor::new(Vec::new())); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf)? { + match reader.read_event()? 
{ Eof => break, Text(e) => { let t = e.decode_and_unescape(&reader).unwrap(); @@ -666,8 +645,7 @@ fn test_read_write_roundtrip_escape_text() -> Result<()> { fn test_closing_bracket_in_single_quote_attr() { let mut r = Reader::from_str(""); r.trim_text(true); - let mut buf = Vec::new(); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Start(e)) => { let mut attrs = e.attributes(); assert_eq!( @@ -695,8 +673,7 @@ fn test_closing_bracket_in_single_quote_attr() { fn test_closing_bracket_in_double_quote_attr() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Start(e)) => { let mut attrs = e.attributes(); assert_eq!( @@ -724,8 +701,7 @@ fn test_closing_bracket_in_double_quote_attr() { fn test_closing_bracket_in_double_quote_mixed() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Start(e)) => { let mut attrs = e.attributes(); assert_eq!( @@ -753,8 +729,7 @@ fn test_closing_bracket_in_double_quote_mixed() { fn test_closing_bracket_in_single_quote_mixed() { let mut r = Reader::from_str(r#""#); r.trim_text(true); - let mut buf = Vec::new(); - match r.read_event_into(&mut buf) { + match r.read_event() { Ok(Start(e)) => { let mut attrs = e.attributes(); assert_eq!( @@ -791,10 +766,9 @@ mod decode_with_bom_removal { reader.trim_text(true); let mut txt = Vec::new(); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf) { + match reader.read_event() { Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()), Ok(Eof) => break, _ => (), @@ -855,10 +829,9 @@ mod decode_with_bom_removal { reader.trim_text(true); let mut txt = Vec::new(); - let mut buf = Vec::new(); loop { - match reader.read_event_into(&mut buf) { + match reader.read_event() { Ok(StartText(e)) => 
txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()), Ok(Eof) => break, _ => (), From 51fd38f1fda531f92b37fba1df9ca3fcd0f4e509 Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 11:26:55 +0200 Subject: [PATCH 4/7] Add example for buffered access when reading from a file --- examples/read_buffered.rs | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 examples/read_buffered.rs diff --git a/examples/read_buffered.rs b/examples/read_buffered.rs new file mode 100644 index 00000000..25b28ee2 --- /dev/null +++ b/examples/read_buffered.rs @@ -0,0 +1,34 @@ +// This example demonstrates how a reader (for example when reading from a file) +// can be buffered. In that case, data read from the file is written to a supplied +// buffer and returned XML events borrow from that buffer. +// That way, allocations can be kept to a minimum. + +fn main() -> Result<(), quick_xml::Error> { + use quick_xml::events::Event; + use quick_xml::Reader; + + let mut reader = Reader::from_file("tests/documents/document.xml")?; + reader.trim_text(true); + + let mut buf = Vec::new(); + + let mut count = 0; + + loop { + match reader.read_event_into(&mut buf) { + Ok(Event::Start(ref e)) => { + let name = e.name(); + let name = reader.decoder().decode(name.as_ref())?; + println!("read start event {:?}", name.as_ref()); + count += 1; + } + Ok(Event::Eof) => break, // exits the loop when reaching end of file + Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + _ => (), // There are several other `Event`s we do not consider here + } + } + + println!("read {} start events in total", count); + + Ok(()) +} From 7d492597df91e1cc9fdab712d9c5edf1c45f4afe Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 11:33:18 +0200 Subject: [PATCH 5/7] Change the check! 
macro to allow for async tests --- src/reader.rs | 318 ++++++++++++++++++++++++++------------------------ 1 file changed, 167 insertions(+), 151 deletions(-) diff --git a/src/reader.rs b/src/reader.rs index fa9c0cdd..baf68b08 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -946,7 +946,12 @@ fn detect_encoding(bytes: &[u8]) -> Option<&'static Encoding> { #[cfg(test)] mod test { macro_rules! check { - ($(let mut $buf:ident = $init:expr;)?) => { + ($(let mut $buf:ident = $init:expr; $($await:tt)?)?) => { + check!(#[test] { + $(let mut $buf = $init; $($await)?)? + }); + }; + (#[$test:meta] $($async:ident)? { $(let mut $buf:ident = $init:expr; $($await:tt)?)? }) => { mod read_bytes_until { use super::input_from_bytes; // Use Bytes for printing bytes as strings for ASCII range @@ -954,8 +959,8 @@ mod test { use pretty_assertions::assert_eq; /// Checks that search in the empty buffer returns `None` - #[test] - fn empty() { + #[$test] + $($async)? fn empty() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"".as_ref()); @@ -964,6 +969,7 @@ mod test { assert_eq!( input .read_bytes_until(b'*', $(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(Bytes), None @@ -973,8 +979,8 @@ mod test { /// Checks that search in the buffer non-existent value returns entire buffer /// as a result and set `position` to `len()` - #[test] - fn non_existent() { + #[$test] + $($async)? fn non_existent() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"abcdef".as_ref()); @@ -983,6 +989,7 @@ mod test { assert_eq!( input .read_bytes_until(b'*', $(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(Bytes), Some(Bytes(b"abcdef")) @@ -993,8 +1000,8 @@ mod test { /// Checks that search in the buffer an element that is located in the front of /// buffer returns empty slice as a result and set `position` to one symbol /// after match (`1`) - #[test] - fn at_the_start() { + #[$test] + $($async)? 
fn at_the_start() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"*abcdef".as_ref()); @@ -1003,6 +1010,7 @@ mod test { assert_eq!( input .read_bytes_until(b'*', $(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(Bytes), Some(Bytes(b"")) @@ -1013,8 +1021,8 @@ mod test { /// Checks that search in the buffer an element that is located in the middle of /// buffer returns slice before that symbol as a result and set `position` to one /// symbol after match - #[test] - fn inside() { + #[$test] + $($async)? fn inside() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"abc*def".as_ref()); @@ -1023,6 +1031,7 @@ mod test { assert_eq!( input .read_bytes_until(b'*', $(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(Bytes), Some(Bytes(b"abc")) @@ -1033,8 +1042,8 @@ mod test { /// Checks that search in the buffer an element that is located in the end of /// buffer returns slice before that symbol as a result and set `position` to one /// symbol after match (`len()`) - #[test] - fn in_the_end() { + #[$test] + $($async)? fn in_the_end() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"abcdef*".as_ref()); @@ -1043,6 +1052,7 @@ mod test { assert_eq!( input .read_bytes_until(b'*', $(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(Bytes), Some(Bytes(b"abcdef")) @@ -1063,15 +1073,15 @@ mod test { /// Checks that if input begins like CDATA element, but CDATA start sequence /// is not finished, parsing ends with an error - #[test] + #[$test] #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"] - fn not_properly_start() { + $($async)? fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"![]]>other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? 
&mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1084,14 +1094,14 @@ mod test { /// Checks that if CDATA startup sequence was matched, but an end sequence /// is not found, parsing ends with an error - #[test] - fn not_closed() { + #[$test] + $($async)? fn not_closed() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"![CDATA[other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1103,8 +1113,8 @@ mod test { } /// Checks that CDATA element without content inside parsed successfully - #[test] - fn empty() { + #[$test] + $($async)? fn empty() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"![CDATA[]]>other content".as_ref()); @@ -1113,6 +1123,7 @@ mod test { assert_eq!( input .read_bang_element($(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::CData, Bytes(b"![CDATA["))) @@ -1123,8 +1134,8 @@ mod test { /// Checks that CDATA element with content parsed successfully. /// Additionally checks that sequences inside CDATA that may look like /// a CDATA end sequence do not interrupt CDATA parsing - #[test] - fn with_content() { + #[$test] + $($async)? fn with_content() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref()); @@ -1133,6 +1144,7 @@ mod test { assert_eq!( input .read_bang_element($(&mut $buf, )? &mut position) + $($(.$await)?)? 
.unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::CData, Bytes(b"![CDATA[cdata]] ]>content"))) @@ -1164,15 +1176,15 @@ mod test { use crate::utils::Bytes; use pretty_assertions::assert_eq; - #[test] + #[$test] #[ignore = "start comment sequence fully checked outside of `read_bang_element`"] - fn not_properly_start() { + $($async)? fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!- -->other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1183,14 +1195,14 @@ mod test { assert_eq!(position, 0); } - #[test] - fn not_properly_end() { + #[$test] + $($async)? fn not_properly_end() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!->other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1201,14 +1213,14 @@ mod test { assert_eq!(position, 0); } - #[test] - fn not_closed1() { + #[$test] + $($async)? fn not_closed1() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!--other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1219,14 +1231,14 @@ mod test { assert_eq!(position, 0); } - #[test] - fn not_closed2() { + #[$test] + $($async)? fn not_closed2() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!-->other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? 
&mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1237,14 +1249,14 @@ mod test { assert_eq!(position, 0); } - #[test] - fn not_closed3() { + #[$test] + $($async)? fn not_closed3() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!--->other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1255,8 +1267,8 @@ mod test { assert_eq!(position, 0); } - #[test] - fn empty() { + #[$test] + $($async)? fn empty() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!---->other content".as_ref()); @@ -1265,6 +1277,7 @@ mod test { assert_eq!( input .read_bang_element($(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::Comment, Bytes(b"!----"))) @@ -1272,8 +1285,8 @@ mod test { assert_eq!(position, 6); } - #[test] - fn with_content() { + #[$test] + $($async)? fn with_content() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!--->comment<--->other content".as_ref()); @@ -1282,6 +1295,7 @@ mod test { assert_eq!( input .read_bang_element($(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::Comment, Bytes(b"!--->comment<---"))) @@ -1300,14 +1314,14 @@ mod test { use crate::utils::Bytes; use pretty_assertions::assert_eq; - #[test] - fn not_properly_start() { + #[$test] + $($async)? fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!D other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? 
&mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1318,14 +1332,14 @@ mod test { assert_eq!(position, 0); } - #[test] - fn without_space() { + #[$test] + $($async)? fn without_space() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!DOCTYPEother content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1336,8 +1350,8 @@ mod test { assert_eq!(position, 0); } - #[test] - fn empty() { + #[$test] + $($async)? fn empty() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!DOCTYPE>other content".as_ref()); @@ -1346,6 +1360,7 @@ mod test { assert_eq!( input .read_bang_element($(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::DocType, Bytes(b"!DOCTYPE"))) @@ -1353,14 +1368,14 @@ mod test { assert_eq!(position, 9); } - #[test] - fn not_closed() { + #[$test] + $($async)? fn not_closed() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!DOCTYPE other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1379,14 +1394,14 @@ mod test { use crate::utils::Bytes; use pretty_assertions::assert_eq; - #[test] - fn not_properly_start() { + #[$test] + $($async)? fn not_properly_start() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!d other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? 
{ Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1397,14 +1412,14 @@ mod test { assert_eq!(position, 0); } - #[test] - fn without_space() { + #[$test] + $($async)? fn without_space() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!doctypeother content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1415,8 +1430,8 @@ mod test { assert_eq!(position, 0); } - #[test] - fn empty() { + #[$test] + $($async)? fn empty() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!doctype>other content".as_ref()); @@ -1425,6 +1440,7 @@ mod test { assert_eq!( input .read_bang_element($(&mut $buf, )? &mut position) + $($(.$await)?)? .unwrap() .map(|(ty, data)| (ty, Bytes(data))), Some((BangType::DocType, Bytes(b"!doctype"))) @@ -1432,14 +1448,14 @@ mod test { assert_eq!(position, 9); } - #[test] - fn not_closed() { + #[$test] + $($async)? fn not_closed() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"!doctype other content".as_ref()); // ^= 0 - match input.read_bang_element($(&mut $buf, )? &mut position) { + match input.read_bang_element($(&mut $buf, )? &mut position)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1459,14 +1475,14 @@ mod test { use pretty_assertions::assert_eq; /// Checks that nothing was read from empty buffer - #[test] - fn empty() { + #[$test] + $($async)? fn empty() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"".as_ref()); // ^= 0 - assert_eq!(input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), None); + assert_eq!(input.read_element($(&mut $buf, )? 
&mut position)$($(.$await)?)?.unwrap().map(Bytes), None); assert_eq!(position, 0); } @@ -1475,71 +1491,71 @@ mod test { use crate::utils::Bytes; use pretty_assertions::assert_eq; - #[test] - fn empty_tag() { + #[$test] + $($async)? fn empty_tag() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b">".as_ref()); // ^= 1 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(b"")) ); assert_eq!(position, 1); } - #[test] - fn normal() { + #[$test] + $($async)? fn normal() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"tag>".as_ref()); // ^= 4 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(b"tag")) ); assert_eq!(position, 4); } - #[test] - fn empty_ns_empty_tag() { + #[$test] + $($async)? fn empty_ns_empty_tag() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b":>".as_ref()); // ^= 2 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(b":")) ); assert_eq!(position, 2); } - #[test] - fn empty_ns() { + #[$test] + $($async)? fn empty_ns() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b":tag>".as_ref()); // ^= 5 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(b":tag")) ); assert_eq!(position, 5); } - #[test] - fn with_attributes() { + #[$test] + $($async)? fn with_attributes() { $(let mut $buf = $init;)? 
let mut position = 0; let mut input = input_from_bytes(br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref()); // ^= 38 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)) ); assert_eq!(position, 38); @@ -1551,71 +1567,71 @@ mod test { use crate::utils::Bytes; use pretty_assertions::assert_eq; - #[test] - fn empty_tag() { + #[$test] + $($async)? fn empty_tag() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"/>".as_ref()); // ^= 2 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(b"/")) ); assert_eq!(position, 2); } - #[test] - fn normal() { + #[$test] + $($async)? fn normal() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b"tag/>".as_ref()); // ^= 5 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(b"tag/")) ); assert_eq!(position, 5); } - #[test] - fn empty_ns_empty_tag() { + #[$test] + $($async)? fn empty_ns_empty_tag() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b":/>".as_ref()); // ^= 3 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(b":/")) ); assert_eq!(position, 3); } - #[test] - fn empty_ns() { + #[$test] + $($async)? fn empty_ns() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(b":tag/>".as_ref()); // ^= 6 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? 
&mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(b":tag/")) ); assert_eq!(position, 6); } - #[test] - fn with_attributes() { + #[$test] + $($async)? fn with_attributes() { $(let mut $buf = $init;)? let mut position = 0; let mut input = input_from_bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref()); // ^= 41 assert_eq!( - input.read_element($(&mut $buf, )? &mut position).unwrap().map(Bytes), + input.read_element($(&mut $buf, )? &mut position)$($(.$await)?)?.unwrap().map(Bytes), Some(Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)) ); assert_eq!(position, 41); @@ -1627,13 +1643,13 @@ mod test { use super::reader_from_str; use crate::errors::Error; - #[test] - fn cdata() { + #[$test] + $($async)? fn cdata() { let doc = "![]]>"; let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? - match reader.read_until_close($(&mut $buf)?) { + match reader.read_until_close($(&mut $buf)?)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "CData" => {} x => assert!( false, @@ -1643,13 +1659,13 @@ mod test { } } - #[test] - fn comment() { + #[$test] + $($async)? fn comment() { let doc = "!- -->"; let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? - match reader.read_until_close($(&mut $buf)?) { + match reader.read_until_close($(&mut $buf)?)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "Comment" => {} x => assert!( false, @@ -1659,13 +1675,13 @@ mod test { } } - #[test] - fn doctype_uppercase() { + #[$test] + $($async)? fn doctype_uppercase() { let doc = "!D>"; let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? - match reader.read_until_close($(&mut $buf)?) { + match reader.read_until_close($(&mut $buf)?)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1675,13 +1691,13 @@ mod test { } } - #[test] - fn doctype_lowercase() { + #[$test] + $($async)? fn doctype_lowercase() { let doc = "!d>"; let mut reader = reader_from_str(doc); $(let mut $buf = $init;)? 
- match reader.read_until_close($(&mut $buf)?) { + match reader.read_until_close($(&mut $buf)?)$($(.$await)?)? { Err(Error::UnexpectedEof(s)) if s == "DOCTYPE" => {} x => assert!( false, @@ -1698,63 +1714,63 @@ mod test { use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event}; use pretty_assertions::assert_eq; - #[test] - fn start_text() { + #[$test] + $($async)? fn start_text() { let mut reader = reader_from_str("bom"); $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::StartText(BytesText::from_escaped_str("bom").into()) ); } - #[test] - fn declaration() { + #[$test] + $($async)? fn declaration() { let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::Decl(BytesDecl::from_start(BytesStart::borrowed(b"xml ", 3))) ); } - #[test] - fn doctype() { + #[$test] + $($async)? fn doctype() { let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::DocType(BytesText::from_escaped_str("x")) ); } - #[test] - fn processing_instruction() { + #[$test] + $($async)? fn processing_instruction() { let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::PI(BytesText::from_escaped_str("xml-stylesheet")) ); } - #[test] - fn start() { + #[$test] + $($async)? fn start() { let mut reader = reader_from_str(""); $(let mut $buf = $init;)? 
assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::Start(BytesStart::borrowed_name(b"tag")) ); } - #[test] - fn end() { + #[$test] + $($async)? fn end() { let mut reader = reader_from_str(""); // Because we expect invalid XML, do not check that // the end name paired with the start name @@ -1762,68 +1778,68 @@ mod test { $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::End(BytesEnd::borrowed(b"tag")) ); } - #[test] - fn empty() { + #[$test] + $($async)? fn empty() { let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::Empty(BytesStart::borrowed_name(b"tag")) ); } /// Text event cannot be generated without preceding event of another type - #[test] - fn text() { + #[$test] + $($async)? fn text() { let mut reader = reader_from_str("text"); $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::Empty(BytesStart::borrowed_name(b"tag")) ); assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::Text(BytesText::from_escaped_str("text")) ); } - #[test] - fn cdata() { + #[$test] + $($async)? fn cdata() { let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::CData(BytesCData::from_str("")) ); } - #[test] - fn comment() { + #[$test] + $($async)? fn comment() { let mut reader = reader_from_str(""); $(let mut $buf = $init;)? 
assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::Comment(BytesText::from_escaped_str("")) ); } - #[test] - fn eof() { + #[$test] + $($async)? fn eof() { let mut reader = reader_from_str(""); $(let mut $buf = $init;)? assert_eq!( - reader.read_event_impl($(&mut $buf)?).unwrap(), + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::Eof ); } @@ -1841,35 +1857,35 @@ mod test { use pretty_assertions::assert_eq; /// Checks that encoding is detected by BOM and changed after XML declaration - #[test] - fn bom_detected() { + #[$test] + $($async)? fn bom_detected() { let mut reader = reader_from_bytes(b"\xFF\xFE"); $(let mut $buf = $init;)? assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($(&mut $buf)?).unwrap(); + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(); assert_eq!(reader.decoder().encoding(), UTF_16LE); - reader.read_event_impl($(&mut $buf)?).unwrap(); + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(); assert_eq!(reader.decoder().encoding(), WINDOWS_1251); - assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); + assert_eq!(reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::Eof); } /// Checks that encoding is changed by XML declaration, but only once - #[test] - fn xml_declaration() { + #[$test] + $($async)? fn xml_declaration() { let mut reader = reader_from_bytes(b""); $(let mut $buf = $init;)? 
assert_eq!(reader.decoder().encoding(), UTF_8); - reader.read_event_impl($(&mut $buf)?).unwrap(); + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(); assert_eq!(reader.decoder().encoding(), UTF_16LE); - reader.read_event_impl($(&mut $buf)?).unwrap(); + reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(); assert_eq!(reader.decoder().encoding(), UTF_16LE); - assert_eq!(reader.read_event_impl($(&mut $buf)?).unwrap(), Event::Eof); + assert_eq!(reader.read_event_impl($(&mut $buf)?)$($(.$await)?)?.unwrap(), Event::Eof); } } } From c9365350dd83116cd80b6ba86ee0a1c7d22fecd1 Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Thu, 14 Jul 2022 12:48:26 +0200 Subject: [PATCH 6/7] Implement async reader --- Cargo.toml | 17 + src/lib.rs | 2 + src/reader.rs | 4 + src/reader/async_reader.rs | 698 +++++++++++++++++++++++++++++++++++++ 4 files changed, 721 insertions(+) create mode 100644 src/reader/async_reader.rs diff --git a/Cargo.toml b/Cargo.toml index e13202e2..c88d0aaa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,8 @@ document-features = { version = "0.2", optional = true } encoding_rs = { version = "0.8", optional = true } serde = { version = "1.0", optional = true } memchr = "2.5" +tokio = { version = "1.19", optional = true, default-features = false, features = ["io-util"] } +async-recursion = { version = "1.0", optional = true } [dev-dependencies] criterion = "0.3" @@ -23,6 +25,8 @@ pretty_assertions = "1.2" regex = "1" serde = { version = "1.0", features = ["derive"] } serde-value = "0.7" +tokio = { version = "1.20", default-features = false, features = ["macros", "rt-multi-thread"] } +tokio-test = "0.4" [lib] bench = false @@ -101,6 +105,19 @@ serialize = ["serde"] ## Enables support for recognizing all [HTML 5 entities](https://dev.w3.org/html5/html-author/charref) escape-html = [] +## Enables support for asynchronous reading from `tokio`'s IO-Traits. 
+## +## This can be used for example with `Reader::from_async_reader(read)` where `read` +## is some type implementing `tokio::io::AsyncBufRead`. +async = ["tokio", "async-recursion"] + +## Enables support for asynchronous reading from files using `tokio`. This feature +## also automatically enables the `async` feature as well. +## +## This can be used for example with `Reader::from_file_async(path)` where `path` +## is a file path. +async-fs = ["async", "tokio/fs"] + [package.metadata.docs.rs] all-features = true diff --git a/src/lib.rs b/src/lib.rs index 26436786..be8903c0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -158,5 +158,7 @@ mod writer; #[cfg(feature = "serialize")] pub use crate::errors::serialize::DeError; pub use crate::errors::{Error, Result}; +#[cfg(feature = "async")] +pub use crate::reader::AsyncReader; pub use crate::reader::{Decoder, IoReader, Reader, SliceReader}; pub use crate::writer::{ElementWriter, Writer}; diff --git a/src/reader.rs b/src/reader.rs index baf68b08..ce14a08d 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -13,9 +13,13 @@ use crate::name::{LocalName, NamespaceResolver, QName, ResolveResult}; use memchr; +#[cfg(feature = "async")] +mod async_reader; mod io_reader; mod slice_reader; +#[cfg(feature = "async")] +pub use self::async_reader::AsyncReader; pub use self::io_reader::IoReader; pub use self::slice_reader::SliceReader; diff --git a/src/reader/async_reader.rs b/src/reader/async_reader.rs new file mode 100644 index 00000000..30ea2580 --- /dev/null +++ b/src/reader/async_reader.rs @@ -0,0 +1,698 @@ +//! This is an implementation of [`Reader`] for reading from a [`AsyncRead`] or [`AsyncBufRead`] +//! as underlying byte stream. This reader fully implements async/await so reading can use +//! non-blocking I/O. 
+ +use std::ops::{Deref, DerefMut}; +use std::path::Path; + +use async_recursion::async_recursion; +#[cfg(feature = "async-fs")] +use tokio::fs::File; +use tokio::io::{self, AsyncBufRead, AsyncBufReadExt, AsyncRead, BufReader}; + +use crate::events::{BytesText, Event}; +use crate::name::{QName, ResolveResult}; +use crate::{Error, Result}; + +#[cfg(feature = "encoding")] +use super::{detect_encoding, EncodingRef}; +use super::{is_whitespace, BangType, InnerReader, ReadElementState, Reader, TagState}; + +/// A struct for handling reading functions based on reading from an [`AsyncBufRead`]. +#[derive(Debug, Clone)] +pub struct AsyncReader(R); + +impl Deref for AsyncReader { + type Target = R; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for AsyncReader { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} + +impl InnerReader for AsyncReader { + type Reader = R; + + fn into_inner(self) -> Self::Reader { + self.0 + } +} + +/// Private reading functions. 
+impl AsyncReader { + #[inline] + async fn read_bytes_until<'buf>( + &mut self, + byte: u8, + buf: &'buf mut Vec, + position: &mut usize, + ) -> Result> { + let mut read = 0; + let mut done = false; + let start = buf.len(); + while !done { + let used = { + let available = match self.fill_buf().await { + Ok(n) if n.is_empty() => break, + Ok(n) => n, + Err(e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + + match memchr::memchr(byte, available) { + Some(i) => { + buf.extend_from_slice(&available[..i]); + done = true; + i + 1 + } + None => { + buf.extend_from_slice(available); + available.len() + } + } + }; + self.consume(used); + read += used; + } + *position += read; + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + async fn read_bang_element<'buf>( + &mut self, + buf: &'buf mut Vec, + position: &mut usize, + ) -> Result> { + // Peeked one bang ('!') before being called, so it's guaranteed to + // start with it. 
+ let start = buf.len(); + let mut read = 1; + buf.push(b'!'); + self.consume(1); + + let bang_type = BangType::new(self.peek_one().await?)?; + + loop { + match self.fill_buf().await { + // Note: Do not update position, so the error points to + // somewhere sane rather than at the EOF + Ok(n) if n.is_empty() => return Err(bang_type.to_err()), + Ok(available) => { + if let Some((consumed, used)) = bang_type.parse(available, read) { + buf.extend_from_slice(consumed); + + self.consume(used); + read += used; + + *position += read; + break; + } else { + buf.extend_from_slice(available); + + let used = available.len(); + self.consume(used); + read += used; + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + } + } + + if read == 0 { + Ok(None) + } else { + Ok(Some((bang_type, &buf[start..]))) + } + } + + #[inline] + async fn read_element<'buf>( + &mut self, + buf: &'buf mut Vec, + position: &mut usize, + ) -> Result> { + let mut state = ReadElementState::Elem; + let mut read = 0; + + let start = buf.len(); + loop { + match self.fill_buf().await { + Ok(n) if n.is_empty() => break, + Ok(available) => { + if let Some((consumed, used)) = state.change(available) { + buf.extend_from_slice(consumed); + + self.consume(used); + read += used; + + *position += read; + break; + } else { + buf.extend_from_slice(available); + + let used = available.len(); + self.consume(used); + read += used; + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => { + *position += read; + return Err(Error::Io(e)); + } + }; + } + + if read == 0 { + Ok(None) + } else { + Ok(Some(&buf[start..])) + } + } + + /// Consume and discard all the whitespace until the next non-whitespace + /// character or EOF. 
+ async fn skip_whitespace(&mut self, position: &mut usize) -> Result<()> { + loop { + break match self.fill_buf().await { + Ok(n) => { + let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len()); + if count > 0 { + self.consume(count); + *position += count; + continue; + } else { + Ok(()) + } + } + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } + + /// Consume and discard one character if it matches the given byte. Return + /// true if it matched. + async fn skip_one(&mut self, byte: u8, position: &mut usize) -> Result { + match self.peek_one().await? { + Some(b) if b == byte => { + *position += 1; + self.consume(1); + Ok(true) + } + _ => Ok(false), + } + } + + /// Return one character without consuming it, so that future `read_*` calls + /// will still include it. On EOF, return None. + async fn peek_one(&mut self) -> Result> { + loop { + break match self.fill_buf().await { + Ok(n) if n.is_empty() => Ok(None), + Ok(n) => Ok(Some(n[0])), + Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue, + Err(e) => Err(Error::Io(e)), + }; + } + } +} + +/// Private functions for a [`Reader`] based on an [`AsyncReader`]. +impl Reader> { + /// Read text into the given buffer, and return an event that borrows from + /// either that buffer or from the input itself, based on the type of the + /// reader. + #[async_recursion] + async fn read_event_impl<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + let event = match self.tag_state { + TagState::Init => self.read_until_open(buf, true).await, + TagState::Closed => self.read_until_open(buf, false).await, + TagState::Opened => self.read_until_close(buf).await, + TagState::Empty => self.close_expanded_empty(), + TagState::Exit => return Ok(Event::Eof), + }; + match event { + Err(_) | Ok(Event::Eof) => self.tag_state = TagState::Exit, + _ => {} + } + event + } + + /// Read until '<' is found and moves reader to an `Opened` state. 
+ /// + /// Return a `StartText` event if `first` is `true` and a `Text` event otherwise + async fn read_until_open<'buf>( + &mut self, + buf: &'buf mut Vec, + first: bool, + ) -> Result> { + self.tag_state = TagState::Opened; + + if self.trim_text_start { + self.reader.skip_whitespace(&mut self.buf_position).await?; + } + + // If we already at the `<` symbol, do not try to return an empty Text event + if self.reader.skip_one(b'<', &mut self.buf_position).await? { + return self.read_event_impl(buf).await; + } + + match self + .reader + .read_bytes_until(b'<', buf, &mut self.buf_position) + .await + { + Ok(Some(bytes)) => { + #[cfg(feature = "encoding")] + if first && self.encoding.can_be_refined() { + if let Some(encoding) = detect_encoding(bytes) { + self.encoding = EncodingRef::BomDetected(encoding); + } + } + + let content = if self.trim_text_end { + // Skip the ending '< + let len = bytes + .iter() + .rposition(|&b| !is_whitespace(b)) + .map_or_else(|| bytes.len(), |p| p + 1); + &bytes[..len] + } else { + bytes + }; + + Ok(if first { + Event::StartText(BytesText::from_escaped(content).into()) + } else { + Event::Text(BytesText::from_escaped(content)) + }) + } + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } + + /// Private function to read until `>` is found. This function expects that + /// it was called just after encounter a `<` symbol. 
+ async fn read_until_close<'buf>(&mut self, buf: &'buf mut Vec) -> Result> { + self.tag_state = TagState::Closed; + + match self.reader.peek_one().await { + // ` match self + .reader + .read_bang_element(buf, &mut self.buf_position) + .await + { + Ok(None) => Ok(Event::Eof), + Ok(Some((bang_type, bytes))) => self.read_bang(bang_type, bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + .await + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_end(bytes), + Err(e) => Err(e), + }, + // ` match self + .reader + .read_bytes_until(b'>', buf, &mut self.buf_position) + .await + { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_question_mark(bytes), + Err(e) => Err(e), + }, + // `<...` - opening or self-closed tag + Ok(Some(_)) => match self.reader.read_element(buf, &mut self.buf_position).await { + Ok(None) => Ok(Event::Eof), + Ok(Some(bytes)) => self.read_start(bytes), + Err(e) => Err(e), + }, + Ok(None) => Ok(Event::Eof), + Err(e) => Err(e), + } + } +} + +/// Builder for reading from a file. Gated behind the `async-fs` feature. +#[cfg(feature = "async-fs")] +impl Reader>> { + /// Creates an XML reader from a file path. + pub async fn from_file_async>(path: P) -> Result { + let file = File::open(path).await.map_err(Error::Io)?; + let reader = BufReader::new(file); + Ok(Self::from_reader_internal(AsyncReader(reader))) + } +} + +/// Builder for reading from any [`BufRead`]. +impl Reader> { + /// Creates an XML reader from any type implementing [`Read`]. + pub fn from_async_reader(reader: R) -> Self { + Self::from_reader_internal(AsyncReader(reader)) + } +} + +/// Builder for reading from any [`Read`]. +impl Reader>> { + /// Creates an XML reader from any type implementing [`Read`]. 
+ pub fn from_async_unbuffered_reader(reader: R) -> Self { + Self::from_reader_internal(AsyncReader(BufReader::new(reader))) + } +} + +/// Public reading methods for a [`Reader`] based on an [`AsyncReader`]. +impl Reader> { + /// Reads the next `Event` asynchronously. + /// + /// This is the main entry point for reading XML `Event`s when using an async reader. + /// + /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow` + /// internally). + /// + /// Having the possibility to control the internal buffers gives you some additional benefits + /// such as: + /// + /// - Reduce the number of allocations by reusing the same buffer. For constrained systems, + /// you can call `buf.clear()` once you are done with processing the event (typically at the + /// end of your loop). + /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`). + /// + /// # Examples + /// + /// ``` + /// # tokio_test::block_on(async move { + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// // This explicitly uses `from_reader(xml.as_bytes())` to use a buffered reader instead of + /// // relying on the zero-copy optimizations for reading from byte slices. 
+ /// let mut reader = Reader::from_async_reader(xml.as_bytes()); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_event_into_async(&mut buf).await { + /// Ok(Event::Start(_)) => count += 1, + /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok(Event::Eof) => break, + /// _ => (), + /// } + /// buf.clear(); + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// # }); + /// ``` + #[inline] + pub async fn read_event_into_async<'buf>( + &mut self, + buf: &'buf mut Vec, + ) -> Result> { + self.read_event_impl(buf).await + } + + /// Reads asynchronously until end element is found using provided buffer as + /// intermediate storage for events content. This function is supposed to be + /// called after you already read a [`Start`] event. + /// + /// Manages nested cases where parent and child elements have the same name. + /// + /// If corresponding [`End`] event will not be found, the [`Error::UnexpectedEof`] + /// will be returned. In particularly, that error will be returned if you call + /// this method without consuming the corresponding [`Start`] event first. + /// + /// If your reader created from a string slice or byte array slice, it is + /// better to use [`read_to_end()`] method, because it will not copy bytes + /// into intermediate buffer. + /// + /// The provided `buf` buffer will be filled only by one event content at time. + /// Before reading of each event the buffer will be cleared. If you know an + /// appropriate size of each event, you can preallocate the buffer to reduce + /// number of reallocations. + /// + /// The `end` parameter should contain name of the end element _in the reader + /// encoding_. 
It is good practice to always get that parameter using + /// [`BytesStart::to_end()`] method. + /// + /// The correctness of the skipped events does not checked, if you disabled + /// the [`check_end_names`] option. + /// + /// # Namespaces + /// + /// While the [`Reader`] does not support namespace resolution, namespaces + /// does not change the algorithm for comparing names. Although the names + /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the + /// same namespace, are semantically equivalent, `` cannot close + /// ``, because according to [the specification] + /// + /// > The end of every element that begins with a **start-tag** MUST be marked + /// > by an **end-tag** containing a name that echoes the element's type as + /// > given in the **start-tag** + /// + /// # Examples + /// + /// This example shows, how you can skip XML content after you read the + /// start event. + /// + /// ``` + /// # tokio_test::block_on(async move { + /// # use pretty_assertions::assert_eq; + /// use quick_xml::events::{BytesStart, Event}; + /// use quick_xml::Reader; + /// + /// let mut reader = Reader::from_async_reader(r#" + /// + /// + /// + /// + /// + /// + /// + /// + /// "#.as_bytes()); + /// reader.trim_text(true); + /// let mut buf = Vec::new(); + /// + /// let start = BytesStart::borrowed_name(b"outer"); + /// let end = start.to_end().into_owned(); + /// + /// // First, we read a start event... + /// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Start(start)); + /// + /// //...then, we could skip all events to the corresponding end event. + /// // This call will correctly handle nested elements. + /// // Note, however, that this method does not handle namespaces. 
+ /// reader.read_to_end_into_async(end.name(), &mut buf).await.unwrap(); + /// + /// // At the end we should get an Eof event, because we ate the whole XML + /// assert_eq!(reader.read_event_into_async(&mut buf).await.unwrap(), Event::Eof); + /// # }); + /// ``` + /// + /// [`Start`]: Event::Start + /// [`End`]: Event::End + /// [`read_to_end()`]: Self::read_to_end + /// [`check_end_names`]: Self::check_end_names + /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag + pub async fn read_to_end_into_async<'_self, 'buf, 'name>( + &'_self mut self, + end: QName<'name>, + buf: &'buf mut Vec, + ) -> Result<()> { + let mut depth = 0; + loop { + buf.clear(); + match self.read_event_into_async(buf).await { + Err(e) => return Err(e), + + Ok(Event::Start(e)) if e.name() == end => depth += 1, + Ok(Event::End(e)) if e.name() == end => { + if depth == 0 { + return Ok(()); + } + depth -= 1; + } + Ok(Event::Eof) => { + let name = self.decoder().decode(end.as_ref()); + return Err(Error::UnexpectedEof(format!("", name))); + } + _ => (), + } + } + } + + /// Reads optional text between start and end tags asynchronously. + /// + /// If the next event is a [`Text`] event, returns the decoded and unescaped content as a + /// `String`. If the next event is an [`End`] event, returns the empty string. In all other + /// cases, returns an error. + /// + /// Any text will be decoded using the XML encoding specified in the XML declaration (or UTF-8 + /// if none is specified). 
+ /// + /// # Examples + /// + /// ``` + /// # tokio_test::block_on(async move { + /// # use pretty_assertions::assert_eq; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// + /// let mut xml = Reader::from_async_reader(b" + /// <b> + /// + /// " as &[u8]); + /// xml.trim_text(true); + /// let mut buf = Vec::new(); + /// + /// let expected = ["", ""]; + /// for &content in expected.iter() { + /// match xml.read_event_into_async(&mut buf).await { + /// Ok(Event::Start(ref e)) => { + /// assert_eq!(&xml.read_text_into_async(e.name(), &mut Vec::new()).await.unwrap(), content); + /// }, + /// e => panic!("Expecting Start event, found {:?}", e), + /// } + /// buf.clear(); + /// } + /// # }); + /// ``` + /// + /// [`Text`]: Event::Text + /// [`End`]: Event::End + pub async fn read_text_into_async<'_self, 'name, 'buf>( + &'_self mut self, + end: QName<'name>, + buf: &'buf mut Vec, + ) -> Result { + let s = match self.read_event_into_async(buf).await { + Err(e) => return Err(e), + + Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), + Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), + Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), + _ => return Err(Error::TextNotFound), + }; + self.read_to_end_into_async(end, buf).await?; + Ok(s) + } + + /// Reads the next event and resolves its namespace (if applicable) asynchronously. 
+ /// + /// # Examples + /// + /// ``` + /// # tokio_test::block_on(async move { + /// use std::str::from_utf8; + /// use quick_xml::Reader; + /// use quick_xml::events::Event; + /// use quick_xml::name::ResolveResult::*; + /// + /// let xml = r#" + /// Test + /// Test 2 + /// "#; + /// let mut reader = Reader::from_async_reader(xml.as_bytes()); + /// reader.trim_text(true); + /// let mut count = 0; + /// let mut buf = Vec::new(); + /// let mut ns_buf = Vec::new(); + /// let mut txt = Vec::new(); + /// loop { + /// match reader.read_namespaced_event_async(&mut buf, &mut ns_buf).await { + /// Ok((Bound(ns), Event::Start(e))) => { + /// count += 1; + /// match (ns.as_ref(), e.local_name().as_ref()) { + /// (b"www.xxxx", b"tag1") => (), + /// (b"www.yyyy", b"tag2") => (), + /// (ns, n) => panic!("Namespace and local name mismatch"), + /// } + /// println!("Resolved namespace: {:?}", ns); + /// } + /// Ok((Unbound, Event::Start(_))) => { + /// panic!("Element not in any namespace") + /// }, + /// Ok((Unknown(p), Event::Start(_))) => { + /// panic!("Undeclared namespace prefix {:?}", String::from_utf8(p)) + /// } + /// Ok((_, Event::Text(e))) => { + /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// }, + /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), + /// Ok((_, Event::Eof)) => break, + /// _ => (), + /// } + /// buf.clear(); + /// } + /// println!("Found {} start events", count); + /// println!("Text events: {:?}", txt); + /// # }); + /// ``` + pub async fn read_namespaced_event_async<'b, 'ns>( + &mut self, + buf: &'b mut Vec, + namespace_buffer: &'ns mut Vec, + ) -> Result<(ResolveResult<'ns>, Event<'b>)> { + if self.pending_pop { + self.ns_resolver.pop(namespace_buffer); + } + self.pending_pop = false; + let event = self.read_event_into_async(buf).await; + self.resolve_namespaced_event_inner(event, namespace_buffer) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::reader::test::check; + + fn 
input_from_bytes(bytes: &[u8]) -> AsyncReader<&[u8]> { + AsyncReader(bytes) + } + + fn reader_from_str(s: &str) -> Reader> { + Reader::from_async_reader(s.as_bytes()) + } + + #[allow(dead_code)] + fn reader_from_bytes(s: &[u8]) -> Reader> { + Reader::from_async_reader(s) + } + + check!(#[tokio::test] async { + let mut buf = Vec::new(); await + }); +} From a1c9a0dcddab3f91b6adf35b9e37da0f1ab87d94 Mon Sep 17 00:00:00 2001 From: Sophie Tauchert Date: Wed, 13 Jul 2022 17:14:19 +0200 Subject: [PATCH 7/7] Add test for async --- Cargo.toml | 4 ++++ tests/async_test.rs | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100644 tests/async_test.rs diff --git a/Cargo.toml b/Cargo.toml index c88d0aaa..46e66ea5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -136,3 +136,7 @@ required-features = ["serialize"] [[test]] name = "serde-migrated" required-features = ["serialize"] + +[[test]] +name = "async_test" +required-features = ["async"] diff --git a/tests/async_test.rs b/tests/async_test.rs new file mode 100644 index 00000000..031bb145 --- /dev/null +++ b/tests/async_test.rs @@ -0,0 +1,43 @@ +use std::path::PathBuf; + +use quick_xml::events::Event::*; +use quick_xml::Reader; + +#[tokio::test] +async fn test_sample() { + let src: &[u8] = include_bytes!("documents/sample_rss.xml"); + let mut reader = Reader::from_async_reader(src); + let mut buf = Vec::new(); + let mut count = 0; + loop { + match reader.read_event_into_async(&mut buf).await.unwrap() { + Start(_) => count += 1, + Decl(e) => println!("{:?}", e.version()), + Eof => break, + _ => (), + } + buf.clear(); + } + println!("{}", count); +} + +#[cfg(feature = "async-fs")] +#[tokio::test] +async fn test_read_file() { + let path = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + let mut reader = Reader::from_file_async(path.join("tests/documents/sample_rss.xml")) + .await + .unwrap(); + let mut buf = Vec::new(); + let mut count = 0; + loop { + match reader.read_event_into_async(&mut 
buf).await.unwrap() { + Start(_) => count += 1, + Decl(e) => println!("{:?}", e.version()), + Eof => break, + _ => (), + } + buf.clear(); + } + println!("{}", count); +}