From 5467b6c500920d7d250b7d5698c8fb18a260de81 Mon Sep 17 00:00:00 2001 From: Mingun Date: Sun, 17 Jul 2022 17:10:39 +0500 Subject: [PATCH] Use internal decoder of event instead of supplied one in parameters --- Changelog.md | 2 + README.md | 2 +- benches/macrobenches.rs | 2 +- benches/microbenches.rs | 2 +- examples/custom_entities.rs | 4 +- src/de/mod.rs | 9 ++-- src/events/mod.rs | 77 +++++++++++------------------------ src/reader/buffered_reader.rs | 4 +- src/reader/mod.rs | 2 +- src/reader/ns_reader.rs | 8 ++-- tests/test.rs | 4 +- tests/unit_tests.rs | 12 +++--- 12 files changed, 48 insertions(+), 80 deletions(-) diff --git a/Changelog.md b/Changelog.md index e7faa223..f3e879f0 100644 --- a/Changelog.md +++ b/Changelog.md @@ -156,6 +156,8 @@ because writer anyway works in UTF-8 only - [#428]: Changed the event and `Attributes` constructors to accept a `&str` slices instead of `&[u8]` slices. Handmade events has always been assumed to store their content UTF-8 encoded. +- [#428]: Removed `Decoder` parameter from `_and_decode` versions of functions for + `BytesText` (remember, that those functions was renamed in #415). ### New Tests diff --git a/README.md b/README.md index 662ae0ce..12451f32 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ loop { _ => (), } } - Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), + Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), // There are several other `Event`s we do not consider here _ => (), diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 3358f3a4..72a1c0b5 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -28,7 +28,7 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> { } } Event::Text(e) => { - criterion::black_box(e.decode_and_unescape(&r)?); + criterion::black_box(e.unescape()?); } Event::CData(e) => { criterion::black_box(e.into_inner()); diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 1e7cc232..95568224 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -174,7 +174,7 @@ fn one_event(c: &mut Criterion) { .check_comments(false) .trim_text(true); match r.read_event_into(&mut buf) { - Ok(Event::Comment(e)) => nbtxt += e.decode_and_unescape(&r).unwrap().len(), + Ok(Event::Comment(e)) => nbtxt += e.unescape().unwrap().len(), something_else => panic!("Did not expect {:?}", something_else), }; diff --git a/examples/custom_entities.rs b/examples/custom_entities.rs index 3c31d4d1..16f03482 100644 --- a/examples/custom_entities.rs +++ b/examples/custom_entities.rs @@ -60,9 +60,7 @@ fn main() -> Result<(), Box> { Ok(Event::Text(ref e)) => { println!( "text value: {}", - e.decode_and_unescape_with(&reader, |ent| custom_entities - .get(ent) - .map(|s| s.as_str())) + e.unescape_with(|ent| custom_entities.get(ent).map(|s| s.as_str())) .unwrap() ); } diff --git a/src/de/mod.rs b/src/de/mod.rs index d1e95665..31b717c3 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -612,16 +612,15 @@ where unescape: bool, allow_start: bool, ) -> Result, DeError> { - let decoder = self.reader.decoder(); match self.next()? { - DeEvent::Text(e) => Ok(e.decode(decoder, unescape)?), - DeEvent::CData(e) => Ok(e.decode(decoder)?), + DeEvent::Text(e) => Ok(e.decode(unescape)?), + DeEvent::CData(e) => Ok(e.decode()?), DeEvent::Start(e) if allow_start => { // allow one nested level let inner = self.next()?; let t = match inner { - DeEvent::Text(t) => t.decode(decoder, unescape)?, - DeEvent::CData(t) => t.decode(decoder)?, + DeEvent::Text(t) => t.decode(unescape)?, + DeEvent::CData(t) => t.decode()?, DeEvent::Start(s) => { return Err(DeError::UnexpectedStart(s.name().as_ref().to_owned())) } diff --git a/src/events/mod.rs b/src/events/mod.rs index e6bdc17a..a3a67b24 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -29,6 +29,8 @@ //! //! See [`Writer`] for further information. //! +//! [`Reader::read_event_into`]: crate::reader::Reader::read_event_into +//! [`Reader`]: crate::reader::Reader //! [`Writer`]: crate::writer::Writer //! [`Event`]: crate::events::Event @@ -44,7 +46,7 @@ use std::str::from_utf8; use crate::errors::{Error, Result}; use crate::escape::{escape, partial_escape, unescape_with}; use crate::name::{LocalName, QName}; -use crate::reader::{Decoder, Reader}; +use crate::reader::Decoder; use crate::utils::write_cow_string; use attributes::{Attribute, Attributes}; @@ -84,9 +86,9 @@ impl<'a> BytesStartText<'a> { /// /// This method does not unescapes content, because no escape sequences can /// appeared in the BOM or in the text before the first tag. - pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result { + pub fn decode_with_bom_removal(&self) -> Result { //TODO: Fix lifetime issue - it should be possible to borrow string - let decoded = decoder.decode_with_bom_removal(&*self)?; + let decoded = self.content.decoder.decode_with_bom_removal(&*self)?; Ok(decoded.to_string()) } @@ -758,59 +760,23 @@ impl<'a> BytesText<'a> { } } - /// Decodes using UTF-8 then unescapes the content of the event. - /// - /// Searches for '&' into content and try to escape the coded character if possible - /// returns Malformed error with index within element if '&' is not followed by ';' - /// - /// See also [`unescape_with()`](Self::unescape_with) - /// - /// This method is available only if `encoding` feature is **not** enabled. - #[cfg(any(doc, not(feature = "encoding")))] - pub fn unescape(&self) -> Result> { - self.unescape_with(|_| None) - } - - /// Decodes using UTF-8 then unescapes the content of the event with custom entities. - /// - /// Searches for '&' into content and try to escape the coded character if possible - /// returns Malformed error with index within element if '&' is not followed by ';' - /// A fallback resolver for additional custom entities can be provided via `resolve_entity`. - /// - /// See also [`unescape()`](Self::unescape) - /// - /// This method is available only if `encoding` feature is **not** enabled. - #[cfg(any(doc, not(feature = "encoding")))] - pub fn unescape_with<'entity>( - &self, - resolve_entity: impl Fn(&str) -> Option<&'entity str>, - ) -> Result> { - // from_utf8 should never fail because content is always UTF-8 encoded - Ok(unescape_with(from_utf8(&self.content)?, resolve_entity)?) - } - /// Decodes then unescapes the content of the event. /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. - pub fn decode_and_unescape(&self, reader: &Reader) -> Result> { - self.decode_and_unescape_with(reader, |_| None) + pub fn unescape(&self) -> Result> { + self.unescape_with(|_| None) } /// Decodes then unescapes the content of the event with custom entities. /// /// This will allocate if the value contains any escape sequences or in /// non-UTF-8 encoding. - /// - /// # Pre-condition - /// - /// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs. - pub fn decode_and_unescape_with<'entity, B>( + pub fn unescape_with<'entity>( &self, - reader: &Reader, resolve_entity: impl Fn(&str) -> Option<&'entity str>, ) -> Result> { - let decoded = reader.decoder().decode(&*self)?; + let decoded = self.decoder.decode(&*self)?; match unescape_with(&decoded, resolve_entity)? { // Because result is borrowed, no replacements was done and we can use original string @@ -820,15 +786,15 @@ impl<'a> BytesText<'a> { } /// Gets content of this text buffer in the specified encoding and optionally - /// unescapes it. Unlike [`Self::decode_and_unescape`] & Co., the lifetime + /// unescapes it. Unlike [`Self::unescape`] & Co., the lifetime /// of the returned `Cow` is bound to the original buffer / input #[cfg(feature = "serialize")] - pub(crate) fn decode(&self, decoder: Decoder, unescape: bool) -> Result> { + pub(crate) fn decode(&self, unescape: bool) -> Result> { //TODO: too many copies, can be optimized let text = match &self.content { - Cow::Borrowed(bytes) => decoder.decode(bytes)?, + Cow::Borrowed(bytes) => self.decoder.decode(bytes)?, // Convert to owned, because otherwise Cow will be bound with wrong lifetime - Cow::Owned(bytes) => decoder.decode(bytes)?.into_owned().into(), + Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(), }; let text = if unescape { //FIXME: need to take into account entities defined in the document @@ -930,8 +896,8 @@ impl<'a> BytesCData<'a> { /// | `&` | `&` /// | `'` | `'` /// | `"` | `"` - pub fn escape(self, decoder: Decoder) -> Result> { - let decoded = self.decode(decoder)?; + pub fn escape(self) -> Result> { + let decoded = self.decode()?; Ok(BytesText::wrap( match escape(&decoded) { // Because result is borrowed, no replacements was done and we can use original content @@ -955,8 +921,8 @@ impl<'a> BytesCData<'a> { /// | `<` | `<` /// | `>` | `>` /// | `&` | `&` - pub fn partial_escape(self, decoder: Decoder) -> Result> { - let decoded = self.decode(decoder)?; + pub fn partial_escape(self) -> Result> { + let decoded = self.decode()?; Ok(BytesText::wrap( match partial_escape(&decoded) { // Because result is borrowed, no replacements was done and we can use original content @@ -968,11 +934,11 @@ impl<'a> BytesCData<'a> { } /// Gets content of this text buffer in the specified encoding - pub(crate) fn decode(&self, decoder: Decoder) -> Result> { + pub(crate) fn decode(&self) -> Result> { Ok(match &self.content { - Cow::Borrowed(bytes) => decoder.decode(bytes)?, + Cow::Borrowed(bytes) => self.decoder.decode(bytes)?, // Convert to owned, because otherwise Cow will be bound with wrong lifetime - Cow::Owned(bytes) => decoder.decode(bytes)?.into_owned().into(), + Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(), }) } } @@ -996,6 +962,8 @@ impl<'a> Deref for BytesCData<'a> { //////////////////////////////////////////////////////////////////////////////////////////////////// /// Event emitted by [`Reader::read_event_into`]. +/// +/// [`Reader::read_event_into`]: crate::reader::Reader::read_event_into #[derive(Clone, Debug, Eq, PartialEq)] pub enum Event<'a> { /// Text that appeared before the first opening tag or an [XML declaration]. @@ -1044,6 +1012,7 @@ pub enum Event<'a> { /// /// [XML declaration]: Event::Decl /// [std]: https://www.w3.org/TR/xml11/#NT-document + /// [`Reader`]: crate::reader::Reader /// [`Writer`]: crate::writer::Writer StartText(BytesStartText<'a>), /// Start tag (with attributes) ``. diff --git a/src/reader/buffered_reader.rs b/src/reader/buffered_reader.rs index a6aa95ec..384f71cd 100644 --- a/src/reader/buffered_reader.rs +++ b/src/reader/buffered_reader.rs @@ -48,7 +48,7 @@ impl Reader { /// loop { /// match reader.read_event_into(&mut buf) { /// Ok(Event::Start(ref e)) => count += 1, - /// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), + /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), /// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e), /// Ok(Event::Eof) => break, /// _ => (), @@ -207,7 +207,7 @@ impl Reader { let s = match self.read_event_into(buf) { Err(e) => return Err(e), - Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(), + Ok(Event::Text(e)) => e.unescape()?.into_owned(), Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()), Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())), _ => return Err(Error::TextNotFound), diff --git a/src/reader/mod.rs b/src/reader/mod.rs index fe33b4f4..8fd778b2 100644 --- a/src/reader/mod.rs +++ b/src/reader/mod.rs @@ -269,7 +269,7 @@ impl EncodingRef { /// _ => (), /// } /// } -/// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()), +/// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()), /// /// // There are several other `Event`s we do not consider here /// _ => (), diff --git a/src/reader/ns_reader.rs b/src/reader/ns_reader.rs index 48376fb0..7a02492d 100644 --- a/src/reader/ns_reader.rs +++ b/src/reader/ns_reader.rs @@ -329,7 +329,7 @@ impl NsReader { /// } /// } /// Event::Text(e) => { - /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// txt.push(e.unescape().unwrap().into_owned()) /// } /// Event::Eof => break, /// _ => (), @@ -388,7 +388,7 @@ impl NsReader { /// (_, Event::Start(_)) => unreachable!(), /// /// (_, Event::Text(e)) => { - /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// txt.push(e.unescape().unwrap().into_owned()) /// } /// (_, Event::Eof) => break, /// _ => (), @@ -566,7 +566,7 @@ impl<'i> NsReader<&'i [u8]> { /// } /// } /// Event::Text(e) => { - /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// txt.push(e.unescape().unwrap().into_owned()) /// } /// Event::Eof => break, /// _ => (), @@ -624,7 +624,7 @@ impl<'i> NsReader<&'i [u8]> { /// (_, Event::Start(_)) => unreachable!(), /// /// (_, Event::Text(e)) => { - /// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()) + /// txt.push(e.unescape().unwrap().into_owned()) /// } /// (_, Event::Eof) => break, /// _ => (), diff --git a/tests/test.rs b/tests/test.rs index 5ac9dae8..819f8ab6 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -98,7 +98,7 @@ fn test_koi8_r_encoding() { loop { match r.read_event() { Ok(Text(e)) => { - e.decode_and_unescape(&r).unwrap(); + e.unescape().unwrap(); } Ok(Eof) => break, _ => (), @@ -157,7 +157,7 @@ fn fuzz_101() { } } Ok(Text(e)) => { - if e.decode_and_unescape(&reader).is_err() { + if e.unescape().is_err() { break; } } diff --git a/tests/unit_tests.rs b/tests/unit_tests.rs index 24bcbf08..fdaad2d3 100644 --- a/tests/unit_tests.rs +++ b/tests/unit_tests.rs @@ -506,7 +506,7 @@ fn test_escaped_content() { "content unexpected: expecting '<test>', got '{:?}'", from_utf8(&*e) ); - match e.decode_and_unescape(&r) { + match e.unescape() { Ok(c) => assert_eq!(c, ""), Err(e) => panic!( "cannot escape content at position {}: {:?}", @@ -595,7 +595,7 @@ fn test_read_write_roundtrip_escape_text() -> Result<()> { match reader.read_event()? { Eof => break, Text(e) => { - let t = e.decode_and_unescape(&reader).unwrap(); + let t = e.unescape().unwrap(); assert!(writer .write_event(Text(BytesText::from_plain_str(&t))) .is_ok()); @@ -737,7 +737,7 @@ mod decode_with_bom_removal { loop { match reader.read_event() { - Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()), + Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()), Ok(Eof) => break, _ => (), } @@ -760,7 +760,7 @@ mod decode_with_bom_removal { loop { match reader.read_event() { - Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()), + Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()), Ok(Eof) => break, _ => (), } @@ -778,7 +778,7 @@ mod decode_with_bom_removal { loop { match reader.read_event() { - Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()), + Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()), Ok(Eof) => break, _ => (), } @@ -798,7 +798,7 @@ mod decode_with_bom_removal { loop { match reader.read_event() { - Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()), + Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()), Ok(Eof) => break, _ => (), }