Skip to content

Commit

Permalink
Use internal decoder of event instead of supplied one in parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
Mingun committed Jul 24, 2022
1 parent 4af1fc4 commit 5467b6c
Show file tree
Hide file tree
Showing 12 changed files with 48 additions and 80 deletions.
2 changes: 2 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@
because writer anyway works in UTF-8 only
- [#428]: Changed the event and `Attributes` constructors to accept a `&str` slices instead of `&[u8]` slices.
Handmade events has always been assumed to store their content UTF-8 encoded.
- [#428]: Removed `Decoder` parameter from `_and_decode` versions of functions for
`BytesText` (remember, that those functions was renamed in #415).

### New Tests

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ loop {
_ => (),
}
}
Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()),
Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),

// There are several other `Event`s we do not consider here
_ => (),
Expand Down
2 changes: 1 addition & 1 deletion benches/macrobenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> {
}
}
Event::Text(e) => {
criterion::black_box(e.decode_and_unescape(&r)?);
criterion::black_box(e.unescape()?);
}
Event::CData(e) => {
criterion::black_box(e.into_inner());
Expand Down
2 changes: 1 addition & 1 deletion benches/microbenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ fn one_event(c: &mut Criterion) {
.check_comments(false)
.trim_text(true);
match r.read_event_into(&mut buf) {
Ok(Event::Comment(e)) => nbtxt += e.decode_and_unescape(&r).unwrap().len(),
Ok(Event::Comment(e)) => nbtxt += e.unescape().unwrap().len(),
something_else => panic!("Did not expect {:?}", something_else),
};

Expand Down
4 changes: 1 addition & 3 deletions examples/custom_entities.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
Ok(Event::Text(ref e)) => {
println!(
"text value: {}",
e.decode_and_unescape_with(&reader, |ent| custom_entities
.get(ent)
.map(|s| s.as_str()))
e.unescape_with(|ent| custom_entities.get(ent).map(|s| s.as_str()))
.unwrap()
);
}
Expand Down
9 changes: 4 additions & 5 deletions src/de/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -612,16 +612,15 @@ where
unescape: bool,
allow_start: bool,
) -> Result<Cow<'de, str>, DeError> {
let decoder = self.reader.decoder();
match self.next()? {
DeEvent::Text(e) => Ok(e.decode(decoder, unescape)?),
DeEvent::CData(e) => Ok(e.decode(decoder)?),
DeEvent::Text(e) => Ok(e.decode(unescape)?),
DeEvent::CData(e) => Ok(e.decode()?),
DeEvent::Start(e) if allow_start => {
// allow one nested level
let inner = self.next()?;
let t = match inner {
DeEvent::Text(t) => t.decode(decoder, unescape)?,
DeEvent::CData(t) => t.decode(decoder)?,
DeEvent::Text(t) => t.decode(unescape)?,
DeEvent::CData(t) => t.decode()?,
DeEvent::Start(s) => {
return Err(DeError::UnexpectedStart(s.name().as_ref().to_owned()))
}
Expand Down
77 changes: 23 additions & 54 deletions src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
//!
//! See [`Writer`] for further information.
//!
//! [`Reader::read_event_into`]: crate::reader::Reader::read_event_into
//! [`Reader`]: crate::reader::Reader
//! [`Writer`]: crate::writer::Writer
//! [`Event`]: crate::events::Event

Expand All @@ -44,7 +46,7 @@ use std::str::from_utf8;
use crate::errors::{Error, Result};
use crate::escape::{escape, partial_escape, unescape_with};
use crate::name::{LocalName, QName};
use crate::reader::{Decoder, Reader};
use crate::reader::Decoder;
use crate::utils::write_cow_string;
use attributes::{Attribute, Attributes};

Expand Down Expand Up @@ -84,9 +86,9 @@ impl<'a> BytesStartText<'a> {
///
/// This method does not unescapes content, because no escape sequences can
/// appeared in the BOM or in the text before the first tag.
pub fn decode_with_bom_removal(&self, decoder: Decoder) -> Result<String> {
pub fn decode_with_bom_removal(&self) -> Result<String> {
//TODO: Fix lifetime issue - it should be possible to borrow string
let decoded = decoder.decode_with_bom_removal(&*self)?;
let decoded = self.content.decoder.decode_with_bom_removal(&*self)?;

Ok(decoded.to_string())
}
Expand Down Expand Up @@ -758,59 +760,23 @@ impl<'a> BytesText<'a> {
}
}

/// Decodes using UTF-8 then unescapes the content of the event.
///
/// Searches for '&' into content and try to escape the coded character if possible
/// returns Malformed error with index within element if '&' is not followed by ';'
///
/// See also [`unescape_with()`](Self::unescape_with)
///
/// This method is available only if `encoding` feature is **not** enabled.
#[cfg(any(doc, not(feature = "encoding")))]
pub fn unescape(&self) -> Result<Cow<str>> {
self.unescape_with(|_| None)
}

/// Decodes using UTF-8 then unescapes the content of the event with custom entities.
///
/// Searches for '&' into content and try to escape the coded character if possible
/// returns Malformed error with index within element if '&' is not followed by ';'
/// A fallback resolver for additional custom entities can be provided via `resolve_entity`.
///
/// See also [`unescape()`](Self::unescape)
///
/// This method is available only if `encoding` feature is **not** enabled.
#[cfg(any(doc, not(feature = "encoding")))]
pub fn unescape_with<'entity>(
&self,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> Result<Cow<str>> {
// from_utf8 should never fail because content is always UTF-8 encoded
Ok(unescape_with(from_utf8(&self.content)?, resolve_entity)?)
}

/// Decodes then unescapes the content of the event.
///
/// This will allocate if the value contains any escape sequences or in
/// non-UTF-8 encoding.
pub fn decode_and_unescape<B>(&self, reader: &Reader<B>) -> Result<Cow<str>> {
self.decode_and_unescape_with(reader, |_| None)
pub fn unescape(&self) -> Result<Cow<str>> {
self.unescape_with(|_| None)
}

/// Decodes then unescapes the content of the event with custom entities.
///
/// This will allocate if the value contains any escape sequences or in
/// non-UTF-8 encoding.
///
/// # Pre-condition
///
/// The implementation of `resolve_entity` is expected to operate over UTF-8 inputs.
pub fn decode_and_unescape_with<'entity, B>(
pub fn unescape_with<'entity>(
&self,
reader: &Reader<B>,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
) -> Result<Cow<str>> {
let decoded = reader.decoder().decode(&*self)?;
let decoded = self.decoder.decode(&*self)?;

match unescape_with(&decoded, resolve_entity)? {
// Because result is borrowed, no replacements was done and we can use original string
Expand All @@ -820,15 +786,15 @@ impl<'a> BytesText<'a> {
}

/// Gets content of this text buffer in the specified encoding and optionally
/// unescapes it. Unlike [`Self::decode_and_unescape`] & Co., the lifetime
/// unescapes it. Unlike [`Self::unescape`] & Co., the lifetime
/// of the returned `Cow` is bound to the original buffer / input
#[cfg(feature = "serialize")]
pub(crate) fn decode(&self, decoder: Decoder, unescape: bool) -> Result<Cow<'a, str>> {
pub(crate) fn decode(&self, unescape: bool) -> Result<Cow<'a, str>> {
//TODO: too many copies, can be optimized
let text = match &self.content {
Cow::Borrowed(bytes) => decoder.decode(bytes)?,
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Cow::Owned(bytes) => decoder.decode(bytes)?.into_owned().into(),
Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(),
};
let text = if unescape {
//FIXME: need to take into account entities defined in the document
Expand Down Expand Up @@ -930,8 +896,8 @@ impl<'a> BytesCData<'a> {
/// | `&` | `&amp;`
/// | `'` | `&apos;`
/// | `"` | `&quot;`
pub fn escape(self, decoder: Decoder) -> Result<BytesText<'a>> {
let decoded = self.decode(decoder)?;
pub fn escape(self) -> Result<BytesText<'a>> {
let decoded = self.decode()?;
Ok(BytesText::wrap(
match escape(&decoded) {
// Because result is borrowed, no replacements was done and we can use original content
Expand All @@ -955,8 +921,8 @@ impl<'a> BytesCData<'a> {
/// | `<` | `&lt;`
/// | `>` | `&gt;`
/// | `&` | `&amp;`
pub fn partial_escape(self, decoder: Decoder) -> Result<BytesText<'a>> {
let decoded = self.decode(decoder)?;
pub fn partial_escape(self) -> Result<BytesText<'a>> {
let decoded = self.decode()?;
Ok(BytesText::wrap(
match partial_escape(&decoded) {
// Because result is borrowed, no replacements was done and we can use original content
Expand All @@ -968,11 +934,11 @@ impl<'a> BytesCData<'a> {
}

/// Gets content of this text buffer in the specified encoding
pub(crate) fn decode(&self, decoder: Decoder) -> Result<Cow<'a, str>> {
pub(crate) fn decode(&self) -> Result<Cow<'a, str>> {
Ok(match &self.content {
Cow::Borrowed(bytes) => decoder.decode(bytes)?,
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
// Convert to owned, because otherwise Cow will be bound with wrong lifetime
Cow::Owned(bytes) => decoder.decode(bytes)?.into_owned().into(),
Cow::Owned(bytes) => self.decoder.decode(bytes)?.into_owned().into(),
})
}
}
Expand All @@ -996,6 +962,8 @@ impl<'a> Deref for BytesCData<'a> {
////////////////////////////////////////////////////////////////////////////////////////////////////

/// Event emitted by [`Reader::read_event_into`].
///
/// [`Reader::read_event_into`]: crate::reader::Reader::read_event_into
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum Event<'a> {
/// Text that appeared before the first opening tag or an [XML declaration].
Expand Down Expand Up @@ -1044,6 +1012,7 @@ pub enum Event<'a> {
///
/// [XML declaration]: Event::Decl
/// [std]: https://www.w3.org/TR/xml11/#NT-document
/// [`Reader`]: crate::reader::Reader
/// [`Writer`]: crate::writer::Writer
StartText(BytesStartText<'a>),
/// Start tag (with attributes) `<tag attr="value">`.
Expand Down
4 changes: 2 additions & 2 deletions src/reader/buffered_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ impl<R: BufRead> Reader<R> {
/// loop {
/// match reader.read_event_into(&mut buf) {
/// Ok(Event::Start(ref e)) => count += 1,
/// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()),
/// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
/// Err(e) => panic!("Error at position {}: {:?}", reader.buffer_position(), e),
/// Ok(Event::Eof) => break,
/// _ => (),
Expand Down Expand Up @@ -207,7 +207,7 @@ impl<R: BufRead> Reader<R> {
let s = match self.read_event_into(buf) {
Err(e) => return Err(e),

Ok(Event::Text(e)) => e.decode_and_unescape(self)?.into_owned(),
Ok(Event::Text(e)) => e.unescape()?.into_owned(),
Ok(Event::End(e)) if e.name() == end => return Ok("".to_string()),
Ok(Event::Eof) => return Err(Error::UnexpectedEof("Text".to_string())),
_ => return Err(Error::TextNotFound),
Expand Down
2 changes: 1 addition & 1 deletion src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ impl EncodingRef {
/// _ => (),
/// }
/// }
/// Ok(Event::Text(e)) => txt.push(e.decode_and_unescape(&reader).unwrap().into_owned()),
/// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
///
/// // There are several other `Event`s we do not consider here
/// _ => (),
Expand Down
8 changes: 4 additions & 4 deletions src/reader/ns_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -329,7 +329,7 @@ impl<R: BufRead> NsReader<R> {
/// }
/// }
/// Event::Text(e) => {
/// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned())
/// txt.push(e.unescape().unwrap().into_owned())
/// }
/// Event::Eof => break,
/// _ => (),
Expand Down Expand Up @@ -388,7 +388,7 @@ impl<R: BufRead> NsReader<R> {
/// (_, Event::Start(_)) => unreachable!(),
///
/// (_, Event::Text(e)) => {
/// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned())
/// txt.push(e.unescape().unwrap().into_owned())
/// }
/// (_, Event::Eof) => break,
/// _ => (),
Expand Down Expand Up @@ -566,7 +566,7 @@ impl<'i> NsReader<&'i [u8]> {
/// }
/// }
/// Event::Text(e) => {
/// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned())
/// txt.push(e.unescape().unwrap().into_owned())
/// }
/// Event::Eof => break,
/// _ => (),
Expand Down Expand Up @@ -624,7 +624,7 @@ impl<'i> NsReader<&'i [u8]> {
/// (_, Event::Start(_)) => unreachable!(),
///
/// (_, Event::Text(e)) => {
/// txt.push(e.decode_and_unescape(&reader).unwrap().into_owned())
/// txt.push(e.unescape().unwrap().into_owned())
/// }
/// (_, Event::Eof) => break,
/// _ => (),
Expand Down
4 changes: 2 additions & 2 deletions tests/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ fn test_koi8_r_encoding() {
loop {
match r.read_event() {
Ok(Text(e)) => {
e.decode_and_unescape(&r).unwrap();
e.unescape().unwrap();
}
Ok(Eof) => break,
_ => (),
Expand Down Expand Up @@ -157,7 +157,7 @@ fn fuzz_101() {
}
}
Ok(Text(e)) => {
if e.decode_and_unescape(&reader).is_err() {
if e.unescape().is_err() {
break;
}
}
Expand Down
12 changes: 6 additions & 6 deletions tests/unit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -506,7 +506,7 @@ fn test_escaped_content() {
"content unexpected: expecting '&lt;test&gt;', got '{:?}'",
from_utf8(&*e)
);
match e.decode_and_unescape(&r) {
match e.unescape() {
Ok(c) => assert_eq!(c, "<test>"),
Err(e) => panic!(
"cannot escape content at position {}: {:?}",
Expand Down Expand Up @@ -595,7 +595,7 @@ fn test_read_write_roundtrip_escape_text() -> Result<()> {
match reader.read_event()? {
Eof => break,
Text(e) => {
let t = e.decode_and_unescape(&reader).unwrap();
let t = e.unescape().unwrap();
assert!(writer
.write_event(Text(BytesText::from_plain_str(&t)))
.is_ok());
Expand Down Expand Up @@ -737,7 +737,7 @@ mod decode_with_bom_removal {

loop {
match reader.read_event() {
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()),
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()),
Ok(Eof) => break,
_ => (),
}
Expand All @@ -760,7 +760,7 @@ mod decode_with_bom_removal {

loop {
match reader.read_event() {
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()),
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()),
Ok(Eof) => break,
_ => (),
}
Expand All @@ -778,7 +778,7 @@ mod decode_with_bom_removal {

loop {
match reader.read_event() {
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()),
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()),
Ok(Eof) => break,
_ => (),
}
Expand All @@ -798,7 +798,7 @@ mod decode_with_bom_removal {

loop {
match reader.read_event() {
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal(reader.decoder()).unwrap()),
Ok(StartText(e)) => txt.push(e.decode_with_bom_removal().unwrap()),
Ok(Eof) => break,
_ => (),
}
Expand Down

0 comments on commit 5467b6c

Please sign in to comment.