Skip to content

Commit

Permalink
fix #8
Browse files Browse the repository at this point in the history
  • Loading branch information
jendrikw committed Sep 13, 2024
1 parent bf2bceb commit b236d3a
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 38 deletions.
70 changes: 56 additions & 14 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
//! # use clearurls::UrlCleaner;
//! # fn main() -> Result<(), clearurls::Error> {
//! let cleaner = UrlCleaner::from_embedded_rules()?;
//! let res = cleaner.clear_single_url("https://example.com/test?utm_source=abc")?;
//! let res = cleaner.clear_single_url_str("https://example.com/test?utm_source=abc")?;
//! assert_eq!(res, "https://example.com/test");
//! # Ok(())
//! # }
Expand All @@ -79,9 +79,9 @@ extern crate std;

use alloc::borrow::Cow;
use core::fmt::{Display, Formatter};
use core::str::Utf8Error;
use core::str::{FromStr, Utf8Error};
use regex::Regex;
use url::ParseError;
use url::{ParseError, Url};

use rules::Rules;

Expand Down Expand Up @@ -165,22 +165,65 @@ impl UrlCleaner {
///
/// # Errors
/// If an error occurred. See the [`Error`] enum for possible reasons.
pub fn clear_single_url<'a>(&self, url: &'a str) -> Result<Cow<'a, str>, Error> {
pub fn clear_single_url_str<'a>(&self, url: &'a str) -> Result<Cow<'a, str>, Error> {
if url.starts_with("data:") {
return Ok(Cow::Borrowed(url));
}
let mut result = Cow::Borrowed(url);
let mut result = Url::from_str(url)?;
for p in &self.rules.providers {
if p.match_url(&result) {
let cleaned = p.remove_fields_from_url(&result, self.strip_referral_marketing)?;
// TODO get rid of the allocation
result = Cow::Owned(cleaned.into_owned());
if p.match_url(result.as_str()) {
result = p.remove_fields_from_url(&result, self.strip_referral_marketing)?;
}
}

Ok(result)
Ok(Cow::Owned(result.into()))
}

/// Clean a single URL.
///
/// The Cleaning may involve
/// - 1. removing tracking parameters
/// and/or,
/// - 2. detecting redirections with the target url in a query parameters
///
/// # Returns
/// a cleaned URL
///
/// # Errors
/// If an error occurred. See the [`Error`] enum for possible reasons.
pub fn clear_single_url<'a>(&self, url: &'a Url) -> Result<Cow<'a, Url>, Error> {
if url.scheme().starts_with("data") {
return Ok(Cow::Borrowed(url));
}
let mut url = Cow::Borrowed(url);
for p in &self.rules.providers {
if p.match_url(url.as_str()) {
url = Cow::Owned(p.remove_fields_from_url(&url, self.strip_referral_marketing)?);
}
}

Ok(url)
}

/// Clean all URLs in a text.
///
/// This may involve
/// - 1. removing tracking parameters
/// and/or,
/// - 2. detecting redirections with the target url in a query parameters
///
/// # Returns
/// The string with all URLs inside cleaned.
/// Text outside of URLs is left unchanged.
///
/// # Errors
/// Alls errors encountered are returned in a [`Vec`].
#[cfg(feature = "linkify")]
pub fn clear_text<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, alloc::vec::Vec<Error>> {
self.clear_text_with_linkfinder(s, &linkify::LinkFinder::new())
}


/// Clean all URLs in a text.
///
/// This may involve
Expand All @@ -194,9 +237,8 @@ impl UrlCleaner {
///
/// # Errors
/// Alls errors encountered are returned in a [`Vec`].
/// ```
#[cfg(feature = "linkify")]
pub fn clear_text<'a>(
pub fn clear_text_with_linkfinder<'a>(
&self,
s: &'a str,
finder: &linkify::LinkFinder,
Expand All @@ -209,7 +251,7 @@ impl UrlCleaner {

for res in finder.spans(s) {
match res.kind() {
Some(linkify::LinkKind::Url) => match self.clear_single_url(res.as_str()) {
Some(linkify::LinkKind::Url) => match self.clear_single_url_str(res.as_str()) {
Ok(cow) => spans.push(cow),
Err(e) => errors.push(e),
},
Expand Down Expand Up @@ -251,7 +293,7 @@ impl UrlCleaner {
use alloc::string::String;

fn replace_url(cleaner: &UrlCleaner, url: &mut String) -> Result<(), Error> {
match cleaner.clear_single_url(url)? {
match cleaner.clear_single_url_str(url)? {
Cow::Borrowed(_) => {}
Cow::Owned(new_url) => {
*url = new_url;
Expand Down
15 changes: 7 additions & 8 deletions src/rules.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
use alloc::borrow::Cow;
use alloc::str::FromStr;
use alloc::string::String;
use alloc::string::ToString;
use alloc::vec::Vec;

use percent_encoding::percent_decode_str;
Expand Down Expand Up @@ -38,16 +37,16 @@ pub(crate) struct Provider {
}

impl Provider {
pub(crate) fn remove_fields_from_url<'a>(
pub(crate) fn remove_fields_from_url(
&self,
url: &'a str,
url: &Url,
strip_referral_marketing: bool,
) -> Result<Cow<'a, str>, Error> {
if let Some(redirect) = self.get_redirection(url)? {
) -> Result<Url, Error> {
if let Some(redirect) = self.get_redirection(url.as_str())? {
let url = repeatedly_urldecode(redirect)?;
return Ok(url);
return Ok(Url::from_str(&url)?);
};
let mut url = Cow::Borrowed(url);
let mut url = Cow::Borrowed(url.as_str());
for r in &self.raw_rules {
match r.replace_all(&url, "") {
Cow::Borrowed(_) => {}
Expand All @@ -70,7 +69,7 @@ impl Provider {
url.set_query(query.as_deref());
url.set_fragment(fragment.as_deref());

Ok(Cow::Owned(url.to_string())) // I'm sad about the allocation
Ok(url)
}

pub(crate) fn match_url(&self, url: &str) -> bool {
Expand Down
16 changes: 8 additions & 8 deletions src/tests/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ fn test_strip_referral_marketing() {
redirections: vec![],
};
let res = provider
.remove_fields_from_url("https://example.com?ref=1", true)
.remove_fields_from_url(&Url::from_str("https://example.com?ref=1").unwrap(), true)
.unwrap();
assert_eq!(res, "https://example.com/");
assert_eq!(res.as_str(), "https://example.com/");
}

//noinspection RegExpSimplifiable
Expand All @@ -63,7 +63,7 @@ fn test_invalid_redirection() {
};
let err = provider
.remove_fields_from_url(
"https://google.co.uk/url?foo=bar&q=http%3A%2F%2Fexample.com%2Fimage.png&bar=foo",
&Url::from_str("https://google.co.uk/url?foo=bar&q=http%3A%2F%2Fexample.com%2Fimage.png&bar=foo").unwrap(),
false,
)
.unwrap_err();
Expand All @@ -88,7 +88,7 @@ fn test_invalid_urldecode() {
};
// a byte F0 is not valid utf 8
let err = provider
.remove_fields_from_url("https://google.co.uk/url?foo=bar&q=http%F0", false)
.remove_fields_from_url(&Url::from_str("https://google.co.uk/url?foo=bar&q=http%F0").unwrap(), false)
.unwrap_err();
assert_matches!(err, PercentDecodeUtf8Error(_));
#[cfg(feature = "std")]
Expand All @@ -111,8 +111,8 @@ fn test_raw_rules_unchanged() {
exceptions: RegexSet::default(),
redirections: vec![],
};
let res = provider.remove_fields_from_url("https://pantip.com/", false);
assert_eq!(res.unwrap(), "https://pantip.com/");
let res = provider.remove_fields_from_url(&Url::from_str("https://pantip.com/").unwrap(), false);
assert_eq!(res.unwrap().as_str(), "https://pantip.com/");
}

#[test]
Expand All @@ -126,7 +126,7 @@ fn test_raw_rules_produce_invalid_url() {
redirections: vec![],
};
let err = provider
.remove_fields_from_url("https://example.com", false)
.remove_fields_from_url(&Url::from_str("https://example.com").unwrap(), false)
.unwrap_err();
assert_matches!(err, Error::UrlSyntax(_));
#[cfg(feature = "std")]
Expand Down Expand Up @@ -219,7 +219,7 @@ fn test_remove_fields_from_url_errors() {
},
strip_referral_marketing: false,
};
let err = provider.clear_single_url("//example.com").unwrap_err();
let err = provider.clear_single_url_str("//example.com").unwrap_err();
assert_matches!(err, Error::UrlSyntax(_));
#[cfg(feature = "std")]
{
Expand Down
13 changes: 7 additions & 6 deletions tests/test_linkify.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
#[cfg(feature = "linkify")]
#[test]
fn test_linkify() {
use linkify::LinkFinder;
use clearurls::UrlCleaner;
use clearurls::Error;
use clearurls::UrlCleaner;

let cleaner = UrlCleaner::from_embedded_rules().unwrap();
let finder = LinkFinder::new();

let test = |msg: &str, input: &str, expected: &str| {
let result = cleaner
.clear_text(input, &finder)
.clear_text(input)
.unwrap_or_else(|e| panic!("error in test {msg}: {e:?}"));

assert_eq!(
Expand All @@ -31,6 +29,9 @@ fn test_linkify() {
"This is a [markdown link](http://example.com/), and another: http://example.com/",
);

let err = cleaner.clear_text("This is a [markdown link](http://example.com/?&&&&), and another: https://google.co.uk/url?foo=bar&q=http%F0", &finder);
assert!(matches!(err.unwrap_err()[..], [Error::PercentDecodeUtf8Error(_)]));
let err = cleaner.clear_text("This is a [markdown link](http://example.com/?&&&&), and another: https://google.co.uk/url?foo=bar&q=http%F0");
assert!(matches!(
err.unwrap_err()[..],
[Error::PercentDecodeUtf8Error(_)]
));
}
19 changes: 17 additions & 2 deletions tests/test_single_url.rs
Original file line number Diff line number Diff line change
@@ -1,19 +1,34 @@
use clearurls::UrlCleaner;
use std::str::FromStr;
use url::{ParseError, Url};
use clearurls::{Error, UrlCleaner};

#[test]
fn test_single_url() {
let cleaner = UrlCleaner::from_embedded_rules().unwrap();

let test = |original: &str, expected: &str| {
let result = cleaner.clear_single_url(original).unwrap().into_owned();
let result = cleaner.clear_single_url_str(original).unwrap();
assert_eq!(result, expected);
let url = Url::from_str(original).unwrap();
let result = cleaner.clear_single_url(&url).unwrap();
assert_eq!(result.as_str(), expected);
};

test(
"https://deezer.com/track/891177062?utm_source=deezer",
"https://deezer.com/track/891177062",
);

// url encoded parameter
test(
"https://www.google.com/url?q=https%3A%2F%2Fpypi.org%2Fproject%2FUnalix",
"https://pypi.org/project/Unalix",
);

// url encoded parameter that's not a url
assert!(matches!(cleaner.clear_single_url_str("https://www.google.com/url?q=http%3A%2F%2F%5B%3A%3A%3A1%5D").unwrap_err(), Error::UrlSyntax(ParseError::InvalidIpv6Address)));
assert!(matches!(cleaner.clear_single_url(&Url::from_str("https://www.google.com/url?q=http%3A%2F%2F%5B%3A%3A%3A1%5D").unwrap()).unwrap_err(), Error::UrlSyntax(ParseError::InvalidIpv6Address)));

// double url encoded parameter
test(
"https://www.google.com/url?q=https%253A%252F%252Fpypi.org%252Fproject%252FUnalix",
Expand Down

0 comments on commit b236d3a

Please sign in to comment.