From b236d3aee4d09e456b2bf6b9ee6ced801a698899 Mon Sep 17 00:00:00 2001 From: Jendrik Date: Fri, 13 Sep 2024 21:22:02 +0200 Subject: [PATCH] fix #8 --- src/lib.rs | 70 ++++++++++++++++++++++++++++++++-------- src/rules.rs | 15 ++++----- src/tests/mod.rs | 16 ++++----- tests/test_linkify.rs | 13 ++++---- tests/test_single_url.rs | 19 +++++++++-- 5 files changed, 95 insertions(+), 38 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index cfffe13..6b4b32d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -63,7 +63,7 @@ //! # use clearurls::UrlCleaner; //! # fn main() -> Result<(), clearurls::Error> { //! let cleaner = UrlCleaner::from_embedded_rules()?; -//! let res = cleaner.clear_single_url("https://example.com/test?utm_source=abc")?; +//! let res = cleaner.clear_single_url_str("https://example.com/test?utm_source=abc")?; //! assert_eq!(res, "https://example.com/test"); //! # Ok(()) //! # } @@ -79,9 +79,9 @@ extern crate std; use alloc::borrow::Cow; use core::fmt::{Display, Formatter}; -use core::str::Utf8Error; +use core::str::{FromStr, Utf8Error}; use regex::Regex; -use url::ParseError; +use url::{ParseError, Url}; use rules::Rules; @@ -165,22 +165,65 @@ impl UrlCleaner { /// /// # Errors /// If an error occurred. See the [`Error`] enum for possible reasons. - pub fn clear_single_url<'a>(&self, url: &'a str) -> Result, Error> { + pub fn clear_single_url_str<'a>(&self, url: &'a str) -> Result, Error> { if url.starts_with("data:") { return Ok(Cow::Borrowed(url)); } - let mut result = Cow::Borrowed(url); + let mut result = Url::from_str(url)?; for p in &self.rules.providers { - if p.match_url(&result) { - let cleaned = p.remove_fields_from_url(&result, self.strip_referral_marketing)?; - // TODO get rid of the allocation - result = Cow::Owned(cleaned.into_owned()); + if p.match_url(result.as_str()) { + result = p.remove_fields_from_url(&result, self.strip_referral_marketing)?; } } - Ok(result) + Ok(Cow::Owned(result.into())) } + /// Clean a single URL. + /// + /// The Cleaning may involve + /// - 1. removing tracking parameters + /// and/or, + /// - 2. detecting redirections with the target url in a query parameters + /// + /// # Returns + /// a cleaned URL + /// + /// # Errors + /// If an error occurred. See the [`Error`] enum for possible reasons. + pub fn clear_single_url<'a>(&self, url: &'a Url) -> Result, Error> { + if url.scheme().starts_with("data") { + return Ok(Cow::Borrowed(url)); + } + let mut url = Cow::Borrowed(url); + for p in &self.rules.providers { + if p.match_url(url.as_str()) { + url = Cow::Owned(p.remove_fields_from_url(&url, self.strip_referral_marketing)?); + } + } + + Ok(url) + } + + /// Clean all URLs in a text. + /// + /// This may involve + /// - 1. removing tracking parameters + /// and/or, + /// - 2. detecting redirections with the target url in a query parameters + /// + /// # Returns + /// The string with all URLs inside cleaned. + /// Text outside of URLs is left unchanged. + /// + /// # Errors + /// Alls errors encountered are returned in a [`Vec`]. + #[cfg(feature = "linkify")] + pub fn clear_text<'a>(&self, s: &'a str) -> Result, alloc::vec::Vec> { + self.clear_text_with_linkfinder(s, &linkify::LinkFinder::new()) + } + + /// Clean all URLs in a text. /// /// This may involve @@ -194,9 +237,8 @@ impl UrlCleaner { /// /// # Errors /// Alls errors encountered are returned in a [`Vec`]. - /// ``` #[cfg(feature = "linkify")] - pub fn clear_text<'a>( + pub fn clear_text_with_linkfinder<'a>( &self, s: &'a str, finder: &linkify::LinkFinder, @@ -209,7 +251,7 @@ impl UrlCleaner { for res in finder.spans(s) { match res.kind() { - Some(linkify::LinkKind::Url) => match self.clear_single_url(res.as_str()) { + Some(linkify::LinkKind::Url) => match self.clear_single_url_str(res.as_str()) { Ok(cow) => spans.push(cow), Err(e) => errors.push(e), }, @@ -251,7 +293,7 @@ impl UrlCleaner { use alloc::string::String; fn replace_url(cleaner: &UrlCleaner, url: &mut String) -> Result<(), Error> { - match cleaner.clear_single_url(url)? { + match cleaner.clear_single_url_str(url)? { Cow::Borrowed(_) => {} Cow::Owned(new_url) => { *url = new_url; diff --git a/src/rules.rs b/src/rules.rs index 0f88f47..2b142e5 100644 --- a/src/rules.rs +++ b/src/rules.rs @@ -1,7 +1,6 @@ use alloc::borrow::Cow; use alloc::str::FromStr; use alloc::string::String; -use alloc::string::ToString; use alloc::vec::Vec; use percent_encoding::percent_decode_str; @@ -38,16 +37,16 @@ pub(crate) struct Provider { } impl Provider { - pub(crate) fn remove_fields_from_url<'a>( + pub(crate) fn remove_fields_from_url( &self, - url: &'a str, + url: &Url, strip_referral_marketing: bool, - ) -> Result, Error> { - if let Some(redirect) = self.get_redirection(url)? { + ) -> Result { + if let Some(redirect) = self.get_redirection(url.as_str())? { let url = repeatedly_urldecode(redirect)?; - return Ok(url); + return Ok(Url::from_str(&url)?); }; - let mut url = Cow::Borrowed(url); + let mut url = Cow::Borrowed(url.as_str()); for r in &self.raw_rules { match r.replace_all(&url, "") { Cow::Borrowed(_) => {} @@ -70,7 +69,7 @@ impl Provider { url.set_query(query.as_deref()); url.set_fragment(fragment.as_deref()); - Ok(Cow::Owned(url.to_string())) // I'm sad about the allocation + Ok(url) } pub(crate) fn match_url(&self, url: &str) -> bool { diff --git a/src/tests/mod.rs b/src/tests/mod.rs index c2b2aa5..f3aabd9 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -44,9 +44,9 @@ fn test_strip_referral_marketing() { redirections: vec![], }; let res = provider - .remove_fields_from_url("https://example.com?ref=1", true) + .remove_fields_from_url(&Url::from_str("https://example.com?ref=1").unwrap(), true) .unwrap(); - assert_eq!(res, "https://example.com/"); + assert_eq!(res.as_str(), "https://example.com/"); } //noinspection RegExpSimplifiable @@ -63,7 +63,7 @@ fn test_invalid_redirection() { }; let err = provider .remove_fields_from_url( - "https://google.co.uk/url?foo=bar&q=http%3A%2F%2Fexample.com%2Fimage.png&bar=foo", + &Url::from_str("https://google.co.uk/url?foo=bar&q=http%3A%2F%2Fexample.com%2Fimage.png&bar=foo").unwrap(), false, ) .unwrap_err(); @@ -88,7 +88,7 @@ fn test_invalid_urldecode() { }; // a byte F0 is not valid utf 8 let err = provider - .remove_fields_from_url("https://google.co.uk/url?foo=bar&q=http%F0", false) + .remove_fields_from_url(&Url::from_str("https://google.co.uk/url?foo=bar&q=http%F0").unwrap(), false) .unwrap_err(); assert_matches!(err, PercentDecodeUtf8Error(_)); #[cfg(feature = "std")] @@ -111,8 +111,8 @@ fn test_raw_rules_unchanged() { exceptions: RegexSet::default(), redirections: vec![], }; - let res = provider.remove_fields_from_url("https://pantip.com/", false); - assert_eq!(res.unwrap(), "https://pantip.com/"); + let res = provider.remove_fields_from_url(&Url::from_str("https://pantip.com/").unwrap(), false); + assert_eq!(res.unwrap().as_str(), "https://pantip.com/"); } #[test] @@ -126,7 +126,7 @@ fn test_raw_rules_produce_invalid_url() { redirections: vec![], }; let err = provider - .remove_fields_from_url("https://example.com", false) + .remove_fields_from_url(&Url::from_str("https://example.com").unwrap(), false) .unwrap_err(); assert_matches!(err, Error::UrlSyntax(_)); #[cfg(feature = "std")] @@ -219,7 +219,7 @@ fn test_remove_fields_from_url_errors() { }, strip_referral_marketing: false, }; - let err = provider.clear_single_url("//example.com").unwrap_err(); + let err = provider.clear_single_url_str("//example.com").unwrap_err(); assert_matches!(err, Error::UrlSyntax(_)); #[cfg(feature = "std")] { diff --git a/tests/test_linkify.rs b/tests/test_linkify.rs index ee9a0a2..dc4c583 100644 --- a/tests/test_linkify.rs +++ b/tests/test_linkify.rs @@ -1,16 +1,14 @@ #[cfg(feature = "linkify")] #[test] fn test_linkify() { - use linkify::LinkFinder; - use clearurls::UrlCleaner; use clearurls::Error; + use clearurls::UrlCleaner; let cleaner = UrlCleaner::from_embedded_rules().unwrap(); - let finder = LinkFinder::new(); let test = |msg: &str, input: &str, expected: &str| { let result = cleaner - .clear_text(input, &finder) + .clear_text(input) .unwrap_or_else(|e| panic!("error in test {msg}: {e:?}")); assert_eq!( @@ -31,6 +29,9 @@ fn test_linkify() { "This is a [markdown link](http://example.com/), and another: http://example.com/", ); - let err = cleaner.clear_text("This is a [markdown link](http://example.com/?&&&&), and another: https://google.co.uk/url?foo=bar&q=http%F0", &finder); - assert!(matches!(err.unwrap_err()[..], [Error::PercentDecodeUtf8Error(_)])); + let err = cleaner.clear_text("This is a [markdown link](http://example.com/?&&&&), and another: https://google.co.uk/url?foo=bar&q=http%F0"); + assert!(matches!( + err.unwrap_err()[..], + [Error::PercentDecodeUtf8Error(_)] + )); } diff --git a/tests/test_single_url.rs b/tests/test_single_url.rs index 60aeb0d..af23a6c 100644 --- a/tests/test_single_url.rs +++ b/tests/test_single_url.rs @@ -1,12 +1,17 @@ -use clearurls::UrlCleaner; +use std::str::FromStr; +use url::{ParseError, Url}; +use clearurls::{Error, UrlCleaner}; #[test] fn test_single_url() { let cleaner = UrlCleaner::from_embedded_rules().unwrap(); let test = |original: &str, expected: &str| { - let result = cleaner.clear_single_url(original).unwrap().into_owned(); + let result = cleaner.clear_single_url_str(original).unwrap(); assert_eq!(result, expected); + let url = Url::from_str(original).unwrap(); + let result = cleaner.clear_single_url(&url).unwrap(); + assert_eq!(result.as_str(), expected); }; test( @@ -14,6 +19,16 @@ fn test_single_url() { "https://deezer.com/track/891177062", ); + // url encoded parameter + test( + "https://www.google.com/url?q=https%3A%2F%2Fpypi.org%2Fproject%2FUnalix", + "https://pypi.org/project/Unalix", + ); + + // url encoded parameter that's not a url + assert!(matches!(cleaner.clear_single_url_str("https://www.google.com/url?q=http%3A%2F%2F%5B%3A%3A%3A1%5D").unwrap_err(), Error::UrlSyntax(ParseError::InvalidIpv6Address))); + assert!(matches!(cleaner.clear_single_url(&Url::from_str("https://www.google.com/url?q=http%3A%2F%2F%5B%3A%3A%3A1%5D").unwrap()).unwrap_err(), Error::UrlSyntax(ParseError::InvalidIpv6Address))); + // double url encoded parameter test( "https://www.google.com/url?q=https%253A%252F%252Fpypi.org%252Fproject%252FUnalix",