fix #8

jendrikw · Sep 13, 2024 · b236d3a · b236d3a
1 parent bf2bceb
commit b236d3a
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 38 deletions.
diff --git a/src/lib.rs b/src/lib.rs
@@ -63,7 +63,7 @@
 //! # use clearurls::UrlCleaner;
 //! # fn main() -> Result<(), clearurls::Error> {
 //! let cleaner = UrlCleaner::from_embedded_rules()?;
-//! let res = cleaner.clear_single_url("https://example.com/test?utm_source=abc")?;
+//! let res = cleaner.clear_single_url_str("https://example.com/test?utm_source=abc")?;
 //! assert_eq!(res, "https://example.com/test");
 //! # Ok(())
 //! # }
@@ -79,9 +79,9 @@ extern crate std;
 
 use alloc::borrow::Cow;
 use core::fmt::{Display, Formatter};
-use core::str::Utf8Error;
+use core::str::{FromStr, Utf8Error};
 use regex::Regex;
-use url::ParseError;
+use url::{ParseError, Url};
 
 use rules::Rules;
 
@@ -165,22 +165,65 @@ impl UrlCleaner {
     ///
     /// # Errors
     /// If an error occurred. See the [`Error`] enum for possible reasons.
-    pub fn clear_single_url<'a>(&self, url: &'a str) -> Result<Cow<'a, str>, Error> {
+    pub fn clear_single_url_str<'a>(&self, url: &'a str) -> Result<Cow<'a, str>, Error> {
         if url.starts_with("data:") {
             return Ok(Cow::Borrowed(url));
         }
-        let mut result = Cow::Borrowed(url);
+        let mut result = Url::from_str(url)?;
         for p in &self.rules.providers {
-            if p.match_url(&result) {
-                let cleaned = p.remove_fields_from_url(&result, self.strip_referral_marketing)?;
-                // TODO get rid of the allocation
-                result = Cow::Owned(cleaned.into_owned());
+            if p.match_url(result.as_str()) {
+                result = p.remove_fields_from_url(&result, self.strip_referral_marketing)?;
             }
         }
 
-        Ok(result)
+        Ok(Cow::Owned(result.into()))
     }
 
+    /// Clean a single URL.
+    ///
+    /// The Cleaning may involve
+    /// - 1. removing tracking parameters
+    ///      and/or,
+    /// - 2. detecting redirections with the target url in a query parameters
+    ///
+    /// # Returns
+    /// a cleaned URL
+    ///
+    /// # Errors
+    /// If an error occurred. See the [`Error`] enum for possible reasons.
+    pub fn clear_single_url<'a>(&self, url: &'a Url) -> Result<Cow<'a, Url>, Error> {
+        if url.scheme().starts_with("data") {
+            return Ok(Cow::Borrowed(url));
+        }
+        let mut url = Cow::Borrowed(url);
+        for p in &self.rules.providers {
+            if p.match_url(url.as_str()) {
+                url = Cow::Owned(p.remove_fields_from_url(&url, self.strip_referral_marketing)?);
+            }
+        }
+
+        Ok(url)
+    }
+
+    /// Clean all URLs in a text.
+    ///
+    /// This may involve
+    /// - 1. removing tracking parameters
+    ///      and/or,
+    /// - 2. detecting redirections with the target url in a query parameters
+    ///
+    /// # Returns
+    /// The string with all URLs inside cleaned.
+    /// Text outside of URLs is left unchanged.
+    ///
+    /// # Errors
+    /// Alls errors encountered are returned in a [`Vec`].
+    #[cfg(feature = "linkify")]
+    pub fn clear_text<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, alloc::vec::Vec<Error>> {
+        self.clear_text_with_linkfinder(s, &linkify::LinkFinder::new())
+    }
+
+
     /// Clean all URLs in a text.
     ///
     /// This may involve
@@ -194,9 +237,8 @@ impl UrlCleaner {
     ///
     /// # Errors
     /// Alls errors encountered are returned in a [`Vec`].
-    /// ```
     #[cfg(feature = "linkify")]
-    pub fn clear_text<'a>(
+    pub fn clear_text_with_linkfinder<'a>(
         &self,
         s: &'a str,
         finder: &linkify::LinkFinder,
@@ -209,7 +251,7 @@ impl UrlCleaner {
 
         for res in finder.spans(s) {
             match res.kind() {
-                Some(linkify::LinkKind::Url) => match self.clear_single_url(res.as_str()) {
+                Some(linkify::LinkKind::Url) => match self.clear_single_url_str(res.as_str()) {
                     Ok(cow) => spans.push(cow),
                     Err(e) => errors.push(e),
                 },
@@ -251,7 +293,7 @@ impl UrlCleaner {
         use alloc::string::String;
 
         fn replace_url(cleaner: &UrlCleaner, url: &mut String) -> Result<(), Error> {
-            match cleaner.clear_single_url(url)? {
+            match cleaner.clear_single_url_str(url)? {
                 Cow::Borrowed(_) => {}
                 Cow::Owned(new_url) => {
                     *url = new_url;

diff --git a/src/rules.rs b/src/rules.rs
@@ -1,7 +1,6 @@
 use alloc::borrow::Cow;
 use alloc::str::FromStr;
 use alloc::string::String;
-use alloc::string::ToString;
 use alloc::vec::Vec;
 
 use percent_encoding::percent_decode_str;
@@ -38,16 +37,16 @@ pub(crate) struct Provider {
 }
 
 impl Provider {
-    pub(crate) fn remove_fields_from_url<'a>(
+    pub(crate) fn remove_fields_from_url(
         &self,
-        url: &'a str,
+        url: &Url,
         strip_referral_marketing: bool,
-    ) -> Result<Cow<'a, str>, Error> {
-        if let Some(redirect) = self.get_redirection(url)? {
+    ) -> Result<Url, Error> {
+        if let Some(redirect) = self.get_redirection(url.as_str())? {
             let url = repeatedly_urldecode(redirect)?;
-            return Ok(url);
+            return Ok(Url::from_str(&url)?);
         };
-        let mut url = Cow::Borrowed(url);
+        let mut url = Cow::Borrowed(url.as_str());
         for r in &self.raw_rules {
             match r.replace_all(&url, "") {
                 Cow::Borrowed(_) => {}
@@ -70,7 +69,7 @@ impl Provider {
         url.set_query(query.as_deref());
         url.set_fragment(fragment.as_deref());
 
-        Ok(Cow::Owned(url.to_string())) // I'm sad about the allocation
+        Ok(url)
     }
 
     pub(crate) fn match_url(&self, url: &str) -> bool {

diff --git a/src/tests/mod.rs b/src/tests/mod.rs
@@ -44,9 +44,9 @@ fn test_strip_referral_marketing() {
         redirections: vec![],
     };
     let res = provider
-        .remove_fields_from_url("https://example.com?ref=1", true)
+        .remove_fields_from_url(&Url::from_str("https://example.com?ref=1").unwrap(), true)
         .unwrap();
-    assert_eq!(res, "https://example.com/");
+    assert_eq!(res.as_str(), "https://example.com/");
 }
 
 //noinspection RegExpSimplifiable
@@ -63,7 +63,7 @@ fn test_invalid_redirection() {
     };
     let err = provider
         .remove_fields_from_url(
-            "https://google.co.uk/url?foo=bar&q=http%3A%2F%2Fexample.com%2Fimage.png&bar=foo",
+            &Url::from_str("https://google.co.uk/url?foo=bar&q=http%3A%2F%2Fexample.com%2Fimage.png&bar=foo").unwrap(),
             false,
         )
         .unwrap_err();
@@ -88,7 +88,7 @@ fn test_invalid_urldecode() {
     };
     // a byte F0 is not valid utf 8
     let err = provider
-        .remove_fields_from_url("https://google.co.uk/url?foo=bar&q=http%F0", false)
+        .remove_fields_from_url(&Url::from_str("https://google.co.uk/url?foo=bar&q=http%F0").unwrap(), false)
         .unwrap_err();
     assert_matches!(err, PercentDecodeUtf8Error(_));
     #[cfg(feature = "std")]
@@ -111,8 +111,8 @@ fn test_raw_rules_unchanged() {
         exceptions: RegexSet::default(),
         redirections: vec![],
     };
-    let res = provider.remove_fields_from_url("https://pantip.com/", false);
-    assert_eq!(res.unwrap(), "https://pantip.com/");
+    let res = provider.remove_fields_from_url(&Url::from_str("https://pantip.com/").unwrap(), false);
+    assert_eq!(res.unwrap().as_str(), "https://pantip.com/");
 }
 
 #[test]
@@ -126,7 +126,7 @@ fn test_raw_rules_produce_invalid_url() {
         redirections: vec![],
     };
     let err = provider
-        .remove_fields_from_url("https://example.com", false)
+        .remove_fields_from_url(&Url::from_str("https://example.com").unwrap(), false)
         .unwrap_err();
     assert_matches!(err, Error::UrlSyntax(_));
     #[cfg(feature = "std")]
@@ -219,7 +219,7 @@ fn test_remove_fields_from_url_errors() {
         },
         strip_referral_marketing: false,
     };
-    let err = provider.clear_single_url("//example.com").unwrap_err();
+    let err = provider.clear_single_url_str("//example.com").unwrap_err();
     assert_matches!(err, Error::UrlSyntax(_));
     #[cfg(feature = "std")]
     {

diff --git a/tests/test_linkify.rs b/tests/test_linkify.rs
@@ -1,16 +1,14 @@
 #[cfg(feature = "linkify")]
 #[test]
 fn test_linkify() {
-    use linkify::LinkFinder;
-    use clearurls::UrlCleaner;
     use clearurls::Error;
+    use clearurls::UrlCleaner;
 
     let cleaner = UrlCleaner::from_embedded_rules().unwrap();
-    let finder = LinkFinder::new();
 
     let test = |msg: &str, input: &str, expected: &str| {
         let result = cleaner
-            .clear_text(input, &finder)
+            .clear_text(input)
             .unwrap_or_else(|e| panic!("error in test {msg}: {e:?}"));
 
         assert_eq!(
@@ -31,6 +29,9 @@ fn test_linkify() {
         "This is a [markdown link](http://example.com/), and another: http://example.com/",
     );
 
-    let err = cleaner.clear_text("This is a [markdown link](http://example.com/?&&&&), and another: https://google.co.uk/url?foo=bar&q=http%F0", &finder);
-    assert!(matches!(err.unwrap_err()[..], [Error::PercentDecodeUtf8Error(_)]));
+    let err = cleaner.clear_text("This is a [markdown link](http://example.com/?&&&&), and another: https://google.co.uk/url?foo=bar&q=http%F0");
+    assert!(matches!(
+        err.unwrap_err()[..],
+        [Error::PercentDecodeUtf8Error(_)]
+    ));
 }
diff --git a/tests/test_single_url.rs b/tests/test_single_url.rs
@@ -1,19 +1,34 @@
-use clearurls::UrlCleaner;
+use std::str::FromStr;
+use url::{ParseError, Url};
+use clearurls::{Error, UrlCleaner};
 
 #[test]
 fn test_single_url() {
     let cleaner = UrlCleaner::from_embedded_rules().unwrap();
 
     let test = |original: &str, expected: &str| {
-        let result = cleaner.clear_single_url(original).unwrap().into_owned();
+        let result = cleaner.clear_single_url_str(original).unwrap();
         assert_eq!(result, expected);
+        let url = Url::from_str(original).unwrap();
+        let result = cleaner.clear_single_url(&url).unwrap();
+        assert_eq!(result.as_str(), expected);
     };
 
     test(
         "https://deezer.com/track/891177062?utm_source=deezer",
         "https://deezer.com/track/891177062",
     );
 
+    // url encoded parameter
+    test(
+        "https://www.google.com/url?q=https%3A%2F%2Fpypi.org%2Fproject%2FUnalix",
+        "https://pypi.org/project/Unalix",
+    );
+
+    // url encoded parameter that's not a url
+    assert!(matches!(cleaner.clear_single_url_str("https://www.google.com/url?q=http%3A%2F%2F%5B%3A%3A%3A1%5D").unwrap_err(), Error::UrlSyntax(ParseError::InvalidIpv6Address)));
+    assert!(matches!(cleaner.clear_single_url(&Url::from_str("https://www.google.com/url?q=http%3A%2F%2F%5B%3A%3A%3A1%5D").unwrap()).unwrap_err(), Error::UrlSyntax(ParseError::InvalidIpv6Address)));
+
     // double url encoded parameter
     test(
         "https://www.google.com/url?q=https%253A%252F%252Fpypi.org%252Fproject%252FUnalix",