Skip to content

Commit

Permalink
feat: expanded url normalisation to support more functionality
Browse files Browse the repository at this point in the history
outlined by IIPC's urlcanon project and set up a baseline for testing
in line with the setup in that repo. I'm not sure I agree with all the
normalisation they are doing.
  • Loading branch information
mijho committed Apr 1, 2024
1 parent f593c45 commit 126a979
Show file tree
Hide file tree
Showing 4 changed files with 283 additions and 16 deletions.
134 changes: 134 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,9 @@ name = "surt"
path = "src/bin/surt.rs"

[dependencies]
regex = "1.10.4"
url = "2.5.0"
url-escape = "0.1.1"

[dev-dependencies]
serde_json = "1.0.115"
123 changes: 107 additions & 16 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
use regex::Regex;
use url::{ParseError, Url};

fn normalize_surt(surt: &str) -> String {
let mut surt = surt.to_string();

// decode surt
surt = url_escape::decode(&surt).to_string();

// replace whitespace with %20
surt = surt.replace(' ', "%20");

Expand All @@ -24,27 +28,52 @@ fn normalize_surt(surt: &str) -> String {
surt
}

fn normalize_url(url: &str) -> String {
let mut url = url.to_string();
fn normalize_url(mut parsed: Url) -> String {
println!("parsed: {:?}", parsed);

let session_regexp = Regex::new(r"(?i)(&|^)(?:jsessionid=[0-9a-z$]{10,}|sessionid=[0-9a-z]{16,}|phpsessid=[0-9a-z]{16,}|sid=[0-9a-z]{16,}|aspsessionid[a-z]{8}=[0-9a-z]{16,}|cfid=[0-9]+&cftoken=[0-9a-z-]+)(&|$)").unwrap();
// lowercase and sort query parameters
if parsed.query().is_some() {
let mut query = parsed.query().unwrap().split('&').collect::<Vec<&str>>();
query.sort();
let mut query = query.join("&").to_lowercase();
query = session_regexp.replace_all(&query, "$1$3").to_string();
parsed.set_query(Some(&query));
}

let www_regexp = Regex::new(r"^www(\w?)+\.(.*\.+)").unwrap();
if parsed.host_str().is_some() {
// remove www(ish) subdomain
let host_str = parsed.host_str().unwrap();
let host_str = www_regexp.replace(host_str, "${2}").to_string();

// lowercase host
let host_str = host_str.to_lowercase();

parsed.set_host(Some(&host_str)).unwrap();
}

println!("parsed: {:?}", parsed);

// replace trailing slash
if url.ends_with('/') {
let mut url = parsed.to_string();

// replace trailing slash unless it's the root path
if url.ends_with('/') && parsed.path() != "/" {
url.pop();
}

// remove www subdomain after scheme
// TODO: make this less clunky
if url.starts_with("http://www.") {
url = url.replacen("http://www.", "http://", 1);
} else if url.starts_with("https://www.") {
url = url.replacen("https://www.", "https://", 1);
// replace trailing ?
if url.ends_with('?') {
url.pop();
}

println!("url: {:?}", url);
url
}

pub fn generate_surt(url: &str) -> Result<String, ParseError> {
let parsed = Url::parse(&normalize_url(url))?;
let mut parsed = Url::parse(url)?;
parsed = Url::parse(&normalize_url(parsed))?;

let scheme = parsed.scheme();
match scheme == "https" || scheme == "http" {
Expand All @@ -71,9 +100,8 @@ pub fn generate_surt(url: &str) -> Result<String, ParseError> {
}

if parsed.query().is_some() {
let mut query = parsed.query().unwrap().split('&').collect::<Vec<&str>>();
query.sort();
surt += &format!("?{}", query.join("&").to_lowercase());
let query = parsed.query().unwrap().to_lowercase();
surt += &format!("?{}", query);
}

if parsed.fragment().is_some() {
Expand All @@ -89,6 +117,69 @@ pub fn generate_surt(url: &str) -> Result<String, ParseError> {
#[cfg(test)]
mod tests {
use super::*;
use serde_json::Value;
use std::collections::HashMap;
use std::fs::File;
use std::io::BufReader;

/// Reads `./test_data/surt.json` and flattens it into
/// section-name -> (input -> expected-output) string maps.
fn load_test_data() -> HashMap<String, HashMap<String, String>> {
    let reader = BufReader::new(File::open("./test_data/surt.json").unwrap());
    let root: Value = serde_json::from_reader(reader).unwrap();

    let mut sections = HashMap::new();
    for (section, entries) in root.as_object().unwrap() {
        let mut examples = HashMap::new();
        for (input, expected) in entries.as_object().unwrap() {
            examples.insert(input.clone(), expected.as_str().unwrap().to_string());
        }
        sections.insert(section.clone(), examples);
    }
    sections
}

/// Runs every "surt"-named fixture section through `generate_surt`
/// and checks the expected SURT output.
#[test]
fn test_surt() {
    for (section, examples) in load_test_data() {
        // Only sections whose name mentions "surt" (case-insensitive)
        // belong to this test.
        if !section.to_lowercase().contains("surt") {
            continue;
        }
        println!("Testing section: {}", section);

        for (input, expected) in examples {
            println!("Testing example: {}", input);
            assert_eq!(generate_surt(&input).unwrap(), expected);
        }
    }
}

/// Runs every "url_normalization" fixture section through
/// `normalize_url` and checks the canonicalised URL string.
#[test]
fn test_url_normalization() {
    let test_data = load_test_data();

    for (section, examples) in test_data {
        // Skip sections that are not URL-normalisation fixtures.
        if !section.to_lowercase().contains("url_normalization") {
            continue;
        }
        println!("Testing section: {}", section);

        for (input, expected) in examples {
            println!("Testing example: {}", input);
            // Parse once and reuse the result — the previous version
            // parsed the same input twice (once only to debug-print it).
            let parsed = Url::parse(&input).unwrap();
            assert_eq!(normalize_url(parsed), expected);
        }
    }
}

#[test]
fn test_generate_surt_with_valid_url() {
Expand Down Expand Up @@ -186,8 +277,8 @@ mod tests {

/// `normalize_url` should drop a `www.` subdomain while keeping the
/// https scheme and the root path's trailing slash.
#[test]
fn test_normalize_url_with_www_subdomain_and_https() {
    let parsed = Url::parse("https://www.example.com").unwrap();
    assert_eq!(normalize_url(parsed), "https://example.com/");
}

Expand Down
37 changes: 37 additions & 0 deletions test_data/surt.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"defaultIASurt": {
"http://www.archive.org/": "org,archive)/",
"http://archive.org/": "org,archive)/",
"http://archive.org/goo/": "org,archive)/goo",
"http://archive.org/goo/?": "org,archive)/goo",
"http://archive.org/goo/?b&a": "org,archive)/goo?a&b",
"http://archive.org/goo/?a=2&b&a=1": "org,archive)/goo?a=1&a=2&b",
"http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221": "org,archive)/index.php?action=profile;u=4221",
"http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2": "com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2"
},
"url_normalization.DefaultIA": {
"http://www.alexa.com/": "http://alexa.com/",
"http://archive.org/index.html": "http://archive.org/index.html",
"http://archive.org/index.html?": "http://archive.org/index.html",
"http://archive.org/index.html?a=b": "http://archive.org/index.html?a=b",
"http://archive.org/index.html?b=b&a=b": "http://archive.org/index.html?a=b&b=b",
"http://archive.org/index.html?b=a&b=b&a=b": "http://archive.org/index.html?a=b&b=a&b=b",
"http://www34.archive.org/index.html?b=a&b=b&a=b": "http://archive.org/index.html?a=b&b=a&b=b"
},
"url_normalization.IA": {
"http://ARCHIVE.ORG/": "http://archive.org/",
"http://www.archive.org:80/": "http://archive.org/",
"https://www.archive.org:80/": "https://archive.org:80/",
"http://www.archive.org:443/": "http://archive.org:443/",
"https://www.archive.org:443/": "https://archive.org/",
"http://www.archive.org/big/": "http://archive.org/big",
"dns:www.archive.org": "dns:www.archive.org"
},
"url_normalization.massageHost": {
"https://foo.com": "https://foo.com/",
"https://www.foo.com": "https://foo.com/",
"https://www12.foo.com": "https://foo.com/",
"https://www2foo.com": "https://www2foo.com/",
"https://www2.www2foo.com": "https://www2foo.com/"
}
}

0 comments on commit 126a979

Please sign in to comment.