Skip to content

Commit

Permalink
feat: expanded url normalisation to support more functionality
Browse files Browse the repository at this point in the history
outlined by IIPC's urlcanon project and set up a baseline for testing
in line with the setup in that repo. I'm not sure I agree with all the
normalisation they are doing.
  • Loading branch information
mijho committed Apr 1, 2024
1 parent f593c45 commit 126a979
Show file tree
Hide file tree
Showing 4 changed files with 283 additions and 16 deletions.
134 changes: 134 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,9 @@ name = "surt"
path = "src/bin/surt.rs"

[dependencies]
regex = "1.10.4"
url = "2.5.0"
url-escape = "0.1.1"

[dev-dependencies]
serde_json = "1.0.115"
123 changes: 107 additions & 16 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
use regex::Regex;
use url::{ParseError, Url};

fn normalize_surt(surt: &str) -> String {
let mut surt = surt.to_string();

// decode surt
surt = url_escape::decode(&surt).to_string();

// replace whitespace with %20
surt = surt.replace(' ', "%20");

Expand All @@ -24,27 +28,52 @@ fn normalize_surt(surt: &str) -> String {
surt
}

fn normalize_url(url: &str) -> String {
let mut url = url.to_string();
fn normalize_url(mut parsed: Url) -> String {
println!("parsed: {:?}", parsed);

let session_regexp = Regex::new(r"(?i)(&|^)(?:jsessionid=[0-9a-z$]{10,}|sessionid=[0-9a-z]{16,}|phpsessid=[0-9a-z]{16,}|sid=[0-9a-z]{16,}|aspsessionid[a-z]{8}=[0-9a-z]{16,}|cfid=[0-9]+&cftoken=[0-9a-z-]+)(&|$)").unwrap();
// lowercase and sort query parameters
if parsed.query().is_some() {
let mut query = parsed.query().unwrap().split('&').collect::<Vec<&str>>();
query.sort();
let mut query = query.join("&").to_lowercase();
query = session_regexp.replace_all(&query, "$1$3").to_string();
parsed.set_query(Some(&query));
}

let www_regexp = Regex::new(r"^www(\w?)+\.(.*\.+)").unwrap();
if parsed.host_str().is_some() {
// remove www(ish) subdomain
let host_str = parsed.host_str().unwrap();
let host_str = www_regexp.replace(host_str, "${2}").to_string();

// lowercase host
let host_str = host_str.to_lowercase();

parsed.set_host(Some(&host_str)).unwrap();
}

println!("parsed: {:?}", parsed);

// replace trailing slash
if url.ends_with('/') {
let mut url = parsed.to_string();

// replace trailing slash unless it's the root path
if url.ends_with('/') && parsed.path() != "/" {
url.pop();
}

// remove www subdomain after scheme
// TODO: make this less clunky
if url.starts_with("http://www.") {
url = url.replacen("http://www.", "http://", 1);
} else if url.starts_with("https://www.") {
url = url.replacen("https://www.", "https://", 1);
// replace trailing ?
if url.ends_with('?') {
url.pop();
}

println!("url: {:?}", url);
url
}

pub fn generate_surt(url: &str) -> Result<String, ParseError> {
let parsed = Url::parse(&normalize_url(url))?;
let mut parsed = Url::parse(url)?;
parsed = Url::parse(&normalize_url(parsed))?;

let scheme = parsed.scheme();
match scheme == "https" || scheme == "http" {
Expand All @@ -71,9 +100,8 @@ pub fn generate_surt(url: &str) -> Result<String, ParseError> {
}

if parsed.query().is_some() {
let mut query = parsed.query().unwrap().split('&').collect::<Vec<&str>>();
query.sort();
surt += &format!("?{}", query.join("&").to_lowercase());
let query = parsed.query().unwrap().to_lowercase();
surt += &format!("?{}", query);
}

if parsed.fragment().is_some() {
Expand All @@ -89,6 +117,69 @@ pub fn generate_surt(url: &str) -> Result<String, ParseError> {
#[cfg(test)]
mod tests {
use super::*;
use serde_json::Value;
use std::collections::HashMap;
use std::fs::File;
use std::io::BufReader;

/// Reads `./test_data/surt.json` and flattens it into
/// section-name -> (input -> expected-output) string maps.
fn load_test_data() -> HashMap<String, HashMap<String, String>> {
    let reader = BufReader::new(File::open("./test_data/surt.json").unwrap());
    let root: Value = serde_json::from_reader(reader).unwrap();

    let mut sections = HashMap::new();
    for (section, entries) in root.as_object().unwrap() {
        let mut examples = HashMap::new();
        for (input, expected) in entries.as_object().unwrap() {
            examples.insert(input.clone(), expected.as_str().unwrap().to_string());
        }
        sections.insert(section.clone(), examples);
    }
    sections
}

/// Runs every "surt"-named fixture section through `generate_surt`
/// and checks the expected SURT output.
#[test]
fn test_surt() {
    for (section, examples) in load_test_data() {
        // Only sections whose name mentions "surt" (case-insensitive)
        // belong to this test.
        if !section.to_lowercase().contains("surt") {
            continue;
        }
        println!("Testing section: {}", section);

        for (input, expected) in examples {
            println!("Testing example: {}", input);
            assert_eq!(generate_surt(&input).unwrap(), expected);
        }
    }
}

/// Runs every "url_normalization" fixture section through
/// `normalize_url` and checks the canonicalised URL string.
#[test]
fn test_url_normalization() {
    let test_data = load_test_data();

    for (section, examples) in test_data {
        // Skip sections that are not URL-normalisation fixtures.
        if !section.to_lowercase().contains("url_normalization") {
            continue;
        }
        println!("Testing section: {}", section);

        for (input, expected) in examples {
            println!("Testing example: {}", input);
            // Parse once and reuse the result — the previous version
            // parsed the same input twice (once only to debug-print it).
            let parsed = Url::parse(&input).unwrap();
            assert_eq!(normalize_url(parsed), expected);
        }
    }
}

#[test]
fn test_generate_surt_with_valid_url() {
Expand Down Expand Up @@ -186,8 +277,8 @@ mod tests {

/// `normalize_url` should drop a `www.` subdomain while keeping the
/// https scheme and the root path's trailing slash.
#[test]
fn test_normalize_url_with_www_subdomain_and_https() {
    let parsed = Url::parse("https://www.example.com").unwrap();
    assert_eq!(normalize_url(parsed), "https://example.com/");
}

Expand Down
37 changes: 37 additions & 0 deletions test_data/surt.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"defaultIASurt": {
"http://www.archive.org/": "org,archive)/",
"http://archive.org/": "org,archive)/",
"http://archive.org/goo/": "org,archive)/goo",
"http://archive.org/goo/?": "org,archive)/goo",
"http://archive.org/goo/?b&a": "org,archive)/goo?a&b",
"http://archive.org/goo/?a=2&b&a=1": "org,archive)/goo?a=1&a=2&b",
"http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221": "org,archive)/index.php?action=profile;u=4221",
"http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2": "com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2"
},
"url_normalization.DefaultIA": {
"http://www.alexa.com/": "http://alexa.com/",
"http://archive.org/index.html": "http://archive.org/index.html",
"http://archive.org/index.html?": "http://archive.org/index.html",
"http://archive.org/index.html?a=b": "http://archive.org/index.html?a=b",
"http://archive.org/index.html?b=b&a=b": "http://archive.org/index.html?a=b&b=b",
"http://archive.org/index.html?b=a&b=b&a=b": "http://archive.org/index.html?a=b&b=a&b=b",
"http://www34.archive.org/index.html?b=a&b=b&a=b": "http://archive.org/index.html?a=b&b=a&b=b"
},
"url_normalization.IA": {
"http://ARCHIVE.ORG/": "http://archive.org/",
"http://www.archive.org:80/": "http://archive.org/",
"https://www.archive.org:80/": "https://archive.org:80/",
"http://www.archive.org:443/": "http://archive.org:443/",
"https://www.archive.org:443/": "https://archive.org/",
"http://www.archive.org/big/": "http://archive.org/big",
"dns:www.archive.org": "dns:www.archive.org"
},
"url_normalization.massageHost": {
"https://foo.com": "https://foo.com/",
"https://www.foo.com": "https://foo.com/",
"https://www12.foo.com": "https://foo.com/",
"https://www2foo.com": "https://www2foo.com/",
"https://www2.www2foo.com": "https://www2foo.com/"
}
}

0 comments on commit 126a979

Please sign in to comment.