Skip to content

Commit

Permalink
Improve url handling
Browse files Browse the repository at this point in the history
- Check for urls in osm tags
- Handle mobile urls

Signed-off-by: Evan Lloyd New-Schmidt <[email protected]>
  • Loading branch information
newsch committed Aug 10, 2023
1 parent 6d242a6 commit bdf6f1a
Showing 1 changed file with 26 additions and 2 deletions.
28 changes: 26 additions & 2 deletions src/wm/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ pub fn parse_osm_tag_file(

let title = &row[title_col].trim();
if !title.is_empty() {
match WikipediaTitleNorm::_from_osm_tag(title) {
match WikipediaTitleNorm::from_osm_tag(title) {
Ok(title) => {
titles.insert(title);
}
Expand Down Expand Up @@ -185,7 +185,13 @@ impl WikidataQid {
///
/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let mobile = WikipediaTitleNorm::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag1 = WikipediaTitleNorm::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag2 = WikipediaTitleNorm::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// assert_eq!(url, title);
/// assert_eq!(url, mobile);
/// assert_eq!(url, url_tag1);
/// assert_eq!(url, url_tag2);
///
/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
Expand All @@ -201,6 +207,12 @@ pub struct WikipediaTitleNorm {
name: String,
}

impl Display for WikipediaTitleNorm {
    /// Writes the title as `<lang>:<name>`, i.e. the language field, a
    /// colon, then the name field, each rendered with its own `Display`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{lang}:{name}", lang = self.lang, name = self.name)
    }
}

impl WikipediaTitleNorm {
fn normalize_title(title: &str) -> String {
// TODO: Compare with map generator url creation, ensure covers all cases.
Expand All @@ -216,6 +228,7 @@ impl WikipediaTitleNorm {
.ok_or_else(|| anyhow!("Expected host"))?
.split_once('.')
.ok_or_else(|| anyhow!("Expected subdomain"))?;
let host = host.strip_prefix("m.").unwrap_or(host);
if host != "wikipedia.org" {
bail!("Expected wikipedia.org for domain")
}
Expand All @@ -238,12 +251,23 @@ impl WikipediaTitleNorm {
}

// en:Article Title
fn _from_osm_tag(tag: &str) -> anyhow::Result<Self> {
/// Builds a normalized title from the value of an OSM tag.
///
/// The trimmed tag is split at its first `':'`. Three shapes are handled:
///
/// * the part before the colon is exactly `http` or `https`: the whole
///   `tag` is handed to [`Self::from_url`];
/// * the part after the colon starts with `http://` or `https://`: only
///   that part is handed to [`Self::from_url`], so the part before the
///   colon is ignored;
/// * anything else: the two parts are passed to [`Self::from_title`] as
///   `(title, lang)`.
///
/// # Errors
///
/// Returns `Expected ':'` when the tag contains no colon; otherwise
/// forwards whatever `from_url` / `from_title` return.
pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
    // Leading whitespace is dropped from both halves before any comparison.
    let (prefix, rest) = match tag.trim().split_once(':') {
        Some((before, after)) => (before.trim_start(), after.trim_start()),
        None => return Err(anyhow!("Expected ':'")),
    };

    let rest_is_url = ["http://", "https://"]
        .iter()
        .any(|scheme| rest.starts_with(*scheme));

    match prefix {
        // The colon we split on belonged to the URL scheme itself.
        "http" | "https" => Self::from_url(tag),
        // A URL follows the colon: parse it and ignore the prefix.
        _ if rest_is_url => Self::from_url(rest),
        _ => Self::from_title(rest, prefix),
    }
}

Expand Down

0 comments on commit bdf6f1a

Please sign in to comment.