Refactor and rename title/qid wrappers
- Move Qid and Title to separate modules
- Reformat benchmark

Signed-off-by: Evan Lloyd New-Schmidt <[email protected]>
newsch committed Aug 10, 2023
1 parent bdf6f1a commit 34bb931
Showing 6 changed files with 211 additions and 207 deletions.
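
For callers, the rename boils down to swapping the old fully-qualified wrapper names for the re-exported `Qid` and `Title` types. The sketch below is illustrative only and is not part of the commit; it assumes the `pub use` re-exports added to src/wm/mod.rs in the diff further down.

```rust
use std::str::FromStr;

use om_wikiparser::wm::{Qid, Title};
// Before this commit the same types were spelled
// om_wikiparser::wm::WikidataQid and om_wikiparser::wm::WikipediaTitleNorm.

fn main() -> anyhow::Result<()> {
    // Qid::from_str accepts "Q12345", "q12345", or a bare number.
    let qid = Qid::from_str("Q123456789")?;
    // Title::from_url normalizes a full article URL.
    let title = Title::from_url("https://en.wikipedia.org/wiki/Article_Title")?;
    println!("{qid} {title}");
    Ok(())
}
```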
19 changes: 9 additions & 10 deletions benches/id_parsing.rs
@@ -4,22 +4,21 @@ use std::{collections::HashSet, str::FromStr};
extern crate om_wikiparser;
extern crate test;

use om_wikiparser::wm::{Qid, Title};

const TITLE: &str = "https://en.wikipedia.org/wiki/Article_Title";
const QID: &str = "Q123456789";

#[bench]
fn parse_wikipedia(b: &mut test::Bencher) {
b.iter(|| {
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
"https://en.wikipedia.org/wiki/Article_Title",
)
.unwrap();
Title::from_url(TITLE).unwrap();
});
}

#[bench]
fn hash_wikipedia(b: &mut test::Bencher) {
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
"https://en.wikipedia.org/wiki/Article_Title",
)
.unwrap();
let title = Title::from_url(TITLE).unwrap();
let mut set = HashSet::new();
b.iter(|| {
set.insert(&title);
@@ -29,13 +28,13 @@ fn hash_wikipedia(b: &mut test::Bencher) {
#[bench]
fn parse_wikidata(b: &mut test::Bencher) {
b.iter(|| {
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
Qid::from_str(QID).unwrap();
});
}

#[bench]
fn hash_wikidata(b: &mut test::Bencher) {
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
let qid = Qid::from_str(QID).unwrap();
let mut set = HashSet::new();
b.iter(|| {
set.insert(&qid);
6 changes: 3 additions & 3 deletions src/get_articles.rs
@@ -9,7 +9,7 @@ use anyhow::{anyhow, bail, Context};

use om_wikiparser::{
html::simplify,
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
};

/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
@@ -154,7 +154,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
fn create_article_dir(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
redirects: impl IntoIterator<Item = Title>,
) -> anyhow::Result<PathBuf> {
let base = base.as_ref();
let mut redirects = redirects.into_iter();
@@ -237,7 +237,7 @@ fn create_article_dir(
fn write(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
redirects: impl IntoIterator<Item = Title>,
) -> anyhow::Result<()> {
let article_dir = create_article_dir(base, page, redirects)?;

200 changes: 14 additions & 186 deletions src/wm/mod.rs
@@ -1,24 +1,23 @@
//! Wikimedia types
use std::{
collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
str::FromStr,
};
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};

use anyhow::{anyhow, bail, Context};

use url::Url;
use anyhow::{anyhow, Context};

mod page;
pub use page::Page;
mod title;
pub use title::*;
mod qid;
pub use qid::*;

/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikidataQid::from_str(line).with_context(|| {
Qid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
@@ -34,15 +33,13 @@ pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Wi
}

/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(
path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikipediaTitleNorm::from_url(line).with_context(|| {
Title::from_url(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
@@ -59,8 +56,8 @@ pub fn parse_wikipedia_file(

pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<WikidataQid>,
titles: &mut HashSet<WikipediaTitleNorm>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
@@ -93,7 +90,7 @@ pub fn parse_osm_tag_file(

let qid = &row[qid_col].trim();
if !qid.is_empty() {
match WikidataQid::from_str(qid) {
match Qid::from_str(qid) {
Ok(qid) => {
qids.insert(qid);
}
@@ -109,7 +106,7 @@ pub fn parse_osm_tag_file(

let title = &row[title_col].trim();
if !title.is_empty() {
match WikipediaTitleNorm::from_osm_tag(title) {
match Title::from_osm_tag(title) {
Ok(title) => {
titles.insert(title);
}
@@ -126,172 +123,3 @@ pub fn parse_osm_tag_file(

Ok(())
}

/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
///
/// ```
/// use std::str::FromStr;
/// use om_wikiparser::wm::WikidataQid;
///
/// let with_q = WikidataQid::from_str("Q12345").unwrap();
/// let without_q = WikidataQid::from_str(" 12345 ").unwrap();
/// assert_eq!(with_q, without_q);
///
/// assert!(WikidataQid::from_str("q12345").is_ok());
/// assert!(WikidataQid::from_str("https://wikidata.org/wiki/Q12345").is_err());
/// assert!(WikidataQid::from_str("Article_Title").is_err());
/// assert!(WikidataQid::from_str("Q").is_err());
/// assert!(WikidataQid::from_str("").is_err());
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikidataQid(u32);

impl FromStr for WikidataQid {
type Err = ParseIntError;

fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.trim();
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
u32::from_str(s).map(WikidataQid)
}
}

impl Display for WikidataQid {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Q{}", self.0)
}
}

impl WikidataQid {
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
let mut path = base;
path.push("wikidata");
// TODO: can use as_mut_os_string with 1.70.0
path.push(self.to_string());

path
}
}

/// Normalized wikipedia article title that can compare:
/// - titles `Spatial Database`
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
/// - osm-style tags `en:Spatial Database`
///
/// ```
/// use om_wikiparser::wm::WikipediaTitleNorm;
///
/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let mobile = WikipediaTitleNorm::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag1 = WikipediaTitleNorm::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag2 = WikipediaTitleNorm::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// assert_eq!(url, title);
/// assert_eq!(url, mobile);
/// assert_eq!(url, url_tag1);
/// assert_eq!(url, url_tag2);
///
/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
///
/// assert!(
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
/// );
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikipediaTitleNorm {
lang: String,
name: String,
}

impl Display for WikipediaTitleNorm {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}:{}", self.lang, self.name)
}
}

impl WikipediaTitleNorm {
fn normalize_title(title: &str) -> String {
// TODO: Compare with map generator url creation, ensure covers all cases.
title.trim().replace(' ', "_")
}

// https://en.wikipedia.org/wiki/Article_Title/More_Title
pub fn from_url(url: &str) -> anyhow::Result<Self> {
let url = Url::parse(url.trim())?;

let (subdomain, host) = url
.host_str()
.ok_or_else(|| anyhow!("Expected host"))?
.split_once('.')
.ok_or_else(|| anyhow!("Expected subdomain"))?;
let host = host.strip_prefix("m.").unwrap_or(host);
if host != "wikipedia.org" {
bail!("Expected wikipedia.org for domain")
}
let lang = subdomain;

let path = url.path();

let (root, title) = path
.strip_prefix('/')
.unwrap_or(path)
.split_once('/')
.ok_or_else(|| anyhow!("Expected at least two segments in path"))?;

if root != "wiki" {
bail!("Expected 'wiki' as root path, got: {:?}", root)
}
let title = urlencoding::decode(title)?;

Self::from_title(&title, lang)
}

// en:Article Title
pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
let (lang, title) = tag
.trim()
.split_once(':')
.ok_or_else(|| anyhow!("Expected ':'"))?;

let lang = lang.trim_start();
let title = title.trim_start();

if matches!(lang, "http" | "https") {
return Self::from_url(tag);
}

if title.starts_with("http://") || title.starts_with("https://") {
return Self::from_url(title);
}

Self::from_title(title, lang)
}

pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
let title = title.trim();
let lang = lang.trim();
if title.is_empty() {
bail!("title cannot be empty or whitespace");
}
if lang.is_empty() {
bail!("lang cannot be empty or whitespace");
}
let name = Self::normalize_title(title);
let lang = lang.to_owned();
Ok(Self { name, lang })
}

pub fn get_dir(&self, base: PathBuf) -> PathBuf {
let mut path = base;
// TODO: can use as_mut_os_string with 1.70.0
path.push(format!("{}.wikipedia.org", self.lang));
path.push("wiki");
path.push(&self.name);

path
}
}
16 changes: 8 additions & 8 deletions src/wm/page.rs
@@ -2,7 +2,7 @@ use std::{iter, str::FromStr};

use serde::Deserialize;

use super::{WikidataQid, WikipediaTitleNorm};
use super::{Qid, Title};

// TODO: consolidate into single struct
/// Deserialized Wikimedia Enterprise API Article
@@ -25,27 +25,27 @@ pub struct Page {
}

impl Page {
pub fn wikidata(&self) -> Option<WikidataQid> {
pub fn wikidata(&self) -> Option<Qid> {
// TODO: return error
self.main_entity
.as_ref()
.map(|e| WikidataQid::from_str(&e.identifier).unwrap())
.map(|e| Qid::from_str(&e.identifier).unwrap())
}

/// Title of the article
pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
pub fn title(&self) -> anyhow::Result<Title> {
Title::from_title(&self.name, &self.in_language.identifier)
}

/// All titles that lead to the article, the main title followed by any redirects.
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
iter::once(self.title()).chain(self.redirects())
}

pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
self.redirects
.iter()
.map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
.map(|r| Title::from_title(&r.name, &self.in_language.identifier))
}
}

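The diffs for the two new modules declared in src/wm/mod.rs (`mod qid;` and `mod title;`) are not rendered above. As a hedged reconstruction only, the new qid module (presumably src/wm/qid.rs) would look roughly like the following, assuming the code removed from src/wm/mod.rs was moved verbatim apart from the rename to `Qid`; the title module would follow the same pattern for the former WikipediaTitleNorm type.

```rust
//! Hypothetical reconstruction of src/wm/qid.rs for illustration; the actual
//! file is not shown on this page and may differ.
use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};

/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct Qid(u32);

impl FromStr for Qid {
    type Err = ParseIntError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Accept "Q12345", "q12345", or a bare number, with surrounding whitespace.
        let s = s.trim();
        let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
        u32::from_str(s).map(Qid)
    }
}

impl Display for Qid {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Q{}", self.0)
    }
}

impl Qid {
    /// Directory for this QID under `base`, e.g. `<base>/wikidata/Q12345`.
    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
        let mut path = base;
        path.push("wikidata");
        path.push(self.to_string());
        path
    }
}
```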
