diff --git a/benches/id_parsing.rs b/benches/id_parsing.rs
index 57b93fc..df55c9e 100644
--- a/benches/id_parsing.rs
+++ b/benches/id_parsing.rs
@@ -4,22 +4,21 @@ use std::{collections::HashSet, str::FromStr};
 extern crate om_wikiparser;
 extern crate test;
 
+use om_wikiparser::wm::{Qid, Title};
+
+const TITLE: &str = "https://en.wikipedia.org/wiki/Article_Title";
+const QID: &str = "Q123456789";
+
 #[bench]
 fn parse_wikipedia(b: &mut test::Bencher) {
     b.iter(|| {
-        let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
-            "https://en.wikipedia.org/wiki/Article_Title",
-        )
-        .unwrap();
+        Title::from_url(TITLE).unwrap();
     });
 }
 
 #[bench]
 fn hash_wikipedia(b: &mut test::Bencher) {
-    let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
-        "https://en.wikipedia.org/wiki/Article_Title",
-    )
-    .unwrap();
+    let title = Title::from_url(TITLE).unwrap();
     let mut set = HashSet::new();
     b.iter(|| {
         set.insert(&title);
@@ -29,13 +28,13 @@ fn hash_wikipedia(b: &mut test::Bencher) {
 #[bench]
 fn parse_wikidata(b: &mut test::Bencher) {
     b.iter(|| {
-        let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
+        Qid::from_str(QID).unwrap();
     });
 }
 
 #[bench]
 fn hash_wikidata(b: &mut test::Bencher) {
-    let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
+    let qid = Qid::from_str(QID).unwrap();
     let mut set = HashSet::new();
     b.iter(|| {
         set.insert(&qid);
diff --git a/src/get_articles.rs b/src/get_articles.rs
index 1e5b7a2..b0cbdab 100644
--- a/src/get_articles.rs
+++ b/src/get_articles.rs
@@ -9,7 +9,7 @@ use anyhow::{anyhow, bail, Context};
 
 use om_wikiparser::{
     html::simplify,
-    wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
+    wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
 };
 
 /// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
@@ -154,7 +154,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
 fn create_article_dir(
     base: impl AsRef<Path>,
     page: &Page,
-    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+    redirects: impl IntoIterator<Item = Title>,
 ) -> anyhow::Result<PathBuf> {
     let base = base.as_ref();
     let mut redirects = redirects.into_iter();
@@ -237,7 +237,7 @@ fn create_article_dir(
 fn write(
     base: impl AsRef<Path>,
     page: &Page,
-    redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
+    redirects: impl IntoIterator<Item = Title>,
 ) -> anyhow::Result<()> {
     let article_dir = create_article_dir(base, page, redirects)?;
 
diff --git a/src/wm/mod.rs b/src/wm/mod.rs
index c2207cf..a78167d 100644
--- a/src/wm/mod.rs
+++ b/src/wm/mod.rs
@@ -1,24 +1,23 @@
 //! Wikimedia types
-use std::{
-    collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
-    str::FromStr,
-};
+use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
 
-use anyhow::{anyhow, bail, Context};
-
-use url::Url;
+use anyhow::{anyhow, Context};
 
 mod page;
 pub use page::Page;
+mod title;
+pub use title::*;
+mod qid;
+pub use qid::*;
 
 /// Read from a file of urls on each line.
-pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
+pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
     let contents = fs::read_to_string(path.as_ref())?;
     Ok(contents
         .lines()
         .enumerate()
         .map(|(i, line)| {
-            WikidataQid::from_str(line).with_context(|| {
+            Qid::from_str(line).with_context(|| {
                 let line_num = i + 1;
                 format!("on line {line_num}: {line:?}")
             })
@@ -34,15 +33,13 @@ pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<
 }
 
 /// Read from a file of urls on each line.
-pub fn parse_wikipedia_file(
-    path: impl AsRef<OsStr>,
-) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
+pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
     let contents = fs::read_to_string(path.as_ref())?;
     Ok(contents
         .lines()
         .enumerate()
         .map(|(i, line)| {
-            WikipediaTitleNorm::from_url(line).with_context(|| {
+            Title::from_url(line).with_context(|| {
                 let line_num = i + 1;
                 format!("on line {line_num}: {line:?}")
             })
@@ -59,8 +56,8 @@ pub fn parse_wikipedia_file(
 
 pub fn parse_osm_tag_file(
     path: impl AsRef<OsStr>,
-    qids: &mut HashSet<WikidataQid>,
-    titles: &mut HashSet<WikipediaTitleNorm>,
+    qids: &mut HashSet<Qid>,
+    titles: &mut HashSet<Title>,
 ) -> anyhow::Result<()> {
     let path = path.as_ref();
     let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
@@ -93,7 +90,7 @@ pub fn parse_osm_tag_file(
 
         let qid = &row[qid_col].trim();
         if !qid.is_empty() {
-            match WikidataQid::from_str(qid) {
+            match Qid::from_str(qid) {
                 Ok(qid) => {
                     qids.insert(qid);
                 }
@@ -109,7 +106,7 @@ pub fn parse_osm_tag_file(
 
         let title = &row[title_col].trim();
         if !title.is_empty() {
-            match WikipediaTitleNorm::from_osm_tag(title) {
+            match Title::from_osm_tag(title) {
                 Ok(title) => {
                     titles.insert(title);
                 }
@@ -126,172 +123,3 @@ pub fn parse_osm_tag_file(
 
     Ok(())
 }
-
-/// Wikidata QID/Q Number
-///
-/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
-///
-/// ```
-/// use std::str::FromStr;
-/// use om_wikiparser::wm::WikidataQid;
-///
-/// let with_q = WikidataQid::from_str("Q12345").unwrap();
-/// let without_q = WikidataQid::from_str(" 12345 ").unwrap();
-/// assert_eq!(with_q, without_q);
-///
-/// assert!(WikidataQid::from_str("q12345").is_ok());
-/// assert!(WikidataQid::from_str("https://wikidata.org/wiki/Q12345").is_err());
-/// assert!(WikidataQid::from_str("Article_Title").is_err());
-/// assert!(WikidataQid::from_str("Q").is_err());
-/// assert!(WikidataQid::from_str("").is_err());
-/// ```
-#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
-pub struct WikidataQid(u32);
-
-impl FromStr for WikidataQid {
-    type Err = ParseIntError;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        let s = s.trim();
-        let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
-        u32::from_str(s).map(WikidataQid)
-    }
-}
-
-impl Display for WikidataQid {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "Q{}", self.0)
-    }
-}
-
-impl WikidataQid {
-    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
-        let mut path = base;
-        path.push("wikidata");
-        // TODO: can use as_mut_os_string with 1.70.0
-        path.push(self.to_string());
-
-        path
-    }
-}
-
-/// Normalized wikipedia article title that can compare:
-/// - titles `Spatial Database`
-/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
-/// - osm-style tags `en:Spatial Database`
-///
-/// ```
-/// use om_wikiparser::wm::WikipediaTitleNorm;
-///
-/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
-/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
-/// let mobile = WikipediaTitleNorm::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
-/// let url_tag1 = WikipediaTitleNorm::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
-/// let url_tag2 = WikipediaTitleNorm::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
-/// assert_eq!(url, title);
-/// assert_eq!(url, mobile);
-/// assert_eq!(url, url_tag1);
-/// assert_eq!(url, url_tag2);
-///
-/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
-/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
-///
-/// assert!(
-///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
-///     WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
-/// );
-/// ```
-#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
-pub struct WikipediaTitleNorm {
-    lang: String,
-    name: String,
-}
-
-impl Display for WikipediaTitleNorm {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}:{}", self.lang, self.name)
-    }
-}
-
-impl WikipediaTitleNorm {
-    fn normalize_title(title: &str) -> String {
-        // TODO: Compare with map generator url creation, ensure covers all cases.
-        title.trim().replace(' ', "_")
-    }
-
-    // https://en.wikipedia.org/wiki/Article_Title/More_Title
-    pub fn from_url(url: &str) -> anyhow::Result<Self> {
-        let url = Url::parse(url.trim())?;
-
-        let (subdomain, host) = url
-            .host_str()
-            .ok_or_else(|| anyhow!("Expected host"))?
-            .split_once('.')
-            .ok_or_else(|| anyhow!("Expected subdomain"))?;
-        let host = host.strip_prefix("m.").unwrap_or(host);
-        if host != "wikipedia.org" {
-            bail!("Expected wikipedia.org for domain")
-        }
-        let lang = subdomain;
-
-        let path = url.path();
-
-        let (root, title) = path
-            .strip_prefix('/')
-            .unwrap_or(path)
-            .split_once('/')
-            .ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
-
-        if root != "wiki" {
-            bail!("Expected 'wiki' as root path, got: {:?}", root)
-        }
-        let title = urlencoding::decode(title)?;
-
-        Self::from_title(&title, lang)
-    }
-
-    // en:Article Title
-    pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
-        let (lang, title) = tag
-            .trim()
-            .split_once(':')
-            .ok_or_else(|| anyhow!("Expected ':'"))?;
-
-        let lang = lang.trim_start();
-        let title = title.trim_start();
-
-        if matches!(lang, "http" | "https") {
-            return Self::from_url(tag);
-        }
-
-        if title.starts_with("http://") || title.starts_with("https://") {
-            return Self::from_url(title);
-        }
-
-        Self::from_title(title, lang)
-    }
-
-    pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
-        let title = title.trim();
-        let lang = lang.trim();
-        if title.is_empty() {
-            bail!("title cannot be empty or whitespace");
-        }
-        if lang.is_empty() {
-            bail!("lang cannot be empty or whitespace");
-        }
-        let name = Self::normalize_title(title);
-        let lang = lang.to_owned();
-        Ok(Self { name, lang })
-    }
-
-    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
-        let mut path = base;
-        // TODO: can use as_mut_os_string with 1.70.0
-        path.push(format!("{}.wikipedia.org", self.lang));
-        path.push("wiki");
-        path.push(&self.name);
-
-        path
-    }
-}
diff --git a/src/wm/page.rs b/src/wm/page.rs
index 85b6647..c77cf87 100644
--- a/src/wm/page.rs
+++ b/src/wm/page.rs
@@ -2,7 +2,7 @@ use std::{iter, str::FromStr};
 
 use serde::Deserialize;
 
-use super::{WikidataQid, WikipediaTitleNorm};
+use super::{Qid, Title};
 
 // TODO: consolidate into single struct
 /// Deserialized Wikimedia Enterprise API Article
@@ -25,27 +25,27 @@ pub struct Page {
 }
 
 impl Page {
-    pub fn wikidata(&self) -> Option<WikidataQid> {
+    pub fn wikidata(&self) -> Option<Qid> {
         // TODO: return error
         self.main_entity
             .as_ref()
-            .map(|e| WikidataQid::from_str(&e.identifier).unwrap())
+            .map(|e| Qid::from_str(&e.identifier).unwrap())
     }
 
     /// Title of the article
-    pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
-        WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
+    pub fn title(&self) -> anyhow::Result<Title> {
+        Title::from_title(&self.name, &self.in_language.identifier)
     }
 
     /// All titles that lead to the article, the main title followed by any redirects.
-    pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+    pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
         iter::once(self.title()).chain(self.redirects())
     }
 
-    pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
+    pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
         self.redirects
             .iter()
-            .map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
+            .map(|r| Title::from_title(&r.name, &self.in_language.identifier))
     }
 }
diff --git a/src/wm/qid.rs b/src/wm/qid.rs
new file mode 100644
index 0000000..29fc7d3
--- /dev/null
+++ b/src/wm/qid.rs
@@ -0,0 +1,51 @@
+use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
+
+/// Wikidata QID/Q Number
+///
+/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
+///
+/// ```
+/// use std::str::FromStr;
+/// use om_wikiparser::wm::Qid;
+///
+/// let with_q = Qid::from_str("Q12345").unwrap();
+/// let without_q = Qid::from_str(" 12345 ").unwrap();
+/// assert_eq!(with_q, without_q);
+///
+/// assert!(Qid::from_str("q12345").is_ok());
+/// assert!(Qid::from_str("https://wikidata.org/wiki/Q12345").is_err());
+/// assert!(Qid::from_str("Article_Title").is_err());
+/// assert!(Qid::from_str("Q").is_err());
+/// assert!(Qid::from_str("").is_err());
+/// ```
+#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
+pub struct Qid(u32);
+
+pub type ParseQidError = ParseIntError;
+
+impl FromStr for Qid {
+    type Err = ParseQidError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let s = s.trim();
+        let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
+        u32::from_str(s).map(Qid)
+    }
+}
+
+impl Display for Qid {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Q{}", self.0)
+    }
+}
+
+impl Qid {
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        path.push("wikidata");
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(self.to_string());
+
+        path
+    }
+}
diff --git a/src/wm/title.rs b/src/wm/title.rs
new file mode 100644
index 0000000..e06dee0
--- /dev/null
+++ b/src/wm/title.rs
@@ -0,0 +1,126 @@
+use std::{fmt::Display, path::PathBuf};
+
+use anyhow::{anyhow, bail};
+
+use url::Url;
+
+/// Normalized wikipedia article title that can compare:
+/// - titles `Spatial Database`
+/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
+/// - osm-style tags `en:Spatial Database`
+///
+/// ```
+/// use om_wikiparser::wm::Title;
+///
+/// let title = Title::from_title("Article Title", "en").unwrap();
+/// let url = Title::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
+/// let mobile = Title::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
+/// let url_tag1 = Title::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
+/// let url_tag2 = Title::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
+/// assert_eq!(url, title);
+/// assert_eq!(url, mobile);
+/// assert_eq!(url, url_tag1);
+/// assert_eq!(url, url_tag2);
+///
+/// assert!(Title::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
+/// assert!(Title::from_url("https://wikidata.org/wiki/Q12345").is_err());
+///
+/// assert!(
+///     Title::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
+///     Title::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
+/// );
+/// ```
+#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
+pub struct Title {
+    lang: String,
+    name: String,
+}
+
+impl Display for Title {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}:{}", self.lang, self.name)
+    }
+}
+
+impl Title {
+    fn normalize_title(title: &str) -> String {
+        // TODO: Compare with map generator url creation, ensure covers all cases.
+        title.trim().replace(' ', "_")
+    }
+
+    // https://en.wikipedia.org/wiki/Article_Title/More_Title
+    pub fn from_url(url: &str) -> anyhow::Result<Self> {
+        let url = Url::parse(url.trim())?;
+
+        let (subdomain, host) = url
+            .host_str()
+            .ok_or_else(|| anyhow!("Expected host"))?
+            .split_once('.')
+            .ok_or_else(|| anyhow!("Expected subdomain"))?;
+        let host = host.strip_prefix("m.").unwrap_or(host);
+        if host != "wikipedia.org" {
+            bail!("Expected wikipedia.org for domain")
+        }
+        let lang = subdomain;
+
+        let path = url.path();
+
+        let (root, title) = path
+            .strip_prefix('/')
+            .unwrap_or(path)
+            .split_once('/')
+            .ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
+
+        if root != "wiki" {
+            bail!("Expected 'wiki' as root path, got: {:?}", root)
+        }
+        let title = urlencoding::decode(title)?;
+
+        Self::from_title(&title, lang)
+    }
+
+    // en:Article Title
+    pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
+        let (lang, title) = tag
+            .trim()
+            .split_once(':')
+            .ok_or_else(|| anyhow!("Expected ':'"))?;
+
+        let lang = lang.trim_start();
+        let title = title.trim_start();
+
+        if matches!(lang, "http" | "https") {
+            return Self::from_url(tag);
+        }
+
+        if title.starts_with("http://") || title.starts_with("https://") {
+            return Self::from_url(title);
+        }
+
+        Self::from_title(title, lang)
+    }
+
+    pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
+        let title = title.trim();
+        let lang = lang.trim();
+        if title.is_empty() {
+            bail!("title cannot be empty or whitespace");
+        }
+        if lang.is_empty() {
+            bail!("lang cannot be empty or whitespace");
+        }
+        let name = Self::normalize_title(title);
+        let lang = lang.to_owned();
+        Ok(Self { name, lang })
+    }
+
+    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
+        let mut path = base;
+        // TODO: can use as_mut_os_string with 1.70.0
+        path.push(format!("{}.wikipedia.org", self.lang));
+        path.push("wiki");
+        path.push(&self.name);
+
+        path
+    }
+}
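
Reviewer note, not part of the patch: a minimal usage sketch of the renamed API, assuming the `pub use qid::*;` / `pub use title::*;` re-exports from `src/wm/mod.rs` above. The `descriptions` base directory is a hypothetical example path, and the path comparisons assume Unix-style separators.

```rust
use std::{path::PathBuf, str::FromStr};

use om_wikiparser::wm::{Qid, Title};

fn main() -> anyhow::Result<()> {
    // `Qid` still parses with or without the leading `Q`, per the doctests.
    let qid = Qid::from_str("Q12345")?;
    assert_eq!(qid, Qid::from_str(" 12345 ")?);

    // `Title` still normalizes plain titles, urls, and osm-style tags to one value.
    let title = Title::from_title("Article Title", "en")?;
    assert_eq!(title, Title::from_url("https://en.wikipedia.org/wiki/Article_Title#Section")?);
    assert_eq!(title, Title::from_osm_tag("en:Article Title")?);

    // Both types keep their on-disk directory mapping via `get_dir`.
    let base = PathBuf::from("descriptions"); // hypothetical output directory
    assert_eq!(qid.get_dir(base.clone()), PathBuf::from("descriptions/wikidata/Q12345"));
    assert_eq!(
        title.get_dir(base),
        PathBuf::from("descriptions/en.wikipedia.org/wiki/Article_Title")
    );
    Ok(())
}
```

The shortened names keep call sites terse now that the `wm` module path already says "Wikimedia", and the `pub type ParseQidError = ParseIntError;` alias in `src/wm/qid.rs` presumably leaves room to swap in a dedicated error type later without breaking `FromStr` users.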