Refactor and rename title/qid wrappers
- Move Qid and Title to separate modules
- Reformat benchmark

Signed-off-by: Evan Lloyd New-Schmidt <[email protected]>
newsch committed Aug 10, 2023
1 parent bdf6f1a commit 34bb931
Showing 6 changed files with 211 additions and 207 deletions.
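
For callers, the rename boils down to swapping the old fully-qualified wrapper names for the re-exported `Qid` and `Title` types. The sketch below is illustrative only and is not part of the commit; it assumes the `pub use` re-exports added to src/wm/mod.rs in the diff further down.

```rust
use std::str::FromStr;

use om_wikiparser::wm::{Qid, Title};
// Before this commit the same types were spelled
// om_wikiparser::wm::WikidataQid and om_wikiparser::wm::WikipediaTitleNorm.

fn main() -> anyhow::Result<()> {
    // Qid::from_str accepts "Q12345", "q12345", or a bare number.
    let qid = Qid::from_str("Q123456789")?;
    // Title::from_url normalizes a full article URL.
    let title = Title::from_url("https://en.wikipedia.org/wiki/Article_Title")?;
    println!("{qid} {title}");
    Ok(())
}
```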
19 changes: 9 additions & 10 deletions benches/id_parsing.rs
@@ -4,22 +4,21 @@ use std::{collections::HashSet, str::FromStr};
extern crate om_wikiparser;
extern crate test;

use om_wikiparser::wm::{Qid, Title};

const TITLE: &str = "https://en.wikipedia.org/wiki/Article_Title";
const QID: &str = "Q123456789";

#[bench]
fn parse_wikipedia(b: &mut test::Bencher) {
b.iter(|| {
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
"https://en.wikipedia.org/wiki/Article_Title",
)
.unwrap();
Title::from_url(TITLE).unwrap();
});
}

#[bench]
fn hash_wikipedia(b: &mut test::Bencher) {
let title = om_wikiparser::wm::WikipediaTitleNorm::from_url(
"https://en.wikipedia.org/wiki/Article_Title",
)
.unwrap();
let title = Title::from_url(TITLE).unwrap();
let mut set = HashSet::new();
b.iter(|| {
set.insert(&title);
@@ -29,13 +28,13 @@ fn hash_wikipedia(b: &mut test::Bencher) {
#[bench]
fn parse_wikidata(b: &mut test::Bencher) {
b.iter(|| {
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
Qid::from_str(QID).unwrap();
});
}

#[bench]
fn hash_wikidata(b: &mut test::Bencher) {
let qid = om_wikiparser::wm::WikidataQid::from_str("Q123456789").unwrap();
let qid = Qid::from_str(QID).unwrap();
let mut set = HashSet::new();
b.iter(|| {
set.insert(&qid);
6 changes: 3 additions & 3 deletions src/get_articles.rs
@@ -9,7 +9,7 @@ use anyhow::{anyhow, bail, Context};

use om_wikiparser::{
html::simplify,
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, WikipediaTitleNorm},
wm::{parse_osm_tag_file, parse_wikidata_file, parse_wikipedia_file, Page, Title},
};

/// Extract, filter, and simplify article HTML from Wikipedia Enterprise HTML dumps.
@@ -154,7 +154,7 @@ pub fn run(args: Args) -> anyhow::Result<()> {
fn create_article_dir(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
redirects: impl IntoIterator<Item = Title>,
) -> anyhow::Result<PathBuf> {
let base = base.as_ref();
let mut redirects = redirects.into_iter();
@@ -237,7 +237,7 @@ fn create_article_dir(
fn write(
base: impl AsRef<Path>,
page: &Page,
redirects: impl IntoIterator<Item = WikipediaTitleNorm>,
redirects: impl IntoIterator<Item = Title>,
) -> anyhow::Result<()> {
let article_dir = create_article_dir(base, page, redirects)?;

200 changes: 14 additions & 186 deletions src/wm/mod.rs
@@ -1,24 +1,23 @@
//! Wikimedia types
use std::{
collections::HashSet, ffi::OsStr, fmt::Display, fs, num::ParseIntError, path::PathBuf,
str::FromStr,
};
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};

use anyhow::{anyhow, bail, Context};

use url::Url;
use anyhow::{anyhow, Context};

mod page;
pub use page::Page;
mod title;
pub use title::*;
mod qid;
pub use qid::*;

/// Read from a file of urls on each line.
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<WikidataQid>> {
pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Qid>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikidataQid::from_str(line).with_context(|| {
Qid::from_str(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
@@ -34,15 +33,13 @@ pub fn parse_wikidata_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Wi
}

/// Read article titles from a file of urls on each line.
pub fn parse_wikipedia_file(
path: impl AsRef<OsStr>,
) -> anyhow::Result<HashSet<WikipediaTitleNorm>> {
pub fn parse_wikipedia_file(path: impl AsRef<OsStr>) -> anyhow::Result<HashSet<Title>> {
let contents = fs::read_to_string(path.as_ref())?;
Ok(contents
.lines()
.enumerate()
.map(|(i, line)| {
WikipediaTitleNorm::from_url(line).with_context(|| {
Title::from_url(line).with_context(|| {
let line_num = i + 1;
format!("on line {line_num}: {line:?}")
})
@@ -59,8 +56,8 @@ pub fn parse_wikipedia_file(

pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<WikidataQid>,
titles: &mut HashSet<WikipediaTitleNorm>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;
@@ -93,7 +90,7 @@ pub fn parse_osm_tag_file(

let qid = &row[qid_col].trim();
if !qid.is_empty() {
match WikidataQid::from_str(qid) {
match Qid::from_str(qid) {
Ok(qid) => {
qids.insert(qid);
}
@@ -109,7 +106,7 @@ pub fn parse_osm_tag_file(

let title = &row[title_col].trim();
if !title.is_empty() {
match WikipediaTitleNorm::from_osm_tag(title) {
match Title::from_osm_tag(title) {
Ok(title) => {
titles.insert(title);
}
@@ -126,172 +123,3 @@ pub fn parse_osm_tag_file(

Ok(())
}

/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
///
/// ```
/// use std::str::FromStr;
/// use om_wikiparser::wm::WikidataQid;
///
/// let with_q = WikidataQid::from_str("Q12345").unwrap();
/// let without_q = WikidataQid::from_str(" 12345 ").unwrap();
/// assert_eq!(with_q, without_q);
///
/// assert!(WikidataQid::from_str("q12345").is_ok());
/// assert!(WikidataQid::from_str("https://wikidata.org/wiki/Q12345").is_err());
/// assert!(WikidataQid::from_str("Article_Title").is_err());
/// assert!(WikidataQid::from_str("Q").is_err());
/// assert!(WikidataQid::from_str("").is_err());
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikidataQid(u32);

impl FromStr for WikidataQid {
type Err = ParseIntError;

fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.trim();
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
u32::from_str(s).map(WikidataQid)
}
}

impl Display for WikidataQid {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "Q{}", self.0)
}
}

impl WikidataQid {
pub fn get_dir(&self, base: PathBuf) -> PathBuf {
let mut path = base;
path.push("wikidata");
// TODO: can use as_mut_os_string with 1.70.0
path.push(self.to_string());

path
}
}

/// Normalized wikipedia article title that can compare:
/// - titles `Spatial Database`
/// - urls `https://en.wikipedia.org/wiki/Spatial_database#Geodatabase`
/// - osm-style tags `en:Spatial Database`
///
/// ```
/// use om_wikiparser::wm::WikipediaTitleNorm;
///
/// let title = WikipediaTitleNorm::from_title("Article Title", "en").unwrap();
/// let url = WikipediaTitleNorm::from_url("https://en.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let mobile = WikipediaTitleNorm::from_url("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag1 = WikipediaTitleNorm::from_osm_tag("https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// let url_tag2 = WikipediaTitleNorm::from_osm_tag("de:https://en.m.wikipedia.org/wiki/Article_Title#Section").unwrap();
/// assert_eq!(url, title);
/// assert_eq!(url, mobile);
/// assert_eq!(url, url_tag1);
/// assert_eq!(url, url_tag2);
///
/// assert!(WikipediaTitleNorm::from_url("https://en.wikipedia.org/not_a_wiki_page").is_err());
/// assert!(WikipediaTitleNorm::from_url("https://wikidata.org/wiki/Q12345").is_err());
///
/// assert!(
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil/Brigels").unwrap() !=
/// WikipediaTitleNorm::from_url("https://de.wikipedia.org/wiki/Breil").unwrap()
/// );
/// ```
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct WikipediaTitleNorm {
lang: String,
name: String,
}

impl Display for WikipediaTitleNorm {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{}:{}", self.lang, self.name)
}
}

impl WikipediaTitleNorm {
fn normalize_title(title: &str) -> String {
// TODO: Compare with map generator url creation, ensure covers all cases.
title.trim().replace(' ', "_")
}

// https://en.wikipedia.org/wiki/Article_Title/More_Title
pub fn from_url(url: &str) -> anyhow::Result<Self> {
let url = Url::parse(url.trim())?;

let (subdomain, host) = url
.host_str()
.ok_or_else(|| anyhow!("Expected host"))?
.split_once('.')
.ok_or_else(|| anyhow!("Expected subdomain"))?;
let host = host.strip_prefix("m.").unwrap_or(host);
if host != "wikipedia.org" {
bail!("Expected wikipedia.org for domain")
}
let lang = subdomain;

let path = url.path();

let (root, title) = path
.strip_prefix('/')
.unwrap_or(path)
.split_once('/')
.ok_or_else(|| anyhow!("Expected at least two segments in path"))?;

if root != "wiki" {
bail!("Expected 'wiki' as root path, got: {:?}", root)
}
let title = urlencoding::decode(title)?;

Self::from_title(&title, lang)
}

// en:Article Title
pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
let (lang, title) = tag
.trim()
.split_once(':')
.ok_or_else(|| anyhow!("Expected ':'"))?;

let lang = lang.trim_start();
let title = title.trim_start();

if matches!(lang, "http" | "https") {
return Self::from_url(tag);
}

if title.starts_with("http://") || title.starts_with("https://") {
return Self::from_url(title);
}

Self::from_title(title, lang)
}

pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
let title = title.trim();
let lang = lang.trim();
if title.is_empty() {
bail!("title cannot be empty or whitespace");
}
if lang.is_empty() {
bail!("lang cannot be empty or whitespace");
}
let name = Self::normalize_title(title);
let lang = lang.to_owned();
Ok(Self { name, lang })
}

pub fn get_dir(&self, base: PathBuf) -> PathBuf {
let mut path = base;
// TODO: can use as_mut_os_string with 1.70.0
path.push(format!("{}.wikipedia.org", self.lang));
path.push("wiki");
path.push(&self.name);

path
}
}
16 changes: 8 additions & 8 deletions src/wm/page.rs
@@ -2,7 +2,7 @@ use std::{iter, str::FromStr};

use serde::Deserialize;

use super::{WikidataQid, WikipediaTitleNorm};
use super::{Qid, Title};

// TODO: consolidate into single struct
/// Deserialized Wikimedia Enterprise API Article
@@ -25,27 +25,27 @@ pub struct Page {
}

impl Page {
pub fn wikidata(&self) -> Option<WikidataQid> {
pub fn wikidata(&self) -> Option<Qid> {
// TODO: return error
self.main_entity
.as_ref()
.map(|e| WikidataQid::from_str(&e.identifier).unwrap())
.map(|e| Qid::from_str(&e.identifier).unwrap())
}

/// Title of the article
pub fn title(&self) -> anyhow::Result<WikipediaTitleNorm> {
WikipediaTitleNorm::from_title(&self.name, &self.in_language.identifier)
pub fn title(&self) -> anyhow::Result<Title> {
Title::from_title(&self.name, &self.in_language.identifier)
}

/// All titles that lead to the article, the main title followed by any redirects.
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
pub fn all_titles(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
iter::once(self.title()).chain(self.redirects())
}

pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<WikipediaTitleNorm>> + '_ {
pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
self.redirects
.iter()
.map(|r| WikipediaTitleNorm::from_title(&r.name, &self.in_language.identifier))
.map(|r| Title::from_title(&r.name, &self.in_language.identifier))
}
}

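The diffs for the two new modules declared in src/wm/mod.rs (`mod qid;` and `mod title;`) are not rendered above. As a hedged reconstruction only, the new qid module (presumably src/wm/qid.rs) would look roughly like the following, assuming the code removed from src/wm/mod.rs was moved verbatim apart from the rename to `Qid`; the title module would follow the same pattern for the former WikipediaTitleNorm type.

```rust
//! Hypothetical reconstruction of src/wm/qid.rs for illustration; the actual
//! file is not shown on this page and may differ.
use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};

/// Wikidata QID/Q Number
///
/// See https://www.wikidata.org/wiki/Wikidata:Glossary#QID
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct Qid(u32);

impl FromStr for Qid {
    type Err = ParseIntError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Accept "Q12345", "q12345", or a bare number, with surrounding whitespace.
        let s = s.trim();
        let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
        u32::from_str(s).map(Qid)
    }
}

impl Display for Qid {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "Q{}", self.0)
    }
}

impl Qid {
    /// Directory for this QID under `base`, e.g. `<base>/wikidata/Q12345`.
    pub fn get_dir(&self, base: PathBuf) -> PathBuf {
        let mut path = base;
        path.push("wikidata");
        path.push(self.to_string());
        path
    }
}
```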
