Skip to content

Commit

Permalink
Structure parse errors and only log warning if above threshold
Browse files Browse the repository at this point in the history
- Add custom error types with `thiserror` crate in preparation for #25.
- Parsing errors are captured instead of logged to `warn` by default.
    - All parsing errors are still logged to `debug` level.
    - If >= 0.02% of tags can't be parsed, an error is logged.
    - TSV line errors are always logged as errors.
    - I/O errors will fail instead of be logged.

Signed-off-by: Evan Lloyd New-Schmidt <[email protected]>
  • Loading branch information
newsch committed Aug 10, 2023
1 parent 34bb931 commit 941d2b1
Show file tree
Hide file tree
Showing 7 changed files with 155 additions and 40 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ rayon = "1.7.0"
scraper = "0.16.0"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
thiserror = "1.0.44"
url = "2.3.1"
urlencoding = "2.1.2"

Expand Down
29 changes: 27 additions & 2 deletions src/get_articles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,34 @@ pub fn run(args: Args) -> anyhow::Result<()> {
Default::default()
};

if let Some(path) = args.osm_tags {
if let Some(ref path) = args.osm_tags {
info!("Loading wikipedia/wikidata osm tags from {path:?}");
parse_osm_tag_file(path, &mut wikidata_qids, &mut wikipedia_titles)?;

let original_items = wikidata_qids.len() + wikipedia_titles.len();
let mut line_errors = Vec::new();
parse_osm_tag_file(
path,
&mut wikidata_qids,
&mut wikipedia_titles,
Some(&mut line_errors),
)?;

if !line_errors.is_empty() {
let error_count = line_errors.len();
let new_items = wikidata_qids.len() + wikipedia_titles.len() - original_items;
let expected_threshold = 0.02;
let percentage = 100.0 * error_count as f64 / new_items as f64;
let level = if percentage >= expected_threshold {
log::Level::Error
} else {
log::Level::Info
};

log!(
level,
"{error_count} errors ({percentage:.4}%) parsing osm tags from {path:?}",
);
}
}

debug!("Parsed {} unique article titles", wikipedia_titles.len());
Expand Down
82 changes: 65 additions & 17 deletions src/wm/mod.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! Wikimedia types
use std::{collections::HashSet, ffi::OsStr, fs, str::FromStr};
use std::{collections::HashSet, error::Error, ffi::OsStr, fmt::Display, fs, str::FromStr};

use anyhow::{anyhow, Context};
use anyhow::{anyhow, bail, Context};

mod page;
pub use page::Page;
Expand Down Expand Up @@ -58,10 +58,18 @@ pub fn parse_osm_tag_file(
path: impl AsRef<OsStr>,
qids: &mut HashSet<Qid>,
titles: &mut HashSet<Title>,
mut line_errors: Option<&mut Vec<ParseLineError>>,
) -> anyhow::Result<()> {
let path = path.as_ref();
let mut rdr = csv::ReaderBuilder::new().delimiter(b'\t').from_path(path)?;

let mut push_error = |e: ParseLineError| {
debug!("Tag parse error: {e}");
if let Some(ref mut errs) = line_errors {
errs.push(e);
}
};

let mut qid_col = None;
let mut title_col = None;
for (column, title) in rdr.headers()?.iter().enumerate() {
Expand All @@ -83,7 +91,14 @@ pub fn parse_osm_tag_file(
Ok(false) => break,
// attempt to recover from parsing errors
Err(e) => {
error!("Error parsing tsv file: {}", e);
if e.is_io_error() {
bail!(e)
}
push_error(ParseLineError {
text: String::new(),
line: rdr.position().line(),
kind: e.into(),
});
continue;
}
}
Expand All @@ -94,13 +109,11 @@ pub fn parse_osm_tag_file(
Ok(qid) => {
qids.insert(qid);
}
Err(e) => warn!(
"Cannot parse qid {:?} on line {} in {:?}: {}",
qid,
rdr.position().line(),
path,
e
),
Err(e) => push_error(ParseLineError {
text: qid.to_string(),
line: rdr.position().line(),
kind: e.into(),
}),
}
}

Expand All @@ -110,16 +123,51 @@ pub fn parse_osm_tag_file(
Ok(title) => {
titles.insert(title);
}
Err(e) => warn!(
"Cannot parse title {:?} on line {} in {:?}: {}",
title,
rdr.position().line(),
path,
e
),
Err(e) => push_error(ParseLineError {
text: title.to_string(),
line: rdr.position().line(),
kind: e.into(),
}),
}
}
}

Ok(())
}

#[derive(Debug, thiserror::Error)]
pub enum ParseErrorKind {
#[error("bad title")]
Title(#[from] ParseTitleError),
#[error("bad QID")]
Qid(#[from] ParseQidError),
#[error("bad TSV line")]
Tsv(#[from] csv::Error),
}

#[derive(Debug)]
pub struct ParseLineError {
text: String,
line: u64,
kind: ParseErrorKind,
}

impl Display for ParseLineError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// write source chain to ensure they are logged
write!(f, "on line {}: {:?}: {}", self.line, self.text, self.kind)?;
let mut source = self.kind.source();
while let Some(e) = source {
write!(f, ": {}", e)?;
source = e.source();
}
Ok(())
}
}

impl Error for ParseLineError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
// return nothing b/c Display prints source chain
None
}
}
9 changes: 6 additions & 3 deletions src/wm/page.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::{iter, str::FromStr};

use anyhow::Context;
use serde::Deserialize;

use super::{Qid, Title};
Expand Down Expand Up @@ -35,6 +36,7 @@ impl Page {
/// Title of the article
pub fn title(&self) -> anyhow::Result<Title> {
Title::from_title(&self.name, &self.in_language.identifier)
.with_context(|| format!("bad title {:?}", self.name))
}

/// All titles that lead to the article, the main title followed by any redirects.
Expand All @@ -43,9 +45,10 @@ impl Page {
}

pub fn redirects(&self) -> impl Iterator<Item = anyhow::Result<Title>> + '_ {
self.redirects
.iter()
.map(|r| Title::from_title(&r.name, &self.in_language.identifier))
self.redirects.iter().map(|r| {
Title::from_title(&r.name, &self.in_language.identifier)
.with_context(|| format!("bad redirect {:?}", self.name))
})
}
}

Expand Down
21 changes: 17 additions & 4 deletions src/wm/qid.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
use std::{error::Error, fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};

/// Wikidata QID/Q Number
///
Expand All @@ -21,15 +21,13 @@ use std::{fmt::Display, num::ParseIntError, path::PathBuf, str::FromStr};
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct Qid(u32);

pub type ParseQidError = ParseIntError;

impl FromStr for Qid {
type Err = ParseQidError;

fn from_str(s: &str) -> Result<Self, Self::Err> {
let s = s.trim();
let s = s.strip_prefix(['Q', 'q']).unwrap_or(s);
u32::from_str(s).map(Qid)
u32::from_str(s).map(Qid).map_err(ParseQidError)
}
}

Expand All @@ -49,3 +47,18 @@ impl Qid {
path
}
}

#[derive(Debug, PartialEq, Eq)]
pub struct ParseQidError(ParseIntError);

impl Display for ParseQidError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}

impl Error for ParseQidError {
fn source(&self) -> Option<&(dyn Error + 'static)> {
self.0.source()
}
}
52 changes: 38 additions & 14 deletions src/wm/title.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
use std::{fmt::Display, path::PathBuf};

use anyhow::{anyhow, bail};
use std::{fmt::Display, path::PathBuf, string::FromUtf8Error};

use url::Url;

Expand Down Expand Up @@ -49,17 +47,17 @@ impl Title {
}

// https://en.wikipedia.org/wiki/Article_Title/More_Title
pub fn from_url(url: &str) -> anyhow::Result<Self> {
pub fn from_url(url: &str) -> Result<Self, ParseTitleError> {
let url = Url::parse(url.trim())?;

let (subdomain, host) = url
.host_str()
.ok_or_else(|| anyhow!("Expected host"))?
.ok_or(ParseTitleError::NoHost)?
.split_once('.')
.ok_or_else(|| anyhow!("Expected subdomain"))?;
.ok_or(ParseTitleError::NoSubdomain)?;
let host = host.strip_prefix("m.").unwrap_or(host);
if host != "wikipedia.org" {
bail!("Expected wikipedia.org for domain")
return Err(ParseTitleError::BadDomain);
}
let lang = subdomain;

Expand All @@ -69,22 +67,22 @@ impl Title {
.strip_prefix('/')
.unwrap_or(path)
.split_once('/')
.ok_or_else(|| anyhow!("Expected at least two segments in path"))?;
.ok_or(ParseTitleError::ShortPath)?;

if root != "wiki" {
bail!("Expected 'wiki' as root path, got: {:?}", root)
return Err(ParseTitleError::BadPath);
}
let title = urlencoding::decode(title)?;

Self::from_title(&title, lang)
}

// en:Article Title
pub fn from_osm_tag(tag: &str) -> anyhow::Result<Self> {
pub fn from_osm_tag(tag: &str) -> Result<Self, ParseTitleError> {
let (lang, title) = tag
.trim()
.split_once(':')
.ok_or_else(|| anyhow!("Expected ':'"))?;
.ok_or(ParseTitleError::MissingColon)?;

let lang = lang.trim_start();
let title = title.trim_start();
Expand All @@ -100,14 +98,14 @@ impl Title {
Self::from_title(title, lang)
}

pub fn from_title(title: &str, lang: &str) -> anyhow::Result<Self> {
pub fn from_title(title: &str, lang: &str) -> Result<Self, ParseTitleError> {
let title = title.trim();
let lang = lang.trim();
if title.is_empty() {
bail!("title cannot be empty or whitespace");
return Err(ParseTitleError::NoTitle);
}
if lang.is_empty() {
bail!("lang cannot be empty or whitespace");
return Err(ParseTitleError::NoLang);
}
let name = Self::normalize_title(title);
let lang = lang.to_owned();
Expand All @@ -124,3 +122,29 @@ impl Title {
path
}
}

#[derive(Debug, PartialEq, Eq, thiserror::Error)]
pub enum ParseTitleError {
#[error("title cannot be empty or whitespace")]
NoTitle,
#[error("lang cannot be empty or whitespace")]
NoLang,
#[error("no ':' separating lang and title")]
MissingColon,

// url-specific
#[error("cannot parse url")]
Url(#[from] url::ParseError),
#[error("cannot decode url")]
UrlDecode(#[from] FromUtf8Error),
#[error("no host in url")]
NoHost,
#[error("no subdomain in url")]
NoSubdomain,
#[error("url base domain is wikipedia.org")]
BadDomain,
#[error("url base path is not /wiki/")]
BadPath,
#[error("path has less than 2 segments")]
ShortPath,
}

0 comments on commit 941d2b1

Please sign in to comment.