Skip to content

Commit

Permalink
Remove link rewriting
Browse files Browse the repository at this point in the history
See #11 for next steps

Signed-off-by: Evan Lloyd New-Schmidt <[email protected]>
  • Loading branch information
newsch committed Jun 30, 2023
1 parent 93b968a commit ab3ea55
Show file tree
Hide file tree
Showing 3 changed files with 1 addition and 90 deletions.
1 change: 0 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ anyhow = { version = "1.0.71", features = ["backtrace"] }
clap = { version = "4.3.2", features = ["derive"] }
ego-tree = "0.6.2"
env_logger = "0.10.0"
html5ever = "0.26.0"
log = "0.4.18"
once_cell = "1.18.0"
scraper = "0.16.0"
Expand Down
89 changes: 1 addition & 88 deletions src/html.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
use std::collections::{BTreeMap, BTreeSet};

use ego_tree::NodeId;
use html5ever::{LocalName, Namespace, QualName};
use once_cell::sync::Lazy;
use scraper::{ElementRef, Html, Node, Selector};
use scraper::{ElementRef, Html, Selector};
use serde::Deserialize;
use url::{ParseError::RelativeUrlWithoutBase, Url};

#[derive(Debug, Deserialize)]
struct Config<'a> {
Expand Down Expand Up @@ -72,11 +70,6 @@ pub fn simplify(html: &str, lang: &str) -> String {
}
remove_ids(&mut document, to_remove.drain(..));

// Final transformations.

let base_url = Url::parse(&format!("https://{}.wikipedia.org/wiki/", &lang)).unwrap();
fix_relative_urls(&mut document, base_url);

document.html()
}

Expand All @@ -96,41 +89,6 @@ fn is_image(el: &ElementRef) -> bool {
["img", "picture"].contains(&el.value().name())
}

/// Convert relative links to absolute links that start with `base`.
///
/// In the dumps, links within the same language's wiki are written as relative
/// links (`./Article_Title`), while cross-wiki links (e.g. `[[:fr:Pomme]]`) are
/// already expanded to absolute links (`https://fr.wikipedia.org/wiki/Pomme`).
///
/// See https://en.wikipedia.org/wiki/Help:Interwiki_linking for more examples of wikitext linking.
fn fix_relative_urls(document: &mut Html, base: Url) {
let href = QualName::new(None, Namespace::from(""), LocalName::from("href"));

for element in document.tree.values_mut().filter_map(|n| match n {
Node::Element(el) if el.name() == "a" => Some(el),
_ => None,
}) {
if let Some(url) = element.attrs.get_mut(&href) {
match Url::parse(url) {
Ok(_url) => continue,
Err(RelativeUrlWithoutBase) => match base.join(url) {
Ok(absolute) => {
*url = absolute.to_string().into();
}
Err(e) => {
warn!("Error joining relative url: {:?}: {:#}", url, e);
continue;
}
},
Err(e) => {
warn!("Error parsing url: {:?}: {:#}", url, e);
continue;
}
}
}
}
}

#[cfg(test)]
mod test {
use super::*;
Expand All @@ -139,49 +97,4 @@ mod test {
fn static_config_parses() {
assert!(!CONFIG.sections_to_remove.is_empty());
}

#[test]
fn fix_relative_urls() {
    // Fragment with two relative links (bare and `./`-prefixed) and one
    // absolute link.
    let fragment = r#"
<p> Some text that includes
<a href="Some_Page">several</a>
<a href="./Another_Page">relative links</a>
and
<a href="https://example.com/page">an absolute link</a>
.
</p>
"#;

    let anchors = Selector::parse("a").unwrap();

    // Collect the `href` of every anchor, in document order.
    let hrefs = |doc: &Html| -> Vec<String> {
        doc.select(&anchors)
            .filter_map(|el| el.value().attr("href"))
            .map(str::to_owned)
            .collect()
    };

    let mut document = Html::parse_fragment(fragment);

    // Parsing alone must not alter any link.
    assert_eq!(
        vec!["Some_Page", "./Another_Page", "https://example.com/page"],
        hrefs(&document)
    );

    super::fix_relative_urls(
        &mut document,
        Url::parse("http://example.invalid/foo/").unwrap(),
    );

    // Relative links are now rooted at the base; the absolute one is unchanged.
    assert_eq!(
        vec![
            "http://example.invalid/foo/Some_Page",
            "http://example.invalid/foo/Another_Page",
            "https://example.com/page"
        ],
        hrefs(&document)
    );
}
}

0 comments on commit ab3ea55

Please sign in to comment.