From 8ec696ceae9a64c24a8e09f2fecac3dcf6eff45c Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Thu, 29 Jun 2023 15:41:03 -0400
Subject: [PATCH] Remove images and links

See #11 for next steps

Signed-off-by: Evan Lloyd New-Schmidt
---
 Cargo.lock               |   1 +
 Cargo.toml               |   1 +
 README.md                |   4 ++
 src/bin/simplify_html.rs |   5 ++
 src/html.rs              | 102 ++++++++++++++++++++++++++++++++++-----
 src/main.rs              |   2 +
 6 files changed, 103 insertions(+), 12 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c814b23..8aa7b59 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -522,6 +522,7 @@ version = "0.0.0"
 dependencies = [
  "anyhow",
  "clap",
+ "ego-tree",
  "env_logger",
  "log",
  "once_cell",
diff --git a/Cargo.toml b/Cargo.toml
index 09f05f4..1e790b4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,6 +10,7 @@ default-run = "om-wikiparser"
 [dependencies]
 anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
+ego-tree = "0.6.2"
 env_logger = "0.10.0"
 log = "0.4.18"
 once_cell = "1.18.0"
diff --git a/README.md b/README.md
index 4c8a954..b7d1889 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,10 @@ As an example of usage with the map generator:
 # Transform intermediate files from generator.
 cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
 tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+# Enable backtraces in errors and panics.
+export RUST_BACKTRACE=1
+# Set log level to debug
+export RUST_LOG=om_wikiparser=debug
 # Begin extraction.
 for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
 do
diff --git a/src/bin/simplify_html.rs b/src/bin/simplify_html.rs
index 54fae4e..6e66e9e 100644
--- a/src/bin/simplify_html.rs
+++ b/src/bin/simplify_html.rs
@@ -7,6 +7,11 @@ use std::io::{stdin, stdout, Read, Write};
 use om_wikiparser::html::simplify;
 
 fn main() -> anyhow::Result<()> {
+    env_logger::Builder::new()
+        .filter_level(log::LevelFilter::Info)
+        .parse_default_env()
+        .try_init()?;
+
     let mut input = String::new();
     stdin().read_to_string(&mut input)?;
diff --git a/src/html.rs b/src/html.rs
index 3c8d185..24d1d4f 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -1,5 +1,6 @@
 use std::collections::{BTreeMap, BTreeSet};
 
+use ego_tree::NodeId;
 use once_cell::sync::Lazy;
 use scraper::{ElementRef, Html, Selector};
 use serde::Deserialize;
@@ -51,34 +52,65 @@ pub fn simplify(html: &str, lang: &str) -> String {
             }
         }
 
-        for id in to_remove.drain(..) {
-            if let Some(mut node) = document.tree.get_mut(id) {
-                node.detach();
-            }
-        }
+        remove_ids(&mut document, to_remove.drain(..));
     } else {
         warn!("No sections to remove configured for lang {lang:?}");
     }
 
-    // Remove elements with no text that isn't whitespace.
-
-    for element in document
+    for el in document
         .root_element()
         .descendants()
         .filter_map(ElementRef::wrap)
     {
-        if element.text().all(|t| t.trim().is_empty()) {
-            to_remove.push(element.id());
+        if is_image(&el) || is_empty_or_whitespace(&el) {
+            to_remove.push(el.id());
         }
     }
+    remove_ids(&mut document, to_remove.drain(..));
 
-    for id in to_remove.drain(..) {
+    remove_links(&mut document);
+
+    document.html()
+}
+
+fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
+    for id in ids {
         if let Some(mut node) = document.tree.get_mut(id) {
             node.detach();
         }
     }
+}
 
-    document.html()
+fn is_empty_or_whitespace(el: &ElementRef) -> bool {
+    el.text().flat_map(str::chars).all(char::is_whitespace)
+}
+
+fn is_image(el: &ElementRef) -> bool {
+    ["img", "picture"].contains(&el.value().name())
+}
+
+/// Remove all links, preserving any inner elements/text.
+fn remove_links(document: &mut Html) {
+    let links: Vec<_> = document
+        .select(&Selector::parse("a").unwrap())
+        .map(|el| el.id())
+        .collect();
+
+    for id in links {
+        let Some(mut node) = document.tree.get_mut(id) else { continue };
+        if node.parent().is_none() {
+            continue;
+        }
+
+        // reparent to same location as node
+        while let Some(mut child) = node.first_child() {
+            let child_id = child.id();
+            child.detach();
+            node.insert_id_before(child_id);
+        }
+
+        node.detach();
+    }
 }
 
 #[cfg(test)]
@@ -89,4 +121,50 @@ mod test {
     fn static_config_parses() {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
+
+    #[test]
+    fn remove_links() {
+        let html = r#"
+          <div>
+            Some text that includes
+            <a href="Some_Page"><span id="inner-content">several</span></a>
+            <a id="second-link" href="./Another_Page">relative links</a>
+            and
+            <a href="https://example.com/page">an absolute link</a>
+            .
+          </div>
+        "#;
+
+        let anchors = Selector::parse("a").unwrap();
+        let inner_element = Selector::parse("#inner-content").unwrap();
+        let second_link = Selector::parse("#second-link").unwrap();
+
+        let mut document = Html::parse_fragment(html);
+        let links: Vec<_> = document
+            .select(&anchors)
+            .filter_map(|el| el.value().attr("href"))
+            .collect();
+
+        eprintln!("{}", document.html());
+
+        assert_eq!(
+            vec!["Some_Page", "./Another_Page", "https://example.com/page"],
+            links,
+            "Links in original html are not expected."
+        );
+
+        // Detach one of the links from the root tree (as if previously deleted) to ensure it handles orphan nodes nicely.
+        let link = document.select(&second_link).next().unwrap().id();
+        document.tree.get_mut(link).unwrap().detach();
+
+        super::remove_links(&mut document);
+
+        let links: Vec<_> = document.select(&anchors).collect();
+
+        assert!(links.is_empty(), "All links should be removed.");
+
+        assert!(
+            document.select(&inner_element).next().is_some(),
+            "Link inner elements should be preserved."
+        );
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index 30b41aa..4339153 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -138,6 +138,8 @@ fn write(
 }
 
 fn main() -> anyhow::Result<()> {
+    // Use info level by default, load overrides from `RUST_LOG` env variable.
+    // See https://docs.rs/env_logger/latest/env_logger/index.html#example
     env_logger::Builder::new()
         .filter_level(log::LevelFilter::Info)
         .parse_default_env()