From 8ec696ceae9a64c24a8e09f2fecac3dcf6eff45c Mon Sep 17 00:00:00 2001
From: Evan Lloyd New-Schmidt
Date: Thu, 29 Jun 2023 15:41:03 -0400
Subject: [PATCH] Remove images and links

See #11 for next steps

Signed-off-by: Evan Lloyd New-Schmidt
---
 Cargo.lock               |   1 +
 Cargo.toml               |   1 +
 README.md                |   4 ++
 src/bin/simplify_html.rs |   5 ++
 src/html.rs              | 102 ++++++++++++++++++++++++++++++++++-----
 src/main.rs              |   2 +
 6 files changed, 103 insertions(+), 12 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c814b23..8aa7b59 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -522,6 +522,7 @@ version = "0.0.0"
 dependencies = [
  "anyhow",
  "clap",
+ "ego-tree",
  "env_logger",
  "log",
  "once_cell",
diff --git a/Cargo.toml b/Cargo.toml
index 09f05f4..1e790b4 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,6 +10,7 @@ default-run = "om-wikiparser"
 [dependencies]
 anyhow = { version = "1.0.71", features = ["backtrace"] }
 clap = { version = "4.3.2", features = ["derive"] }
+ego-tree = "0.6.2"
 env_logger = "0.10.0"
 log = "0.4.18"
 once_cell = "1.18.0"
diff --git a/README.md b/README.md
index 4c8a954..b7d1889 100644
--- a/README.md
+++ b/README.md
@@ -35,6 +35,10 @@ As an example of usage with the map generator:
 # Transform intermediate files from generator.
 cut -f 2 id_to_wikidata.csv > wikidata_ids.txt
 tail -n +2 wiki_urls.txt | cut -f 3 > wikipedia_urls.txt
+# Enable backtraces in errors and panics.
+export RUST_BACKTRACE=1
+# Set log level to debug
+export RUST_LOG=om_wikiparser=debug
 # Begin extraction.
 for dump in $DUMP_DOWNLOAD_DIR/*-ENTERPRISE-HTML.json.tar.gz
 do
diff --git a/src/bin/simplify_html.rs b/src/bin/simplify_html.rs
index 54fae4e..6e66e9e 100644
--- a/src/bin/simplify_html.rs
+++ b/src/bin/simplify_html.rs
@@ -7,6 +7,11 @@ use std::io::{stdin, stdout, Read, Write};
 use om_wikiparser::html::simplify;
 
 fn main() -> anyhow::Result<()> {
+    env_logger::Builder::new()
+        .filter_level(log::LevelFilter::Info)
+        .parse_default_env()
+        .try_init()?;
+
     let mut input = String::new();
     stdin().read_to_string(&mut input)?;
diff --git a/src/html.rs b/src/html.rs
index 3c8d185..24d1d4f 100644
--- a/src/html.rs
+++ b/src/html.rs
@@ -1,5 +1,6 @@
 use std::collections::{BTreeMap, BTreeSet};
 
+use ego_tree::NodeId;
 use once_cell::sync::Lazy;
 use scraper::{ElementRef, Html, Selector};
 use serde::Deserialize;
@@ -51,34 +52,65 @@ pub fn simplify(html: &str, lang: &str) -> String {
             }
         }
 
-        for id in to_remove.drain(..) {
-            if let Some(mut node) = document.tree.get_mut(id) {
-                node.detach();
-            }
-        }
+        remove_ids(&mut document, to_remove.drain(..));
     } else {
         warn!("No sections to remove configured for lang {lang:?}");
     }
 
-    // Remove elements with no text that isn't whitespace.
-
-    for element in document
+    for el in document
         .root_element()
         .descendants()
         .filter_map(ElementRef::wrap)
     {
-        if element.text().all(|t| t.trim().is_empty()) {
-            to_remove.push(element.id());
+        if is_image(&el) || is_empty_or_whitespace(&el) {
+            to_remove.push(el.id());
         }
     }
+    remove_ids(&mut document, to_remove.drain(..));
 
-    for id in to_remove.drain(..) {
+    remove_links(&mut document);
+
+    document.html()
+}
+
+fn remove_ids(document: &mut Html, ids: impl IntoIterator<Item = NodeId>) {
+    for id in ids {
         if let Some(mut node) = document.tree.get_mut(id) {
             node.detach();
         }
     }
+}
 
-    document.html()
+fn is_empty_or_whitespace(el: &ElementRef) -> bool {
+    el.text().flat_map(str::chars).all(char::is_whitespace)
+}
+
+fn is_image(el: &ElementRef) -> bool {
+    ["img", "picture"].contains(&el.value().name())
+}
+
+/// Remove all links, preserving any inner elements/text.
+fn remove_links(document: &mut Html) {
+    let links: Vec<_> = document
+        .select(&Selector::parse("a").unwrap())
+        .map(|el| el.id())
+        .collect();
+
+    for id in links {
+        let Some(mut node) = document.tree.get_mut(id) else { continue };
+        if node.parent().is_none() {
+            continue;
+        }
+
+        // reparent to same location as node
+        while let Some(mut child) = node.first_child() {
+            let child_id = child.id();
+            child.detach();
+            node.insert_id_before(child_id);
+        }
+
+        node.detach();
+    }
 }
 
 #[cfg(test)]
@@ -89,4 +121,50 @@ mod test {
     fn static_config_parses() {
         assert!(!CONFIG.sections_to_remove.is_empty());
     }
+
+    #[test]
+    fn remove_links() {
+        let html = r#"
+          <div>
+            Some text that includes
+            <a href="Some_Page"><span id="inner-content">several</span></a>
+            <a id="second-link" href="./Another_Page">relative links</a>
+            and
+            <a href="https://example.com/page">an absolute link</a>
+            .
+          </div>
+        "#;
+
+        let anchors = Selector::parse("a").unwrap();
+        let inner_element = Selector::parse("#inner-content").unwrap();
+        let second_link = Selector::parse("#second-link").unwrap();
+
+        let mut document = Html::parse_fragment(html);
+        let links: Vec<_> = document
+            .select(&anchors)
+            .filter_map(|el| el.value().attr("href"))
+            .collect();
+
+        eprintln!("{}", document.html());
+
+        assert_eq!(
+            vec!["Some_Page", "./Another_Page", "https://example.com/page"],
+            links,
+            "Links in original html are not expected."
+        );
+
+        // Detach one of the links from the root tree (as if previously deleted) to ensure it handles orphan nodes nicely.
+        let link = document.select(&second_link).next().unwrap().id();
+        document.tree.get_mut(link).unwrap().detach();
+
+        super::remove_links(&mut document);
+
+        let links: Vec<_> = document.select(&anchors).collect();
+
+        assert!(links.is_empty(), "All links should be removed.");
+
+        assert!(
+            document.select(&inner_element).next().is_some(),
+            "Link inner elements should be preserved."
+        );
+    }
 }
diff --git a/src/main.rs b/src/main.rs
index 30b41aa..4339153 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -138,6 +138,8 @@ fn write(
 }
 
 fn main() -> anyhow::Result<()> {
+    // Use info level by default, load overrides from `RUST_LOG` env variable.
+    // See https://docs.rs/env_logger/latest/env_logger/index.html#example
     env_logger::Builder::new()
         .filter_level(log::LevelFilter::Info)
         .parse_default_env()