From 50b96783c5ca8f39de5d355ed0efb992bf3461db Mon Sep 17 00:00:00 2001 From: Gustavo Lima Chaves Date: Sun, 21 Apr 2024 11:06:10 -0700 Subject: [PATCH] rss: ingest atom/rss feed contents already decoding HTML special chars Without this change, the main UI could show feed info and entries with raw HTML entities, as in "&ldquo;". Decoding them at the source is the best approach, since the text then gets stored already decoded in the DB. Signed-off-by: Gustavo Lima Chaves --- Cargo.lock | 16 ++++++++++++++++ Cargo.toml | 1 + src/rss.rs | 47 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index aeb18c0..149b336 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -602,6 +602,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "html-escape" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476" +dependencies = [ + "utf8-width", +] + [[package]] name = "html2text" version = "0.12.0" @@ -1248,6 +1257,7 @@ dependencies = [ "crossterm", "diligent-date-parser", "directories", + "html-escape", "html2text", "num_cpus", "opml", @@ -1693,6 +1703,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf8-width" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3" + [[package]] name = "utf8parse" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index 48ff600..563a483 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ crossterm = "0.27" diligent-date-parser = "0.1" directories = "5" html2text = "0.12" +html-escape = "0.2.13" num_cpus = "1.16" opml = "1.1" r2d2 = "0.8" diff --git a/src/rss.rs b/src/rss.rs index 3816981..8976220 100644 --- a/src/rss.rs +++ 
b/src/rss.rs @@ -2,6 +2,7 @@ use crate::modes::ReadMode; use anyhow::{bail, Context, Result}; use atom_syndication as atom; use chrono::prelude::{DateTime, Utc}; +use html_escape::decode_html_entities_to_string; use rss::Channel; use rusqlite::params; use rusqlite::types::ToSqlOutput; @@ -91,11 +92,25 @@ impl From<&atom::Entry> for Entry { Self { id: -1, feed_id: -1, - title: Some(entry.title().to_string()), - author: entry.authors().first().map(|author| author.name.to_owned()), + title: { + let mut title = String::new(); + decode_html_entities_to_string(entry.title(), &mut title); + Some(title) + }, + author: entry.authors().first().map(|entry_author| { + let mut author = String::new(); + decode_html_entities_to_string(&entry_author.name, &mut author); + author + }), pub_date: entry.published().map(|date| date.with_timezone(&Utc)), description: None, - content: entry.content().and_then(|content| content.value.to_owned()), + content: entry.content().and_then(|entry_content| { + entry_content.value().map(|entry_content| { + let mut content = String::new(); + decode_html_entities_to_string(entry_content, &mut content); + content + }) + }), link: entry.links().first().map(|link| link.href().to_string()), read_at: None, inserted_at: Utc::now(), @@ -109,13 +124,27 @@ impl From<&rss::Item> for Entry { Self { id: -1, feed_id: -1, - title: entry.title().map(|title| title.to_owned()), - author: entry.author().map(|author| author.to_owned()), + title: entry.title().map(|entry_title| { + let mut title = String::new(); + decode_html_entities_to_string(entry_title, &mut title); + title + }), + author: entry.author().map(|entry_author| { + let mut author = String::new(); + decode_html_entities_to_string(entry_author, &mut author); + author + }), pub_date: entry.pub_date().and_then(parse_datetime), - description: entry - .description() - .map(|description| description.to_owned()), - content: entry.content().map(|content| content.to_owned()), + description: 
entry.description().map(|entry_description| { + let mut description = String::new(); + decode_html_entities_to_string(entry_description, &mut description); + description + }), + content: entry.content().map(|entry_content| { + let mut content = String::new(); + decode_html_entities_to_string(entry_content, &mut content); + content + }), link: entry.link().map(|link| link.to_owned()), read_at: None, inserted_at: Utc::now(),