From 3c70379b0f265d886400bde8d0b6633b4c4fcbcc Mon Sep 17 00:00:00 2001 From: Gustavo Lima Chaves Date: Sat, 1 Jun 2024 16:01:48 -0700 Subject: [PATCH] rss: ingest atom/rss feed contents already decoding HTML special chars (#37) Without this move, the main UI could show info and feed entries with crude HTML marks, as in "&ldquo;". Translating that at the source is the best move, since they get stored right in the DB. Signed-off-by: Gustavo Lima Chaves --- Cargo.lock | 16 ++++++++++++++++ Cargo.toml | 1 + src/rss.rs | 47 ++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a0c19c6..6722dcd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -590,6 +590,15 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "html-escape" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476" +dependencies = [ + "utf8-width", +] + [[package]] name = "html2text" version = "0.12.5" @@ -1272,6 +1281,7 @@ dependencies = [ "crossterm", "diligent-date-parser", "directories", + "html-escape", "html2text", "num_cpus", "opml", @@ -1727,6 +1737,12 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf8-width" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3" + [[package]] name = "utf8parse" version = "0.2.1" diff --git a/Cargo.toml b/Cargo.toml index a94fbeb..c0a728c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,6 +19,7 @@ crossterm = "0.27" diligent-date-parser = "0.1" directories = "5" html2text = "0.12" +html-escape = "0.2.13" num_cpus = "1.16" opml = "1.1" r2d2 = "0.8" diff --git a/src/rss.rs b/src/rss.rs index dce7ab4..e9aef23 100644 --- a/src/rss.rs 
+++ b/src/rss.rs @@ -5,6 +5,7 @@ use crate::modes::ReadMode; use anyhow::{bail, Context, Result}; use atom_syndication as atom; use chrono::prelude::{DateTime, Utc}; +use html_escape::decode_html_entities_to_string; use rss::Channel; use rusqlite::params; use rusqlite::types::{FromSql, ToSqlOutput}; @@ -155,11 +156,25 @@ struct IncomingEntry { impl From<&atom::Entry> for IncomingEntry { fn from(entry: &atom::Entry) -> Self { Self { - title: Some(entry.title().to_string()), - author: entry.authors().first().map(|author| author.name.to_owned()), + title: { + let mut title = String::new(); + decode_html_entities_to_string(entry.title(), &mut title); + Some(title) + }, + author: entry.authors().first().map(|entry_author| { + let mut author = String::new(); + decode_html_entities_to_string(&entry_author.name, &mut author); + author + }), pub_date: entry.published().map(|date| date.with_timezone(&Utc)), description: None, - content: entry.content().and_then(|content| content.value.to_owned()), + content: entry.content().and_then(|entry_content| { + entry_content.value().map(|entry_content| { + let mut content = String::new(); + decode_html_entities_to_string(entry_content, &mut content); + content + }) + }), link: entry.links().first().map(|link| link.href().to_string()), } } @@ -168,13 +183,27 @@ impl From<&atom::Entry> for IncomingEntry { impl From<&rss::Item> for IncomingEntry { fn from(entry: &rss::Item) -> Self { Self { - title: entry.title().map(|title| title.to_owned()), - author: entry.author().map(|author| author.to_owned()), + title: entry.title().map(|entry_title| { + let mut title = String::new(); + decode_html_entities_to_string(entry_title, &mut title); + title + }), + author: entry.author().map(|entry_author| { + let mut author = String::new(); + decode_html_entities_to_string(entry_author, &mut author); + author + }), pub_date: entry.pub_date().and_then(parse_datetime), - description: entry - .description() - .map(|description| description.to_owned()), - 
content: entry.content().map(|content| content.to_owned()), + description: entry.description().map(|entry_description| { + let mut description = String::new(); + decode_html_entities_to_string(entry_description, &mut description); + description + }), + content: entry.content().map(|entry_content| { + let mut content = String::new(); + decode_html_entities_to_string(entry_content, &mut content); + content + }), link: entry.link().map(|link| link.to_owned()), } }