From 57c2481884214dc9c867cd3e1e32d81235295166 Mon Sep 17 00:00:00 2001 From: Josiah Campbell <9521010+jocmp@users.noreply.github.com> Date: Tue, 14 Jan 2025 21:26:57 -0600 Subject: [PATCH] fix: Update versants.com to parse figures - Strip image carousels of thumbnail images --- CHANGELOG.md | 4 ++++ README.md | 2 -- src/extractors/custom/www.versants.com/index.js | 17 +++++++++++++++-- .../custom/www.versants.com/index.test.js | 2 +- src/utils/dom/clean-tags.js | 2 +- 5 files changed, 21 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 678d3809..13a93e16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Mercury Parser Changelog +### v2.3.0 + +- [a45b329e0a] - fix: Update versants.com to parse figures (Josiah Campbell) [#42](https://github.com/jocmp/mercury-parser/pull/42) + ### v2.2.10 (Jan 11, 2025) - [b8b4df7037] - feat: Add custom parser for mobilesyrup.com (Josiah Campbell) [#39](https://github.com/jocmp/mercury-parser/pull/39) diff --git a/README.md b/README.md index 2037edc6..9801d76a 100644 --- a/README.md +++ b/README.md @@ -106,8 +106,6 @@ Note that the URL argument is still supplied, in order to identify the web site Mercury Parser also ships with a CLI, meaning you can use it from your command line like so: -![Mercury Parser CLI Basic Usage](./assets/parser-basic-usage.gif) - ```bash # Install Mercury Parser globally yarn global add @jocmp/mercury-parser diff --git a/src/extractors/custom/www.versants.com/index.js b/src/extractors/custom/www.versants.com/index.js index 938273ad..b6fce531 100644 --- a/src/extractors/custom/www.versants.com/index.js +++ b/src/extractors/custom/www.versants.com/index.js @@ -17,7 +17,20 @@ export const WwwVersantsComExtractor = { }, content: { - selectors: ['.entry-content'], - clean: ['.adv-link', '.versa-target'], + transforms: { + '.featured-image': $node => { + $node.addClass('mercury-parser-keep'); + const figcaption = $node.find('span'); + $node.find('figure').append(figcaption); 
+ }, + }, + selectors: ['.article-content'], + clean: [ + '.adv-link', + '.versa-target', + 'header', // Clean title + '.author', // Clean author + '.thumbnail-slider', // Remove thumbnails; the main images are within the .main-slider div. + ], }, }; diff --git a/src/extractors/custom/www.versants.com/index.test.js b/src/extractors/custom/www.versants.com/index.test.js index 8156bfc9..598021eb 100644 --- a/src/extractors/custom/www.versants.com/index.test.js +++ b/src/extractors/custom/www.versants.com/index.test.js @@ -70,7 +70,7 @@ describe('WwwVersantsComExtractor', () => { assert.equal( first13, - 'La 32e campagne d’Opération Nez rouge de la Vallée-du-Richelieu sera en vigueur durant' + "C'est à Sainte-Julie que les bénévoles de l'ONR VDR se retrouveront dès le" ); }); }); diff --git a/src/utils/dom/clean-tags.js b/src/utils/dom/clean-tags.js index c1a9eb06..a5b307ea 100644 --- a/src/utils/dom/clean-tags.js +++ b/src/utils/dom/clean-tags.js @@ -105,7 +105,7 @@ export default function cleanTags($article, $) { if (weight < 0) { $node.remove(); } else { - // deteremine if node seems like content + // determine if node seems like content removeUnlessContent($node, $, weight); } });