From 581cae0068ad6b401cfd1e54297e1f73145dcbf9 Mon Sep 17 00:00:00 2001 From: Josiah Campbell <9521010+jocmp@users.noreply.github.com> Date: Mon, 13 Jan 2025 22:22:23 -0600 Subject: [PATCH] feat: Keep headings if they use the KEEP_CLASS --- CHANGELOG.md | 5 +---- .../custom/www.androidauthority.com/index.js | 6 +++--- src/utils/dom/clean-headers.js | 7 ++++++- src/utils/dom/clean-headers.test.js | 20 +++++++++++++++++++ 4 files changed, 30 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b858df8..4b8f5520 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,6 @@ # Mercury Parser Changelog -### v2.2.11 (Jan 13, 2025) - -- [68e9b88a8d] - fix: androidauthority.com - Retain h3 tags (Josiah Campbell) [#41](https://github.com/jocmp/mercury-parser/pull/41) -- [c2bc68449f] - bump version v2.2.9 -> v2.2.10 (jocmp) +### v2.3.0 ### v2.2.10 (Jan 11, 2025) diff --git a/src/extractors/custom/www.androidauthority.com/index.js b/src/extractors/custom/www.androidauthority.com/index.js index 7b19305a..e67784d3 100644 --- a/src/extractors/custom/www.androidauthority.com/index.js +++ b/src/extractors/custom/www.androidauthority.com/index.js @@ -20,13 +20,13 @@ export const WwwAndroidauthorityComExtractor = { // remove if not following a paragraph. Adding this empty paragraph fixes it, and // the empty paragraph will be removed anyway. content: { - selectors: ['.d_Dd'], + selectors: ['.d_Dd', '.e_Ac'], transforms: { ol: node => { node.attr('class', 'mercury-parser-keep'); }, - h2: $node => $node.before('

'), - h3: $node => $node.before('

'), + h2: $node => $node.attr('class', 'mercury-parser-keep'), + h3: $node => $node.attr('class', 'mercury-parser-keep'), }, clean: [ '.d_f .d_nr', // Lead image diff --git a/src/utils/dom/clean-headers.js b/src/utils/dom/clean-headers.js index 2db0ddd2..049a7c79 100644 --- a/src/utils/dom/clean-headers.js +++ b/src/utils/dom/clean-headers.js @@ -1,11 +1,16 @@ import { getWeight } from 'extractors/generic/content/scoring'; -import { HEADER_TAG_LIST } from './constants'; +import { HEADER_TAG_LIST, KEEP_CLASS } from './constants'; import { normalizeSpaces } from '../text'; export default function cleanHeaders($article, $, title = '') { $(HEADER_TAG_LIST, $article).each((index, header) => { const $header = $(header); + + if ($(header).hasClass(KEEP_CLASS)) { + return $header; + } + // Remove any headers that appear before all other p tags in the // document. This probably means that it was part of the title, a // subtitle or something else extraneous like a datestamp or byline, diff --git a/src/utils/dom/clean-headers.test.js b/src/utils/dom/clean-headers.test.js index e330b3ee..c9abecf1 100644 --- a/src/utils/dom/clean-headers.test.js +++ b/src/utils/dom/clean-headers.test.js @@ -69,4 +69,24 @@ describe('cleanHeaders(article, $)', () => { ` ); }); + + it('keeps headers with keep class', () => { + const $ = cheerio.load(` +
+

Keep me

+

What do you think?

+
+ `); + + const result = cleanHeaders($('*').first(), $); + assertClean( + result.html(), + ` +
+

Keep me

+

What do you think?

+
+ ` + ); + }); });