Skip to content

Commit

Permalink
feat: Keep headings if they use the KEEP_CLASS
Browse files Browse the repository at this point in the history
  • Loading branch information
jocmp committed Jan 14, 2025
1 parent bbf5434 commit 581cae0
Show file tree
Hide file tree
Showing 4 changed files with 30 additions and 8 deletions.
5 changes: 1 addition & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
# Mercury Parser Changelog

### v2.2.11 (Jan 13, 2025)

- [68e9b88a8d] - fix: androidauthority.com - Retain h3 tags (Josiah Campbell) [#41](https://github.com/jocmp/mercury-parser/pull/41)
- [c2bc68449f] - bump version v2.2.9 -> v2.2.10 (jocmp)
### v2.3.0

### v2.2.10 (Jan 11, 2025)

Expand Down
6 changes: 3 additions & 3 deletions src/extractors/custom/www.androidauthority.com/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ export const WwwAndroidauthorityComExtractor = {
// remove if not following a paragraph. Adding this empty paragraph fixes it, and
// the empty paragraph will be removed anyway.
content: {
selectors: ['.d_Dd'],
selectors: ['.d_Dd', '.e_Ac'],
transforms: {
ol: node => {
node.attr('class', 'mercury-parser-keep');
},
h2: $node => $node.before('<p></p>'),
h3: $node => $node.before('<p></p>'),
h2: $node => $node.attr('class', 'mercury-parser-keep'),
h3: $node => $node.attr('class', 'mercury-parser-keep'),
},
clean: [
'.d_f .d_nr', // Lead image
Expand Down
7 changes: 6 additions & 1 deletion src/utils/dom/clean-headers.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import { getWeight } from 'extractors/generic/content/scoring';

import { HEADER_TAG_LIST } from './constants';
import { HEADER_TAG_LIST, KEEP_CLASS } from './constants';
import { normalizeSpaces } from '../text';

export default function cleanHeaders($article, $, title = '') {
$(HEADER_TAG_LIST, $article).each((index, header) => {
const $header = $(header);

if ($(header).hasClass(KEEP_CLASS)) {
return $header;
}

// Remove any headers that appear before all other p tags in the
// document. This probably means that it was part of the title, a
// subtitle or something else extraneous like a datestamp or byline,
Expand Down
20 changes: 20 additions & 0 deletions src/utils/dom/clean-headers.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,24 @@ describe('cleanHeaders(article, $)', () => {
`
);
});

// Regression test for the KEEP_CLASS guard: a header carrying the
// `mercury-parser-keep` class must come out of cleanHeaders unchanged.
it('keeps headers with keep class', () => {
// The <h3> sits ahead of the only <p> in this fixture. Per the comment in
// clean-headers.js, headers that appear before all other <p> tags are the
// ones cleanHeaders removes, so this is the case the keep class must override.
const $ = cheerio.load(`
<div>
<h3 class="mercury-parser-keep">Keep me</h3>
<p>What do you think?</p>
</div>
`);

// The first element matched by `*` is passed as the article, `$` as the second argument.
const result = cleanHeaders($('*').first(), $);
// The expected markup is the same as the input markup: nothing was stripped.
assertClean(
result.html(),
`
<div>
<h3 class="mercury-parser-keep">Keep me</h3>
<p>What do you think?</p>
</div>
`
);
});
});

0 comments on commit 581cae0

Please sign in to comment.