Skip to content

Commit

Permalink
fix: Keep headings if they use the KEEP_CLASS
Browse files Browse the repository at this point in the history
  • Loading branch information
jocmp committed Jan 15, 2025
1 parent 3692b40 commit 97a6b1e
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 4 deletions.
6 changes: 3 additions & 3 deletions src/extractors/custom/www.androidauthority.com/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ export const WwwAndroidauthorityComExtractor = {
// remove if not following a paragraph. Adding this empty paragraph fixes it, and
// the empty paragraph will be removed anyway.
content: {
selectors: ['.d_Dd'],
selectors: ['.d_Dd', '.e_Ac'],
transforms: {
ol: node => {
node.attr('class', 'mercury-parser-keep');
},
h2: $node => $node.before('<p></p>'),
h3: $node => $node.before('<p></p>'),
h2: $node => $node.attr('class', 'mercury-parser-keep'),
h3: $node => $node.attr('class', 'mercury-parser-keep'),
},
clean: [
'.d_f .d_nr', // Lead image
Expand Down
7 changes: 6 additions & 1 deletion src/utils/dom/clean-headers.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
import { getWeight } from 'extractors/generic/content/scoring';

import { HEADER_TAG_LIST } from './constants';
import { HEADER_TAG_LIST, KEEP_CLASS } from './constants';
import { normalizeSpaces } from '../text';

export default function cleanHeaders($article, $, title = '') {
$(HEADER_TAG_LIST, $article).each((index, header) => {
const $header = $(header);

if ($(header).hasClass(KEEP_CLASS)) {
return $header;
}

// Remove any headers that appear before all other p tags in the
// document. This probably means that it was part of the title, a
// subtitle or something else extraneous like a datestamp or byline,
Expand Down
20 changes: 20 additions & 0 deletions src/utils/dom/clean-headers.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -69,4 +69,24 @@ describe('cleanHeaders(article, $)', () => {
`
);
});

it('keeps headers with keep class', () => {
// The heading appears before any paragraph, which would normally get it
// removed; the keep class must exempt it, so the markup should come back
// unchanged.
const markup = `
<div>
<h3 class="mercury-parser-keep">Keep me</h3>
<p>What do you think?</p>
</div>
`;
const $ = cheerio.load(markup);
const $root = $('*').first();

const $cleaned = cleanHeaders($root, $);

assertClean($cleaned.html(), markup);
});
});

0 comments on commit 97a6b1e

Please sign in to comment.