Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(marticle): updates a use case where ul nodes could contain spans and be within a div #934

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
245 changes: 245 additions & 0 deletions servers/parser-graphql-wrapper/src/marticle/marticleParser.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,251 @@ describe('MarticleParser', () => {
expect(res).toEqual(expected);
});

it('should parse unorded lists within a div, that has invalid data in the ul and return unmarseable', () => {
  // A stray <span> sits between the <li> children of the <ul>.
  const input = `<div lang="en">
<ul>
<li>1-2.</li>
<span>test</span>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;

  // Every <li> is expected as a level-0 row, in document order.
  const listRows = ['1-2.', '2-2.', '3-2.', '4-2.', '5-2.', '6-2.'].map(
    (content) => ({ level: 0, content }),
  );
  const bulletedList = { __typename: 'MarticleBulletedList', rows: listRows };
  // The stray <span> is expected after the list, as an UnMarseable.
  const strayMarkup = { __typename: 'UnMarseable', html: '<span>test</span>' };

  const parsed = marticleParser.parse(input);
  expect(parsed).toEqual([bulletedList, strayMarkup]);
});

// Input: a nested <ul> whose <li> is followed by a stray <span>.
// NOTE(review): the title says "return unmarseable", but the expectation
// below lists only list rows and contains no UnMarseable entry.
it('should parse unorded lists within a div, that has invalid data in a nested ul and span after li and return unmarseable', () => {
const input = `<div lang="en">
<ul>
<li>1-2.</li>
<ul>
<li>1-2-a</li>
<span>test</span>
</ul>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;
const res = marticleParser.parse(input);
// Expected: one bulleted list; the nested <li> appears as a level-1 row.
const expected = [
{
__typename: 'MarticleBulletedList',
rows: [
{
level: 0,
content: '1-2.',
},
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The schema doesn't allow returning unmarseable in lists so we'll have to update the caller function to remove this or pop it out

level: 1,
content: '1-2-a',
},
{
level: 0,
content: '2-2.',
},
{
level: 0,
content: '3-2.',
},
{
level: 0,
content: '4-2.',
},
{
level: 0,
content: '5-2.',
},
{
level: 0,
content: '6-2.',
},
],
},
];
expect(res).toEqual(expected);
});

it('should parse unorded lists within a div, that has invalid data in a nested ul and span before li and return unmarseable', () => {
  // The nested <ul> starts with a stray <span> ahead of its only <li>.
  const input = `<div lang="en">
<ul>
<li>1-2.</li>
<ul>
<span>test</span>
<li>1-2-a</li>
</ul>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;

  // Small builder so the expected rows read as a table of (level, content).
  const row = (level: number, content: string) => ({ level, content });
  const bulletedList = {
    __typename: 'MarticleBulletedList',
    rows: [
      row(0, '1-2.'),
      row(1, '1-2-a'),
      row(0, '2-2.'),
      row(0, '3-2.'),
      row(0, '4-2.'),
      row(0, '5-2.'),
      row(0, '6-2.'),
    ],
  };

  // Only the list is expected; no entry for the <span> appears in the result.
  const parsed = marticleParser.parse(input);
  expect(parsed).toEqual([bulletedList]);
});

it('should parse unorded lists within a div, that has invalid data in a ul and return unmarseable', () => {
  // The stray <span> is the first child of the <ul>, ahead of every <li>.
  const input = `<div lang="en">
<ul>
<span>test</span>
<li>1-2.</li>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;

  // Six level-0 rows, one per <li>, in document order.
  const rowContents = ['1-2.', '2-2.', '3-2.', '4-2.', '5-2.', '6-2.'];
  const expectedComponents = [
    {
      __typename: 'MarticleBulletedList',
      rows: rowContents.map((content) => ({ level: 0, content })),
    },
    // The <span> is expected after the list even though it precedes the rows in the input.
    {
      __typename: 'UnMarseable',
      html: '<span>test</span>',
    },
  ];

  const parsed = marticleParser.parse(input);
  expect(parsed).toEqual(expectedComponents);
});

it('should parse unorded lists within a div, that has invalid data in a nested ul and return unmarseable', () => {
  // The nested <ul> has a stray <span> between its two <li> children.
  const input = `<div lang="en">
<ul>
<li>1-2.</li>
<ul>
<li>1-2-a-i</li>
<span>test</span>
<li>1-2-a</li>
</ul>
<li>2-2.</li>
<li>3-2.</li>
<li>4-2.</li>
<li>5-2.</li>
<li>6-2.</li>
</ul>
</div>`;

  // (level, content) pairs in the order the rows are expected.
  const expectedRows: Array<[number, string]> = [
    [0, '1-2.'],
    [1, '1-2-a-i'],
    [1, '1-2-a'],
    [0, '2-2.'],
    [0, '3-2.'],
    [0, '4-2.'],
    [0, '5-2.'],
    [0, '6-2.'],
  ];
  const bulletedList = {
    __typename: 'MarticleBulletedList',
    rows: expectedRows.map(([level, content]) => ({ level, content })),
  };

  // Only the list is expected; both nested rows are kept at level 1.
  const parsed = marticleParser.parse(input);
  expect(parsed).toEqual([bulletedList]);
});

it('should parse rogue <li>s', () => {
const input =
'<div>' +
Expand Down
97 changes: 65 additions & 32 deletions servers/parser-graphql-wrapper/src/marticle/marticleParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import turndownService from './turndown';
import TurndownService from 'turndown';
import { config } from './config';
import { serverLogger } from '@pocket-tools/ts-logger';
import { isArray } from 'node:util';

export const videoTypeMap = {
1: VideoType.Youtube,
Expand Down Expand Up @@ -109,6 +110,24 @@ const unMarseableTransformers = unMarseableComponents.reduce(
{},
);

/**
 * Fallback transformer for when there is an error processing a list element.
 *
 * Serializes the given node (or each node of an array of nodes) to its outer
 * HTML, detaches every node from its parent, and wraps the collected markup in
 * an `UnMarseable` component.
 *
 * @param root - A single DOM node, or an array of DOM nodes, to convert.
 * @returns An `UnMarseable` whose `html` is the concatenated outer HTML of the
 *   node(s), trimmed of leading and trailing whitespace.
 */
function listErrorTransformer(root: Node | Node[]): UnMarseable {
  // Array.isArray is the standard check; util.isArray is deprecated in Node.js.
  const nodes = Array.isArray(root) ? root : [root];
  let html = '';
  for (const node of nodes) {
    // Only elements expose outerHTML. Other node types contribute nothing
    // rather than the literal text "undefined".
    html += (node as Element).outerHTML ?? '';
    // This runs from an error handler, so it must not throw itself:
    // a node that is already detached has no parent to remove it from.
    node.parentNode?.removeChild(node);
  }
  return {
    __typename: 'UnMarseable',
    html: html.trim(),
  };
}

// Methods for transforming a subtree of the DOM that represents
// an article into one or more MarticleComponents.
// To avoid many if/else statements, create a map of root tag
Expand Down Expand Up @@ -187,41 +206,55 @@ const transformers = {
// Lists can be broken up, so the transformer can return any kind
// of Marticle* component ( + lists).
// Kind of cheating on types for documentation purposes
UL: (root: Node, article: Item): MarticleElement[] => {
const { output, aggFrom } = listTransformer(
root,
[],
'UL',
undefined,
article,
);
// Result might contain rows that need to be aggregated into a single
// MarticleBulletedList
if (aggFrom != null) {
const aggOutput = output.splice(aggFrom) as ListElement[];
output.push({
__typename: 'MarticleBulletedList',
rows: aggOutput,
});
UL: (root: Node, article: Item): MarticleElement[] | UnMarseable => {
try {
const { output, aggFrom } = listTransformer(
root,
[],
'UL',
undefined,
article,
);
// Result might contain rows that need to be aggregated into a single
// MarticleBulletedList
if (aggFrom != null) {
const aggOutput = output.splice(aggFrom) as ListElement[];
output.push({
__typename: 'MarticleBulletedList',
rows: aggOutput.filter(
(row) => !Object.keys(row).includes('__typename'), // remove unmarseable rows
),
});
}
return output as MarticleElement[];
} catch (err) {
serverLogger.error('Error processing UL list', { item: article, err });
return listErrorTransformer(root);
}
return output as MarticleElement[];
},
OL: (root: Node, article: Item): MarticleElement[] => {
const { output, aggFrom } = listTransformer(
root,
[],
'OL',
undefined,
article,
);
if (aggFrom != null) {
const aggOutput = output.splice(aggFrom) as NumberedListElement[];
output.push({
__typename: 'MarticleNumberedList',
rows: aggOutput,
});
OL: (root: Node, article: Item): MarticleElement[] | UnMarseable => {
try {
const { output, aggFrom } = listTransformer(
root,
[],
'OL',
undefined,
article,
);
if (aggFrom != null) {
const aggOutput = output.splice(aggFrom) as NumberedListElement[];
output.push({
__typename: 'MarticleNumberedList',
rows: aggOutput.filter(
(row) => !Object.keys(row).includes('__typename'), // remove unmarseable rows
),
});
}
return output as MarticleElement[];
} catch (err) {
serverLogger.error('Error processing OL list', { item: article, err });
return listErrorTransformer(root);
}
return output as MarticleElement[];
},
LI: (
children: Node[],
Expand Down
Loading