From 2c9ee239e8da22a0d8c4c09e4487f49ac9b46711 Mon Sep 17 00:00:00 2001 From: andremacola Date: Fri, 18 Oct 2024 05:25:31 -0300 Subject: [PATCH 1/2] Improvements to find dates --- src/utils/extractMetaData.js | 6 +++ src/utils/extractMetaData.test.js | 26 +++++++++ src/utils/findDate.js | 57 ++++++++++++++++++++ test-data/regular-article-date-itemprop.html | 57 ++++++++++++++++++++ test-data/regular-article-date-span.html | 57 ++++++++++++++++++++ test-data/regular-article-date-time.html | 57 ++++++++++++++++++++ 6 files changed, 260 insertions(+) create mode 100644 src/utils/findDate.js create mode 100644 test-data/regular-article-date-itemprop.html create mode 100644 test-data/regular-article-date-span.html create mode 100644 test-data/regular-article-date-time.html diff --git a/src/utils/extractMetaData.js b/src/utils/extractMetaData.js index ede5f9b..88c19c4 100644 --- a/src/utils/extractMetaData.js +++ b/src/utils/extractMetaData.js @@ -2,6 +2,7 @@ import { DOMParser } from 'linkedom' import extractLdSchema from './extractLdSchema.js' +import findDate from './findDate.js' /** * @param {Element} node @@ -143,5 +144,10 @@ export default (html) => { }) const entries = extractLdSchema(doc, entry) + + if (!entries.published) { + entries.published = findDate(doc) + } + return entries } diff --git a/src/utils/extractMetaData.test.js b/src/utils/extractMetaData.test.js index 635acc0..f200d87 100644 --- a/src/utils/extractMetaData.test.js +++ b/src/utils/extractMetaData.test.js @@ -10,6 +10,12 @@ import extractMetaData from './extractMetaData.js' const keys = 'url shortlink amphtml canonical title description image author source published favicon type'.split(' ') +function isDateString (date) { + if (typeof date !== 'string') return false + const d = new Date(date) + return !isNaN(d.getTime()) +} + describe('test extractMetaData', () => { it('test extractMetaData(good content)', async () => { const html = readFileSync('./test-data/regular-article.html', 'utf8') @@ -28,4 +34,24 @@ describe('test extractMetaData', () => { assert.ok(hasProperty(result, k)) }) }) + + it('test extractMetaData(find date)', async () => { + const html1 = readFileSync('./test-data/regular-article-date-time.html', 'utf8') + const html2 = readFileSync('./test-data/regular-article-date-itemprop.html', 'utf8') + const html3 = readFileSync('./test-data/regular-article-date-span.html', 'utf8') + const result1 = extractMetaData(html1) + const result2 = extractMetaData(html2) + const result3 = extractMetaData(html3) + assert.ok(isObject(result1)) + assert.ok(isObject(result2)) + assert.ok(isObject(result3)) + keys.forEach((k) => { + assert.ok(hasProperty(result1, k)) + assert.ok(hasProperty(result3, k)) + assert.ok(hasProperty(result3, k)) + }) + assert.ok(isDateString(result1.published)) + assert.ok(isDateString(result2.published)) + assert.ok(isDateString(result3.published)) + }) }) diff --git a/src/utils/findDate.js b/src/utils/findDate.js new file mode 100644 index 0000000..fd617dd --- /dev/null +++ b/src/utils/findDate.js @@ -0,0 +1,57 @@ + +/** + * Convert date format to YYYY-MM-DD + * + * @param {string} dateString + * @returns {string} YYYY-MM-DD + */ +function convertDateFormat (dateString) { + const parts = dateString.split('/') + if (parts.length !== 3) return dateString + + let year, month, day + + if (parseInt(parts[0]) > 12) { + [day, month, year] = parts + } else { + [month, day, year] = parts + } + + year = year.length === 2 ? '20' + year : year + return `${year}-${month.padStart(2, '0')}-${day.padStart(2, '0')}T00:00:00` +} + +/** + * Look for the publication date in the body of the content. + * + * @param {Document} document - The HTML Document + * @returns {string} The date string + */ +export default function (doc) { + const datePatterns = [ + /\d{4}-\d{2}-\d{2}/, + /\d{1,2}\/\d{1,2}\/\d{2,4}/, + ] + + const findDate = (element) => { + for (const pattern of datePatterns) { + const match = element.textContent.match(pattern) + if (match) return convertDateFormat(match[0]) + } + return null + } + + const priorityElements = doc.querySelectorAll('time, [datetime], [itemprop~=datePublished], [itemprop~=dateCreated]') + for (const el of priorityElements) { + const date = el.getAttribute('datetime') || el.getAttribute('content') || findDate(el) + if (date) return date + } + + const secondaryElements = doc.querySelectorAll('p, span, div') + for (const el of secondaryElements) { + const date = findDate(el) + if (date) return date + } + + return null +} diff --git a/test-data/regular-article-date-itemprop.html b/test-data/regular-article-date-itemprop.html new file mode 100644 index 0000000..c23a4e9 --- /dev/null +++ b/test-data/regular-article-date-itemprop.html @@ -0,0 +1,57 @@ + + + + + + Article title here - ArticleParser + + + + + + + + + + + + + + + + + + + + + + + + + +
Page header here
+
+
+ +
+
+

Article title here

+
+ + +
Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.
+

+ Those cheetahs are nothing more than dogs. A watermelon is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.

+

The first fair dog is, in its own way, a lemon.

+
4746 Kelly Drive, West Virginia
+ +
+
+ +
+ + + diff --git a/test-data/regular-article-date-span.html b/test-data/regular-article-date-span.html new file mode 100644 index 0000000..e6c13fc --- /dev/null +++ b/test-data/regular-article-date-span.html @@ -0,0 +1,57 @@ + + + + + + Article title here - ArticleParser + + + + + + + + + + + + + + + + + + + + + + + + + +
Page header here
+
+
+ +
+
+

Article title here

+
+ Published at 11/09/2024 07h33 am + +
Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.
+

+ Those cheetahs are nothing more than dogs. A watermelon is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.

+

The first fair dog is, in its own way, a lemon.

+
4746 Kelly Drive, West Virginia
+ +
+
+ +
+ + + diff --git a/test-data/regular-article-date-time.html b/test-data/regular-article-date-time.html new file mode 100644 index 0000000..d1e1638 --- /dev/null +++ b/test-data/regular-article-date-time.html @@ -0,0 +1,57 @@ + + + + + + Article title here - ArticleParser + + + + + + + + + + + + + + + + + + + + + + + + + +
Page header here
+
+
+ +
+
+

Article title here

+
+ + +
Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.
+

+ Those cheetahs are nothing more than dogs. A watermelon is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.

+

The first fair dog is, in its own way, a lemon.

+
4746 Kelly Drive, West Virginia
+ +
+
+ +
+ + + From cc89afcae78658cb01c41fe9f28a82b955276955 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Fri, 18 Oct 2024 23:16:06 +0700 Subject: [PATCH 2/2] v8.0.13 - Merge pr #405 by @andremacola --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 7ed222e..af8403d 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "version": "8.0.12", + "version": "8.0.13", "name": "@extractus/article-extractor", "description": "To extract main article from given URL", "homepage": "https://github.com/extractus/article-extractor",