Skip to content

Commit

Permalink
Merge pull request #408 from extractus/8.0.14
Browse files (browse the repository at this point in the history)
v8.0.14
  • Loading branch information
ndaidong authored Oct 19, 2024
2 parents 5accfa6 + 2fee686 commit 6355d23
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 16 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"version": "8.0.13",
"version": "8.0.14",
"name": "@extractus/article-extractor",
"description": "To extract main article from given URL",
"homepage": "https://github.com/extractus/article-extractor",
Expand Down
17 changes: 8 additions & 9 deletions src/utils/extractLdSchema.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
// utils -> extractLdSchema.js

import { isArray, isObject, isString } from 'bellajs'

const typeSchemas = [
'aboutpage',
'checkoutpage',
Expand Down Expand Up @@ -53,25 +57,20 @@ const parseJson = (text) => {
*/
export default (document, entry) => {
const ldSchemas = document.querySelectorAll('script[type="application/ld+json"]')

ldSchemas.forEach(ldSchema => {
const ldJson = parseJson(ldSchema.textContent.replace(/[\n\r\t]/g, ''))
const isAllowedLdJsonType = typeSchemas.includes(ldJson['@type']?.toLowerCase())

if (ldJson && isAllowedLdJsonType) {
Object.entries(attributeLists).forEach(([key, attr]) => {
const isEntryAlreadyPopulated = typeof entry[key] !== 'undefined' && entry[key] !== ''

if (isEntryAlreadyPopulated || !ldJson[attr]) {
if (!entry[key] || !ldJson[attr]) {
return
}

const keyValue = ldJson[attr]
if (keyValue) {
entry[key] = Array.isArray(keyValue) ? keyValue[0] : keyValue
if (typeof entry[key] === 'string') {
entry[key] = entry[key].toLowerCase().trim()
}
const val = isArray(keyValue) ? keyValue[0] : isObject(keyValue) ? keyValue?.name || '' : keyValue
if (isString(val)) {
entry[key] = val.trim()
}
})
}
Expand Down
8 changes: 4 additions & 4 deletions src/utils/extractMetaData.js
Original file line number Diff line number Diff line change
Expand Up @@ -143,11 +143,11 @@ export default (html) => {
}
})

const entries = extractLdSchema(doc, entry)
const metadata = extractLdSchema(doc, entry)

if (!entries.published) {
entries.published = findDate(doc)
if (!metadata.published) {
metadata.published = findDate(doc) || ''
}

return entries
return metadata
}
4 changes: 2 additions & 2 deletions src/utils/findDate.js
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ export default function (doc) {
const match = element.textContent.match(pattern)
if (match) return convertDateFormat(match[0])
}
return null
return ''
}

const priorityElements = doc.querySelectorAll('time, [datetime], [itemprop~=datePublished], [itemprop~=dateCreated]')
Expand All @@ -53,5 +53,5 @@ export default function (doc) {
if (date) return date
}

return null
return ''
}

0 comments on commit 6355d23

Please sign in to comment.