From 45b7c2b619e64d3095477ef5bb1c908ac379a19a Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 28 Sep 2023 17:14:17 +0800 Subject: [PATCH 1/4] get published date from time element --- packages/readabilityjs/Readability.js | 12 ++++++++++++ .../nytimes-podcasts/expected-metadata.json | 2 +- .../test/test-pages/nytimes-podcasts/expected.html | 2 -- .../test/test-pages/nytimes.com/expected.html | 2 -- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index abdbfa8d0e..65de682751 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -1081,6 +1081,18 @@ Readability.prototype = { } // we don't want to check for dates in the URL's if (node.tagName.toLowerCase() === 'a') return + // get the datetime from time element + if (node.tagName.toLowerCase() === 'time') { + const datetime = node.getAttribute('datetime') + if (datetime) { + const date = new Date(datetime) + if (!isNaN(date)) { + this._articlePublishedDate = date + return true + } + } + } + // Searching for the real date in the text content const content = node.textContent.trim() let dateFound diff --git a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json index 61fe09886c..d2bc065e6d 100644 --- a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json +++ b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json @@ -4,7 +4,7 @@ "dir": null, "excerpt": "The Sept. 27, 2022 episode of “The Ezra Klein Show”", "siteName": "fakehost", - "siteIcon": "/vi-assets/static-assets/favicon-d2483f10ef688e6f89e23806b9700298.ico", + "siteIcon": "http://fakehost/vi-assets/static-assets/favicon-d2483f10ef688e6f89e23806b9700298.ico", "previewImage": "https://static01.nyt.com/newsgraphics/images/icons/defaultPromoCrop.png", "publishedDate": "2022-09-27T16:25:17.221Z", "language": "English", diff --git a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html index 8d79daf8c2..725bef0943 100644 --- a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html +++ b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html @@ -5,8 +5,6 @@

The Ezra Klein Show

-

-

diff --git a/packages/readabilityjs/test/test-pages/nytimes.com/expected.html b/packages/readabilityjs/test/test-pages/nytimes.com/expected.html index ceb0aa925a..9777d24a69 100644 --- a/packages/readabilityjs/test/test-pages/nytimes.com/expected.html +++ b/packages/readabilityjs/test/test-pages/nytimes.com/expected.html @@ -30,8 +30,6 @@
-

-

From 310ad5de1df5b804298874461e9641ef26e90017 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 28 Sep 2023 17:35:33 +0800 Subject: [PATCH 2/4] get published date from url --- packages/readabilityjs/Readability.js | 24 +- .../test-pages/caixin/expected-metadata.json | 11 + .../test/test-pages/caixin/expected.html | 45 + .../test/test-pages/caixin/source.html | 2275 +++++++++++++++++ .../test/test-pages/caixin/url.txt | 1 + 5 files changed, 2354 insertions(+), 2 deletions(-) create mode 100644 packages/readabilityjs/test/test-pages/caixin/expected-metadata.json create mode 100644 packages/readabilityjs/test/test-pages/caixin/expected.html create mode 100644 packages/readabilityjs/test/test-pages/caixin/source.html create mode 100644 packages/readabilityjs/test/test-pages/caixin/url.txt diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js index 65de682751..2934841bd4 100644 --- a/packages/readabilityjs/Readability.js +++ b/packages/readabilityjs/Readability.js @@ -76,6 +76,22 @@ const extractPublishedDateFromAuthor = (author)=> { return [authorName, null]; }; +// extract published date from url if it's in the format of yyyy/mm/dd or yyyy-mm-dd +const extractPublishedDateFromUrl = (url) => { + if (!url) return null; + + const regex = /(\d{4})(\/|-)(\d{2})(\/|-)(\d{2})/i; + const match = url.match(regex); + if (match) { + const year = parseInt(match[1], 10); + const month = parseInt(match[3], 10) - 1; // January is 0 in JavaScript Date + const day = parseInt(match[5], 10); + + return new Date(year, month, day); + } + return null; +} + /** * Public constructor. * @param {Document} doc The document to parse. @@ -3068,7 +3084,11 @@ Readability.prototype = { return null; const byline = metadata.byline || this._articleByline; - const [author, publishedAt] = extractPublishedDateFromAuthor(byline); + const [author, publishedDateFromAuthor] = extractPublishedDateFromAuthor(byline); + const publishedDate = metadata.publishedDate || + extractPublishedDateFromUrl(this._documentURI) || + publishedDateFromAuthor || + this._articlePublishedDate; this._postProcessContent(articleContent); @@ -3104,7 +3124,7 @@ Readability.prototype = { siteName: metadata.siteName, siteIcon: metadata.siteIcon, previewImage: metadata.previewImage, - publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate, + publishedDate, language: this._getLanguage(metadata.locale || this._languageCode), }; } diff --git a/packages/readabilityjs/test/test-pages/caixin/expected-metadata.json b/packages/readabilityjs/test/test-pages/caixin/expected-metadata.json new file mode 100644 index 0000000000..b974abcc29 --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/expected-metadata.json @@ -0,0 +1,11 @@ +{ + "title": "途虎养车港交所挂牌 腾讯为最大外部股东", + "byline": "文|财新 余聪", + "dir": null, + "excerpt": "途虎养车 腾讯国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%", + "siteName": "fakehost", + "previewImage": "https://img.caixin.com/2023-09-26/169572084568190_560_373.jpg", + "publishedDate": "2023-09-25T16:00:00.000Z", + "language": "English", + "readerable": true +} diff --git a/packages/readabilityjs/test/test-pages/caixin/expected.html b/packages/readabilityjs/test/test-pages/caixin/expected.html new file mode 100644 index 0000000000..feb2717e52 --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/expected.html @@ -0,0 +1,45 @@ +
+
+
+

途虎养车港交所挂牌 腾讯为最大外部股东 +

+ + +
+ +

文|财新 余聪

+

2023年09月26日 17:22

+ + + +

试听

+
+

国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%

+
+
+

  【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。

+

  途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。

+
+ + +
+

+

登录 后获取已订阅的阅读权限

+ + + + + + +
+
+

+
+ + +

  推荐进入财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。

+

责任编辑:屈运栩 | 版面编辑:刘潇(ZN028)

+
+
\ No newline at end of file diff --git a/packages/readabilityjs/test/test-pages/caixin/source.html b/packages/readabilityjs/test/test-pages/caixin/source.html new file mode 100644 index 0000000000..cb9392fb1e --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/source.html @@ -0,0 +1,2275 @@ + + + + + + + + + + + + 途虎养车港交所挂牌 腾讯为最大外部股东_财新网_财新网 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+
+ + +
+
+
+ + +
财新传媒 + + +
+
+
+
+ 财新网 > 汽车 > 正文 +
+ +
+ +
+
+ +
+
+ + +
+ +
+ +
+
+
+ +
+ +
+
+ +
+ + + + +
+
+

+ 途虎养车港交所挂牌 腾讯为最大外部股东 +

+ +
+ +
+ 文|财新 余聪 +
+ 2023年09月26日 17:22 + + + 试听 +
+
+ 国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9% +
+
+
+
+
+ +
+
+ 上海,一处途虎养车门店。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。图:Qilai Shen/视觉中国 +
+
+
+
+ + +
+ + +
+

+   【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。 +

+

+   途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。 +

+
+
+
+
+ + +
+
+ +
+
+ + +
+ 登录 后获取已订阅的阅读权限 +
+
+
+ 财新通会员
+ 可畅读全文 +
订阅/会员升级 +
+
+
+
+
+
+ 请朋友免费读财新 +
+
+
+
+ +
+
+
+ + + + +
+
+
+ + +
+ +
+
+ +
+
+ +
+
+
+
+
+
+ +
+ +
+

+   推荐进入财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。 +

+
+
+
+ 责任编辑:屈运栩 | 版面编辑:刘潇(ZN028) +
+ + +
+ +
+ +
+
+ 话题: +
+
+ #港交所+关注 +
+
+ #腾讯+关注 +
+
+ #京东+关注 +
+
+
+ +
+ +
+ +
+ +
+
+ + + +
+ +
+ + + + +
+ + +
+ +
+ +
+

+ 图片推荐 +

+
+ + +
+ +
+
+ +
+
+ + + + +
+ +
+
+ +
+ +
+ +
+ +
+
+ +
+
+ + +
+
+
+ 财新网主编精选版电邮 + 样例 +
+
+ 财新网新闻版电邮全新升级!财新网主编精心编写,每个工作日定时投递,篇篇重磅,可信可引。 +
+
+ 订阅 +
+
+
+ + + +
+ + + + + + +
+

+ 视频 +

+
+ +
+
 + + + + +
+
+
+ +
+
+ + + + + + +
+ + + + + + + + + + + + + + + + +
+
+
+

+ +

+
+ +
+ + + diff --git a/packages/readabilityjs/test/test-pages/caixin/url.txt b/packages/readabilityjs/test/test-pages/caixin/url.txt new file mode 100644 index 0000000000..dbae9da787 --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/url.txt @@ -0,0 +1 @@ +https://www.caixin.com/2023-09-26/102112537.html \ No newline at end of file From 001403c02dae058d5ed800e3f12f172d1973d9f8 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 28 Sep 2023 17:36:27 +0800 Subject: [PATCH 3/4] fix tests --- packages/readabilityjs/test/test-pages/caixin/expected.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/readabilityjs/test/test-pages/caixin/expected.html b/packages/readabilityjs/test/test-pages/caixin/expected.html index feb2717e52..1d85d1b14e 100644 --- a/packages/readabilityjs/test/test-pages/caixin/expected.html +++ b/packages/readabilityjs/test/test-pages/caixin/expected.html @@ -14,7 +14,7 @@

途虎养车港交所挂牌 腾讯为最大外部股东 试听

+ 试听

国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%

@@ -42,4 +42,4 @@

途虎养车港交所挂牌 腾讯为最大外部股东 财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。

责任编辑:屈运栩 | 版面编辑:刘潇(ZN028)

- \ No newline at end of file + From 2b23b0e002cceb61f99c8d71916f3e3c3ea33932 Mon Sep 17 00:00:00 2001 From: sywhb Date: Thu, 28 Sep 2023 09:37:48 +0000 Subject: [PATCH 4/4] Update generated html --- packages/readabilityjs/test/index.html | 6 ++++++ .../test/test-pages/caixin/distiller.html | 20 +++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 packages/readabilityjs/test/test-pages/caixin/distiller.html diff --git a/packages/readabilityjs/test/index.html b/packages/readabilityjs/test/index.html index 22cb8dbef3..e997271df7 100644 --- a/packages/readabilityjs/test/index.html +++ b/packages/readabilityjs/test/index.html @@ -20,6 +20,12 @@ [dom-distiller] +
  • caixin
    + [source] + [readability] + [dom-distiller] +
  • +
  • news.utexas
    [source] [readability] diff --git a/packages/readabilityjs/test/test-pages/caixin/distiller.html b/packages/readabilityjs/test/test-pages/caixin/distiller.html new file mode 100644 index 0000000000..adae61ed12 --- /dev/null +++ b/packages/readabilityjs/test/test-pages/caixin/distiller.html @@ -0,0 +1,20 @@ +

    +   【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。 +

    +   途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。 +

    + + +
    + 后获取已订阅的阅读权限 +
    + 财新通会员
    + 可畅读全文 +
    +

    +   推荐进入财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。 +

    + 责任编辑:屈运栩 | 版面编辑:刘潇(ZN028) +
    + 话题: +
    \ No newline at end of file