From 45b7c2b619e64d3095477ef5bb1c908ac379a19a Mon Sep 17 00:00:00 2001
From: Hongbo Wu
Date: Thu, 28 Sep 2023 17:14:17 +0800
Subject: [PATCH 1/4] get published date from time element
---
packages/readabilityjs/Readability.js | 12 ++++++++++++
.../nytimes-podcasts/expected-metadata.json | 2 +-
.../test/test-pages/nytimes-podcasts/expected.html | 2 --
.../test/test-pages/nytimes.com/expected.html | 2 --
4 files changed, 13 insertions(+), 5 deletions(-)
diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js
index abdbfa8d0e..65de682751 100644
--- a/packages/readabilityjs/Readability.js
+++ b/packages/readabilityjs/Readability.js
@@ -1081,6 +1081,18 @@ Readability.prototype = {
}
// we don't want to check for dates in the URL's
if (node.tagName.toLowerCase() === 'a') return
+ // get the datetime from time element
+ if (node.tagName.toLowerCase() === 'time') {
+ const datetime = node.getAttribute('datetime')
+ if (datetime) {
+ const date = new Date(datetime)
+ if (!isNaN(date)) {
+ this._articlePublishedDate = date
+ return true
+ }
+ }
+ }
+
// Searching for the real date in the text content
const content = node.textContent.trim()
let dateFound
diff --git a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json
index 61fe09886c..d2bc065e6d 100644
--- a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json
+++ b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected-metadata.json
@@ -4,7 +4,7 @@
"dir": null,
"excerpt": "The Sept. 27, 2022 episode of “The Ezra Klein Show”",
"siteName": "fakehost",
- "siteIcon": "/vi-assets/static-assets/favicon-d2483f10ef688e6f89e23806b9700298.ico",
+ "siteIcon": "http://fakehost/vi-assets/static-assets/favicon-d2483f10ef688e6f89e23806b9700298.ico",
"previewImage": "https://static01.nyt.com/newsgraphics/images/icons/defaultPromoCrop.png",
"publishedDate": "2022-09-27T16:25:17.221Z",
"language": "English",
diff --git a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html
index 8d79daf8c2..725bef0943 100644
--- a/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html
+++ b/packages/readabilityjs/test/test-pages/nytimes-podcasts/expected.html
@@ -5,8 +5,6 @@
The Ezra Klein Show
-
-
diff --git a/packages/readabilityjs/test/test-pages/nytimes.com/expected.html b/packages/readabilityjs/test/test-pages/nytimes.com/expected.html
index ceb0aa925a..9777d24a69 100644
--- a/packages/readabilityjs/test/test-pages/nytimes.com/expected.html
+++ b/packages/readabilityjs/test/test-pages/nytimes.com/expected.html
@@ -30,8 +30,6 @@
-
-
From 310ad5de1df5b804298874461e9641ef26e90017 Mon Sep 17 00:00:00 2001
From: Hongbo Wu
Date: Thu, 28 Sep 2023 17:35:33 +0800
Subject: [PATCH 2/4] get published date from url
---
packages/readabilityjs/Readability.js | 24 +-
.../test-pages/caixin/expected-metadata.json | 11 +
.../test/test-pages/caixin/expected.html | 45 +
.../test/test-pages/caixin/source.html | 2275 +++++++++++++++++
.../test/test-pages/caixin/url.txt | 1 +
5 files changed, 2354 insertions(+), 2 deletions(-)
create mode 100644 packages/readabilityjs/test/test-pages/caixin/expected-metadata.json
create mode 100644 packages/readabilityjs/test/test-pages/caixin/expected.html
create mode 100644 packages/readabilityjs/test/test-pages/caixin/source.html
create mode 100644 packages/readabilityjs/test/test-pages/caixin/url.txt
diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js
index 65de682751..2934841bd4 100644
--- a/packages/readabilityjs/Readability.js
+++ b/packages/readabilityjs/Readability.js
@@ -76,6 +76,22 @@ const extractPublishedDateFromAuthor = (author)=> {
return [authorName, null];
};
+// Extract a published date from a url containing yyyy/mm/dd or yyyy-mm-dd; returns null when absent or invalid.
+const extractPublishedDateFromUrl = (url) => {
+  if (typeof url !== 'string' || !url) return null;
+  // Digit boundaries keep us out of longer numbers (e.g. article ids); \2 requires both separators to agree.
+  const regex = /(?:^|\D)(\d{4})([\/-])(\d{2})\2(\d{2})(?!\d)/;
+  const match = url.match(regex);
+  if (!match) return null;
+  const year = parseInt(match[1], 10);
+  const month = parseInt(match[3], 10) - 1; // January is 0 in JavaScript Date
+  const day = parseInt(match[4], 10);
+  const date = new Date(year, month, day); // local midnight, as before (instant depends on host time zone)
+  // Date silently rolls over out-of-range parts (month 13, Feb 30), so accept only an exact round trip.
+  const isRealDate = date.getFullYear() === year && date.getMonth() === month && date.getDate() === day;
+  return isRealDate ? date : null;
+};
+
/**
* Public constructor.
* @param {Document} doc The document to parse.
@@ -3068,7 +3084,11 @@ Readability.prototype = {
return null;
const byline = metadata.byline || this._articleByline;
- const [author, publishedAt] = extractPublishedDateFromAuthor(byline);
+ const [author, publishedDateFromAuthor] = extractPublishedDateFromAuthor(byline);
+ const publishedDate = metadata.publishedDate ||
+ extractPublishedDateFromUrl(this._documentURI) ||
+ publishedDateFromAuthor ||
+ this._articlePublishedDate;
this._postProcessContent(articleContent);
@@ -3104,7 +3124,7 @@ Readability.prototype = {
siteName: metadata.siteName,
siteIcon: metadata.siteIcon,
previewImage: metadata.previewImage,
- publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate,
+ publishedDate,
language: this._getLanguage(metadata.locale || this._languageCode),
};
}
diff --git a/packages/readabilityjs/test/test-pages/caixin/expected-metadata.json b/packages/readabilityjs/test/test-pages/caixin/expected-metadata.json
new file mode 100644
index 0000000000..b974abcc29
--- /dev/null
+++ b/packages/readabilityjs/test/test-pages/caixin/expected-metadata.json
@@ -0,0 +1,11 @@
+{
+ "title": "途虎养车港交所挂牌 腾讯为最大外部股东",
+ "byline": "文|财新 余聪",
+ "dir": null,
+ "excerpt": "途虎养车 腾讯国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%",
+ "siteName": "fakehost",
+ "previewImage": "https://img.caixin.com/2023-09-26/169572084568190_560_373.jpg",
+ "publishedDate": "2023-09-25T16:00:00.000Z",
+ "language": "English",
+ "readerable": true
+}
diff --git a/packages/readabilityjs/test/test-pages/caixin/expected.html b/packages/readabilityjs/test/test-pages/caixin/expected.html
new file mode 100644
index 0000000000..feb2717e52
--- /dev/null
+++ b/packages/readabilityjs/test/test-pages/caixin/expected.html
@@ -0,0 +1,45 @@
+
+
+
+
途虎养车港交所挂牌 腾讯为最大外部股东
+
+
+
+
+
+
文|财新 余聪
+
2023年09月26日 17:22
+
+
+
+
试听
+
+
国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%
+
+
+
【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。
+
途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。
+
+
+
+
+
+
登录 后获取已订阅的阅读权限
+
+
+
+
+
+
+
+
+
+
+
+
+
推荐进入财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。
+
责任编辑:屈运栩 | 版面编辑:刘潇(ZN028)
+
+
\ No newline at end of file
diff --git a/packages/readabilityjs/test/test-pages/caixin/source.html b/packages/readabilityjs/test/test-pages/caixin/source.html
new file mode 100644
index 0000000000..cb9392fb1e
--- /dev/null
+++ b/packages/readabilityjs/test/test-pages/caixin/source.html
@@ -0,0 +1,2275 @@
+
+
+
+
+
+
+
+
+
+
+
+ 途虎养车港交所挂牌 腾讯为最大外部股东_财新网_财新网
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -
+ 发表评论
+
+ -
+
+
+
分享到微信朋友圈
+
+ -
+ 新浪转发
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 途虎养车港交所挂牌 腾讯为最大外部股东
+
+
+
2023-09-26 17:22:53来源: 财新网作者:余聪责任编辑:屈运栩
+
+
+
+
+ 文|财新 余聪
+
+ 2023年09月26日 17:22
+
+
+
试听
+
+
+ 国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%
+
+
+
+
+
+
+
+
+
+
+
+ 【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。
+
+
+ 途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。
+
+
+
+
+
+
+
+
+
+
+
页面加载中...
+
+
+
+
+
+
+
+
+ 财新通会员
+ 可畅读全文
+
订阅/会员升级
+
+
+
+
+
+
+
+
+
+
+
+
+ 试听
+
+
+ 途虎养车港交所挂牌 腾讯为最大外部股东
+
+
+
+
+ -
+ 音频:
+
+ -
+
+
+ 试听
+
+
+ 途虎养车港交所挂牌 腾讯为最大外部股东
+
+
+
+
+
+
+
+
+
+ 开通财新通音频版、立享完整音频
+
+
+ 音频是独立的收费产品,订阅财新通音频版即可拥有所有财新通文章的文字阅读权限以及音频收听权限,随时随地畅听无阻。
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 推荐进入财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。
+
+
+
+
+ 责任编辑:屈运栩 | 版面编辑:刘潇(ZN028)
+
+
+
+
+
+
+
+
+ -
+ 话题:
+
+ -
+ #港交所+关注
+
+ -
+ #腾讯+关注
+
+ -
+ #京东+关注
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ -
+ 酒类网销月报
+
+
+ -
+ 光伏观察
+
+
+ -
+ 草根调研
+
+
+
+
+
+
+ -
+
+
+
+ -
+
+
+
+ -
+
+
热点人物
+
+
+ 朱加麟
+
+
+ 中融人寿保险股份有限公司代理董事长、副董事长
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 财新网新闻版电邮全新升级!财新网主编精心编写,每个工作日定时投递,篇篇重磅,可信可引。
+
+
+ 订阅
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/packages/readabilityjs/test/test-pages/caixin/url.txt b/packages/readabilityjs/test/test-pages/caixin/url.txt
new file mode 100644
index 0000000000..dbae9da787
--- /dev/null
+++ b/packages/readabilityjs/test/test-pages/caixin/url.txt
@@ -0,0 +1 @@
+https://www.caixin.com/2023-09-26/102112537.html
\ No newline at end of file
From 001403c02dae058d5ed800e3f12f172d1973d9f8 Mon Sep 17 00:00:00 2001
From: Hongbo Wu
Date: Thu, 28 Sep 2023 17:36:27 +0800
Subject: [PATCH 3/4] fix tests
---
packages/readabilityjs/test/test-pages/caixin/expected.html | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/packages/readabilityjs/test/test-pages/caixin/expected.html b/packages/readabilityjs/test/test-pages/caixin/expected.html
index feb2717e52..1d85d1b14e 100644
--- a/packages/readabilityjs/test/test-pages/caixin/expected.html
+++ b/packages/readabilityjs/test/test-pages/caixin/expected.html
@@ -14,7 +14,7 @@ 途虎养车港交所挂牌 腾讯为最大外部股东 试听
+ 试听
国内汽车服务市场高度分散,2022年,途虎养车取得汽车服务收入115亿元,市场份额0.9%
@@ -42,4 +42,4 @@ 途虎养车港交所挂牌 腾讯为最大外部股东 财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。
责任编辑:屈运栩 | 版面编辑:刘潇(ZN028)
-
\ No newline at end of file
+
From 2b23b0e002cceb61f99c8d71916f3e3c3ea33932 Mon Sep 17 00:00:00 2001
From: sywhb
Date: Thu, 28 Sep 2023 09:37:48 +0000
Subject: [PATCH 4/4] Update generated html
---
packages/readabilityjs/test/index.html | 6 ++++++
.../test/test-pages/caixin/distiller.html | 20 +++++++++++++++++++
2 files changed, 26 insertions(+)
create mode 100644 packages/readabilityjs/test/test-pages/caixin/distiller.html
diff --git a/packages/readabilityjs/test/index.html b/packages/readabilityjs/test/index.html
index 22cb8dbef3..e997271df7 100644
--- a/packages/readabilityjs/test/index.html
+++ b/packages/readabilityjs/test/index.html
@@ -20,6 +20,12 @@
[dom-distiller]
+ caixin
+ [source]
+ [readability]
+ [dom-distiller]
+
+
news.utexas
[source]
[readability]
diff --git a/packages/readabilityjs/test/test-pages/caixin/distiller.html b/packages/readabilityjs/test/test-pages/caixin/distiller.html
new file mode 100644
index 0000000000..adae61ed12
--- /dev/null
+++ b/packages/readabilityjs/test/test-pages/caixin/distiller.html
@@ -0,0 +1,20 @@
+
+ 【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。
+
+ 途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。
+
+
+
+
+ 后获取已订阅的阅读权限
+
+ 财新通会员
+ 可畅读全文
+
+
+ 推荐进入财新数据库,可随时查阅公司股价走势、结构人员变化等投资信息。
+
+ 责任编辑:屈运栩 | 版面编辑:刘潇(ZN028)
+
+ 话题:
+
\ No newline at end of file