From 3d6ae89ff5d5ebf6f19ccc53c10f6e0067530db7 Mon Sep 17 00:00:00 2001 From: Braden Hilton Date: Thu, 17 Oct 2024 15:44:23 +0100 Subject: [PATCH] [topstarnews] Fix article extraction --- extractor/topstarnews.py | 24 +++--------------------- test/results/topstarnews.py | 6 ++---- 2 files changed, 5 insertions(+), 25 deletions(-) diff --git a/extractor/topstarnews.py b/extractor/topstarnews.py index 20ec84d..3892673 100644 --- a/extractor/topstarnews.py +++ b/extractor/topstarnews.py @@ -62,28 +62,10 @@ def metadata(self, page): '"', )[0], ) - or text.parse_datetime( - text.extr( - page, - '', - "", - ) - .strip() - .split(" ", maxsplit=1)[1], - format="%Y.%m.%d %H:%M", - utcoffset=9, - ) - ), - "author": text.extr( - page, - '', - "", - ).strip(), - "views": text.parse_int( - text.extr(page, '', "").strip().split(" ", maxsplit=1)[1], ), + "author": text.extr(page, ' name="author" content="', '"').strip().replace(" 기자", ""), "post_id": self.post_id, - "post_url": self.url, + "post_url": self.post_url, } if ' name="keywords" content="' in page: data["tags"] = text.extr(page, ' name="keywords" content="', '"').split(",") @@ -97,7 +79,7 @@ def items(self): yield Message.Directory, data - article_body = text.extr(page, ' itemprop="articleBody">', '
") images = [ text.extr(figure, "") diff --git a/test/results/topstarnews.py b/test/results/topstarnews.py index 6ec36d6..2389d45 100644 --- a/test/results/topstarnews.py +++ b/test/results/topstarnews.py @@ -15,8 +15,7 @@ "date": "dt:2024-09-11 10:54:00", "title": "레드벨벳 웬디, ‘리본 하트 해달랬더니 근육 몽몽이가 돼버린 와니’ (웬디의 영스트리트 출근길)", "tags": ["웬디", "WENDY", "영스트리트", "출근", "퇴근", "프리뷰"], - "author": "최규석 기자", - "views": int, + "author": "최규석", "post_id": "15543685", "post_url": "https://www.topstarnews.net/news/articleView.html?idxno=15543685", }, @@ -29,8 +28,7 @@ "#count": 1, "date": "dt:2012-04-24 06:45:00", "title": "걸스데이(Girls Day) 혜리, '남자들은 다 똑같아!' 깜찍한 무대 …MBC MUSIC 쇼 챔피언 생방송 현장", - "author": "최규석 기자", - "views": int, + "author": "최규석", "post_id": "30789", "post_url": "https://www.topstarnews.net/news/articleView.html?idxno=30789", },