Skip to content

Commit

Permalink
[topstarnews] Fix article extraction
Browse files (browse the repository at this point in the history)
  • Loading branch information
bradenhilton committed Oct 17, 2024
1 parent 67c0d65 commit 3d6ae89
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 25 deletions.
24 changes: 3 additions & 21 deletions extractor/topstarnews.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,28 +62,10 @@ def metadata(self, page):
'"',
)[0],
)
- or text.parse_datetime(
- text.extr(
- page,
- '<i class="fa fa-clock-o fa-fw"></i>',
- "</li>",
- )
- .strip()
- .split(" ", maxsplit=1)[1],
- format="%Y.%m.%d %H:%M",
- utcoffset=9,
- )
),
- "author": text.extr(
- page,
- '<i class="fa fa-user-o fa-fw"></i>',
- "</li>",
- ).strip(),
- "views": text.parse_int(
- text.extr(page, '<i class="fa fa-desktop fa-fw"></i>', "</li>").strip().split(" ", maxsplit=1)[1],
- ),
+ "author": text.extr(page, ' name="author" content="', '"').strip().replace(" 기자", ""),
"post_id": self.post_id,
- "post_url": self.url,
+ "post_url": self.post_url,
}
if ' name="keywords" content="' in page:
data["tags"] = text.extr(page, ' name="keywords" content="', '"').split(",")
Expand All @@ -97,7 +79,7 @@ def items(self):

yield Message.Directory, data

- article_body = text.extr(page, ' itemprop="articleBody">', '<div id="article-sns2"')
+ article_body = text.extr(page, 'itemprop="articleBody"', "</article>")

images = [
text.extr(figure, "<img", ">")
Expand Down
6 changes: 2 additions & 4 deletions test/results/topstarnews.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,7 @@
"date": "dt:2024-09-11 10:54:00",
"title": "레드벨벳 웬디, ‘리본 하트 해달랬더니 근육 몽몽이가 돼버린 와니’ (웬디의 영스트리트 출근길)",
"tags": ["웬디", "WENDY", "영스트리트", "출근", "퇴근", "프리뷰"],
- "author": "최규석 기자",
- "views": int,
+ "author": "최규석",
"post_id": "15543685",
"post_url": "https://www.topstarnews.net/news/articleView.html?idxno=15543685",
},
Expand All @@ -29,8 +28,7 @@
"#count": 1,
"date": "dt:2012-04-24 06:45:00",
"title": "걸스데이(Girls Day) 혜리, '남자들은 다 똑같아!' 깜찍한 무대 …MBC MUSIC 쇼 챔피언 생방송 현장",
- "author": "최규석 기자",
- "views": int,
+ "author": "최규석",
"post_id": "30789",
"post_url": "https://www.topstarnews.net/news/articleView.html?idxno=30789",
},
Expand Down

0 comments on commit 3d6ae89

Please sign in to comment.