Skip to content

Commit

Permalink
[imbc] Add article extractors
Browse files Browse the repository at this point in the history
  • Loading branch information
bradenhilton committed Sep 23, 2024
1 parent cf70375 commit 30dbb2b
Show file tree
Hide file tree
Showing 2 changed files with 193 additions and 0 deletions.
157 changes: 157 additions & 0 deletions extractor/imbc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
"""Extractors for https://imbc.com"""

import re
from http import HTTPStatus

from gallery_dl import exception, text
from gallery_dl.extractor.common import Extractor, Message


class ImbcExtractor(Extractor):
category = "imbc"
filename_fmt = "{filename}.{extension}"

def _call(self, url, params=None):
if params is None:
params = {}
while True:
response = self.request(url, params=params, fatal=None, allow_redirects=False)
if response.status_code < HTTPStatus.MULTIPLE_CHOICES:
return response.text
if response.status_code == HTTPStatus.UNAUTHORIZED:
raise exception.AuthenticationError from None
if response.status_code == HTTPStatus.FORBIDDEN:
raise exception.AuthorizationError from None
if response.status_code == HTTPStatus.NOT_FOUND:
raise exception.NotFoundError(self.subcategory) from None
self.log.debug(response.text)
msg = "Request failed"
raise exception.StopExtraction(msg)

def metadata(self, page):
return {
"title": text.unescape(text.extr(page, "<title>", "</title>")),
"article_id": self.article_id,
"post_url": text.extr(page, 'proptery="og:url" content="', '"'),
}


class ImbcAdenewsArticleExtractor(ImbcExtractor):
"""Extractors for articles on adenews.imbc.com"""

subcategory = "adenews-article"
root = "https://adenews.imbc.com"
directory_fmt = ("{category}", "{subcategory}", "{article_id}")
archive_fmt = "{category}_{subcategory}_{article_id}_{filename}"
pattern = r"(?:https?://)?adenews\.imbc\.com/M/Detail/(\d+)"
example = "https://adenews.imbc.com/M/Detail/12345"

def __init__(self, match):
ImbcExtractor.__init__(self, match)
self.article_id = match.group(1)

def items(self):
page = self._call(self.url)
data = self.metadata(page)
data["post_url"] = f"{self.root}/M/Detail/{self.article_id}"
data["date"] = text.parse_datetime(
text.extr(text.extr(page, '<div class="date">', "</div>"), "<span>", "</span>"),
format="%Y-%m-%d %H:%M",
utcoffset=9,
)

article_content = text.extr(page, "<!-- 기사 상세 Start -->", "<!-- 네이티브 End -->")
urls = [
text.ensure_http_scheme(text.extr(image, 'src="', '"'))
for image in text.extract_iter(article_content, "<img", ">")
]

yield Message.Directory, data

for data["num"], url in enumerate(urls, 1):
image = {"url": url}
data["image"] = image
text.nameext_from_url(url, data)
yield Message.Url, url, data


class ImbcEnewsArticleExtractor(ImbcExtractor):
"""Extractors for articles on enews.imbc.com"""

subcategory = "enews-article"
root = "https://enews.imbc.com"
directory_fmt = ("{category}", "{subcategory}", "{article_id}")
archive_fmt = "{category}_{subcategory}_{article_id}_{filename}"
pattern = r"(?:https?://)?enews\.imbc\.com/News/RetrieveNewsInfo/(\d+)"
example = "https://enews.imbc.com/News/RetrieveNewsInfo/12345"

def __init__(self, match):
ImbcExtractor.__init__(self, match)
self.article_id = match.group(1)

def items(self):
page = self._call(self.url)
data = self.metadata(page)
data["date"] = text.parse_datetime(
re.sub(r"Z$", "", text.extr(page, 'property="article:published_time" content="', '"')),
format="%Y-%m-%dT%H:%M:%S",
utcoffset=9,
)

article_content = text.extr(page, "<!-- 기사 본문 내용 Start -->", "<!-- 기사 본문 내용 End -->")
urls = [
text.ensure_http_scheme(text.extr(image, 'src="', '"'))
for image in text.extract_iter(article_content, "<img", ">")
]

yield Message.Directory, data

for data["num"], url in enumerate(urls, 1):
image = {"url": url}
data["image"] = image
text.nameext_from_url(url, data)
yield Message.Url, url, data


class ImbcImnewsArticleExtractor(ImbcExtractor):
"""Extractors for articles on imnews.imbc.com"""

subcategory = "imnews-article"
root = "https://imnews.imbc.com"
directory_fmt = (
"{category}",
"{subcategory}",
"{article_id}",
)
archive_fmt = "{category}_{subcategory}_{article_id}_{filename}"
pattern = r"(?:https?://)?imnews\.imbc\.com/news/\d+/\w+/article/(\d+_\d+).html"
example = "https://imnews.imbc.com/news/2024/enter/article/12345_67890.html"

def __init__(self, match):
ImbcExtractor.__init__(self, match)
self.article_id = match.group(1)

def items(self):
page = self._call(self.url)
data = self.metadata(page)
data["date"] = text.parse_datetime(
text.extr(page, 'property="article:published_time" content="', '"'), format="%Y-%m-%dT%H:%M:%S%z"
)

article_content = text.extr(
page,
"<!-- // 전체재생 레이어 -->",
"<!-- iMBC 연예 데이블 본문 바로 밑 -->",
)
urls = [
text.ensure_http_scheme(text.extr(image, 'src="', '"'))
for image in text.extract_iter(article_content, "<img", ">")
]

yield Message.Directory, data

for data["num"], url in enumerate(urls, 1):
image = {"url": url}
data["image"] = image
text.nameext_from_url(url, data)
yield Message.Url, url, data
36 changes: 36 additions & 0 deletions test/results/imbc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from extractor import imbc

IMAGE_URL_PATTERN = (
r"(?i)https://talkimg\.imbc\.com/TVianUpload/tvian/TViews/image/\d+/\d+/\d+/.+\.(?:gif|jpe?g|png|webp)"
)


__tests__ = (
{
"#url": "https://adenews.imbc.com/M/Detail/431009",
"#category": ("", "imbc", "adenews-article"),
"#class": imbc.ImbcAdenewsArticleExtractor,
"#pattern": IMAGE_URL_PATTERN,
"#count": 40,
"title": "[움짤] 아이브 안유진, 숨 참고 럽럽~",
"date": "dt:2024-09-22 17:11:00",
},
{
"#url": "https://enews.imbc.com/News/RetrieveNewsInfo/431009",
"#category": ("", "imbc", "enews-article"),
"#class": imbc.ImbcEnewsArticleExtractor,
"#pattern": IMAGE_URL_PATTERN,
"#count": 40,
"title": "[움짤] 아이브 안유진, 숨 참고 럽럽~",
"date": "dt:2024-09-22 17:11:00",
},
{
"#url": "https://imnews.imbc.com/news/2024/enter/article/6639093_36473.html",
"#category": ("", "imbc", "imnews-article"),
"#class": imbc.ImbcImnewsArticleExtractor,
"#pattern": IMAGE_URL_PATTERN,
"#count": 40,
"title": "[움짤] 아이브 안유진, 숨 참고 럽럽~",
"date": "dt:2024-09-22 17:11:52",
},
)

0 comments on commit 30dbb2b

Please sign in to comment.