From 6545a15ff34dd43ac4d2a825d00f385f675be237 Mon Sep 17 00:00:00 2001 From: helloteemo Date: Thu, 11 Jul 2024 22:49:05 +0800 Subject: [PATCH] =?UTF-8?q?feature:=20=E6=94=AF=E6=8C=81=E5=B0=8F=E7=BA=A2?= =?UTF-8?q?=E4=B9=A6=E5=9B=BE=E7=89=87=E3=80=81=E8=A7=86=E9=A2=91=E4=B8=8B?= =?UTF-8?q?=E8=BD=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/base_config.py | 1 + media_platform/xhs/client.py | 9 ++++++ media_platform/xhs/core.py | 63 +++++++++++++++++++++++++++++++++++- store/xhs/__init__.py | 31 +++++++++++++++--- store/xhs/xhs_store_image.py | 55 +++++++++++++++++++++++++++++++ 5 files changed, 153 insertions(+), 6 deletions(-) create mode 100644 store/xhs/xhs_store_image.py diff --git a/config/base_config.py b/config/base_config.py index 2c6f4c8e..e789b0fa 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -57,6 +57,7 @@ "6422c2750000000027000d88", "64ca1b73000000000b028dd2", "630d5b85000000001203ab41", + "668fe13000000000030241fa", # 图文混合 # ........................ ] diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 21eff5bb..e317b45e 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -129,6 +129,15 @@ async def post(self, uri: str, data: dict) -> Dict: return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=headers) + async def get_note_media(self, url: str) -> bytes | None: + async with httpx.AsyncClient(proxies=self.proxies) as client: + response = await client.request("GET", url, timeout=self.timeout) + if not response.reason_phrase == "OK": + utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}") + return None + else: + return response.content + async def pong(self) -> bool: """ 用于检查登录态是否失效了 diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 97d073f2..a9103aad 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -120,6 +120,7 @@ async def search(self) -> None: for note_detail in note_details: if note_detail is not None: await xhs_store.update_xhs_note(note_detail) + await self.get_notice_media(note_detail) note_id_list.append(note_detail.get("note_id")) page += 1 utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}") @@ -171,6 +172,7 @@ async def get_specified_notes(self): for note_detail in note_details: if note_detail is not None: await xhs_store.update_xhs_note(note_detail) + await self.get_notice_media(note_detail) await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST) async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]: @@ -276,4 +278,63 @@ async def launch_browser( async def close(self): """Close browser context""" await self.browser_context.close() - utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...") \ No newline at end of file + utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...") + + async def get_notice_media(self, note_detail: Dict): + if not config.ENABLE_GET_IMAGES: + utils.logger.info(f"[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled") + return + await self.get_note_images(note_detail) + await self.get_notice_video(note_detail) + + async def get_note_images(self, note_item: Dict): + """ + get note images. please use get_notice_media + :param note_item: + :return: + """ + if not config.ENABLE_GET_IMAGES: + return + note_id = note_item.get("note_id") + image_list: List[Dict] = note_item.get("image_list", []) + + for img in image_list: + if img.get('url_default') != '': + img.update({'url': img.get('url_default')}) + + if not image_list: + return + picNum = 0 + for pic in image_list: + url = pic.get("url") + if not url: + continue + content = await self.xhs_client.get_note_media(url) + if content is None: + continue + extension_file_name = f"{picNum}.jpg" + picNum += 1 + await xhs_store.update_xhs_note_image(note_id, content, extension_file_name) + + async def get_notice_video(self, note_item: Dict): + """ + get note images. please use get_notice_media + :param note_item: + :return: + """ + if not config.ENABLE_GET_IMAGES: + return + note_id = note_item.get("note_id") + + videos = xhs_store.get_video_url_arr(note_item) + + if not videos: + return + videoNum = 0 + for url in videos: + content = await self.xhs_client.get_note_media(url) + if content is None: + continue + extension_file_name = f"{videoNum}.mp4" + videoNum += 1 + await xhs_store.update_xhs_note_image(note_id, content, extension_file_name) diff --git a/store/xhs/__init__.py b/store/xhs/__init__.py index 689709a0..60881ff9 100644 --- a/store/xhs/__init__.py +++ b/store/xhs/__init__.py @@ -8,6 +8,7 @@ from . import xhs_store_impl from .xhs_store_impl import * +from .xhs_store_image import * class XhsStoreFactory: @@ -25,6 +26,25 @@ def create_store() -> AbstractStore: return store_class() +def get_video_url_arr(note_item: Dict) -> List: + if note_item.get('type') != 'video': + return [] + + videoArr = [] + originVideoKey = note_item.get('video').get('consumer').get('origin_video_key') + if originVideoKey == '': + originVideoKey = note_item.get('video').get('consumer').get('originVideoKey') + # 降级有水印 + if originVideoKey == '': + videos = note_item.get('video').get('media').get('stream').get('h264') + if type(videos).__name__ == 'list': + videoArr = [v.get('master_url') for v in videos] + else: + videoArr = [f"http://sns-video-bd.xhscdn.com/{originVideoKey}"] + + return videoArr + + async def update_xhs_note(note_item: Dict): note_id = note_item.get("note_id") user_info = note_item.get("user", {}) @@ -36,11 +56,7 @@ async def update_xhs_note(note_item: Dict): if img.get('url_default') != '': img.update({'url': img.get('url_default')}) - video_url = '' - if note_item.get('type') == 'video': - videos = note_item.get('video').get('media').get('stream').get('h264') - if type(videos).__name__ == 'list': - video_url = ','.join([v.get('master_url') for v in videos]) + video_url = ','.join(get_video_url_arr(note_item)) local_db_item = { "note_id": note_item.get("note_id"), @@ -127,3 +143,8 @@ async def save_creator(user_id: str, creator: Dict): } utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}") await XhsStoreFactory.create_store().store_creator(local_db_item) + + +async def update_xhs_note_image(note_id, pic_content, extension_file_name): + await XiaoHongShuImage().store_image( + {"notice_id": note_id, "pic_content": pic_content, "extension_file_name": extension_file_name}) diff --git a/store/xhs/xhs_store_image.py b/store/xhs/xhs_store_image.py new file mode 100644 index 00000000..86e4d3d6 --- /dev/null +++ b/store/xhs/xhs_store_image.py @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# @Author : helloteemo +# @Time : 2024/7/11 22:35 +# @Desc : 小红书图片保存 +import pathlib +from typing import Dict + +import aiofiles + +from base.base_crawler import AbstractStoreImage +from tools import utils + + +class XiaoHongShuImage(AbstractStoreImage): + image_store_path: str = "data/xhs/images" + + async def store_image(self, image_content_item: Dict): + """ + store content + Args: + content_item: + + Returns: + + """ + await self.save_image(image_content_item.get("notice_id"), image_content_item.get("pic_content"), + image_content_item.get("extension_file_name")) + + def make_save_file_name(self, notice_id: str, extension_file_name: str) -> str: + """ + make save file name by store type + Args: + notice_id: notice id + picid: image id + + Returns: + + """ + return f"{self.image_store_path}/{notice_id}/{extension_file_name}" + + async def save_image(self, notice_id: str, pic_content: str, extension_file_name="jpg"): + """ + save image to local + Args: + notice_id: notice id + pic_content: image content + + Returns: + + """ + pathlib.Path(self.image_store_path + "/" + notice_id).mkdir(parents=True, exist_ok=True) + save_file_name = self.make_save_file_name(notice_id, extension_file_name) + async with aiofiles.open(save_file_name, 'wb') as f: + await f.write(pic_content) + utils.logger.info(f"[XiaoHongShuImageStoreImplement.save_image] save image {save_file_name} success ...")