Commit
feat: update xhs note detail
NanmiCoder committed Jul 24, 2024
1 parent 678b358 commit 573ca9a
Showing 6 changed files with 68 additions and 37 deletions.
2 changes: 1 addition & 1 deletion config/base_config.py
@@ -40,7 +40,7 @@
CRAWLER_MAX_NOTES_COUNT = 20

# Maximum number of concurrent crawler tasks
- MAX_CONCURRENCY_NUM = 4
+ MAX_CONCURRENCY_NUM = 1

# Whether to enable image crawling mode (disabled by default)
ENABLE_GET_IMAGES = False
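
The drop from 4 to 1 matters because every note-detail fetch in core.py is gated by an asyncio.Semaphore built from this value, so it is effectively a global request throttle. A minimal sketch of that pattern (fetch_detail is a hypothetical stand-in for the real client call, not repo code):

    import asyncio

    import config  # MAX_CONCURRENCY_NUM is defined in config/base_config.py

    async def fetch_all(note_ids, fetch_detail):
        # At most MAX_CONCURRENCY_NUM coroutines hold the semaphore at once;
        # with the new default of 1, requests are fully serialized.
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)

        async def bounded(note_id):
            async with semaphore:
                return await fetch_detail(note_id)

        return await asyncio.gather(*(bounded(nid) for nid in note_ids))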
35 changes: 23 additions & 12 deletions media_platform/xhs/client.py
@@ -198,27 +198,34 @@ async def get_note_by_keyword(
}
return await self.post(uri, data)

-     async def get_note_by_id(self, note_id: str) -> Dict:
+     async def get_note_by_id(self, note_id: str, xsec_source: str, xsec_token: str) -> Dict:
"""
获取笔记详情API
Args:
note_id:笔记ID
xsec_source: 渠道来源
xsec_token: 搜索关键字之后返回的比较列表中返回的token
Returns:
"""
+         if xsec_source == "":
+             xsec_source = "pc_search"

data = {
"source_note_id": note_id,
"image_formats": ["jpg", "webp", "avif"],
"extra": {"need_body_topic": 1},
"xsec_source": "pc_feed",
"xsec_source": xsec_source,
"xsec_token": xsec_token
}
uri = "/api/sns/web/v1/feed"
res = await self.post(uri, data)
if res and res.get("items"):
res_dict: Dict = res["items"][0]["note_card"]
return res_dict
utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note empty and res:{res}")
# 爬取频繁了可能会出现有的笔记能有结果有的没有
utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}")
return dict()

async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
@@ -292,9 +299,9 @@ async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
sub_comments = await self.get_comments_all_sub_comments(comments, crawl_interval, callback)
result.extend(sub_comments)
return result

async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0,
-     callback: Optional[Callable] = None) -> List[Dict]:
+         callback: Optional[Callable] = None) -> List[Dict]:
"""
Get all second-level comments under the given first-level comments; this method keeps fetching until every sub-comment has been retrieved
Args:
@@ -306,23 +313,24 @@ async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interv
"""
if not config.ENABLE_GET_SUB_COMMENTS:
utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
utils.logger.info(
f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
return []

result = []
for comment in comments:
note_id = comment.get("note_id")
sub_comments = comment.get("sub_comments")
if sub_comments and callback:
await callback(note_id, sub_comments)

sub_comment_has_more = comment.get("sub_comment_has_more")
if not sub_comment_has_more:
continue

root_comment_id = comment.get("id")
sub_comment_cursor = comment.get("sub_comment_cursor")

while sub_comment_has_more:
comments_res = await self.get_note_sub_comments(note_id, root_comment_id, 10, sub_comment_cursor)
sub_comment_has_more = comments_res.get("has_more", False)
@@ -398,17 +406,20 @@ async def get_all_notes_by_creator(self, user_id: str, crawl_interval: float = 1
while notes_has_more:
notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
if not notes_res:
utils.logger.error(f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
utils.logger.error(
f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
break

notes_has_more = notes_res.get("has_more", False)
notes_cursor = notes_res.get("cursor", "")
if "notes" not in notes_res:
utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
utils.logger.info(
f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
break

notes = notes_res["notes"]
utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
utils.logger.info(
f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
if callback:
await callback(notes)
await asyncio.sleep(crawl_interval)
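
Two behavioral details stand out in this file: get_note_by_id now requires the xsec fields that arrive with each keyword-search hit, and it returns an empty dict rather than None when the response carries no items, which is why callers in core.py switch from an `is not None` test to a plain truthiness check. A hedged caller-side sketch, with client standing in for an authenticated XiaoHongShuClient and item for one search-result entry (illustrative names, not repo code):

    note_detail = await client.get_note_by_id(
        note_id=item.get("id"),
        xsec_source=item.get("xsec_source"),  # empty string falls back to "pc_search" inside the client
        xsec_token=item.get("xsec_token"),
    )
    if note_detail:  # skips both {} (empty response) and None
        await xhs_store.update_xhs_note(note_detail)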
54 changes: 35 additions & 19 deletions media_platform/xhs/core.py
@@ -27,7 +27,8 @@ class XiaoHongShuCrawler(AbstractCrawler):

def __init__(self) -> None:
self.index_url = "https://www.xiaohongshu.com"
-         self.user_agent = utils.get_user_agent()
+         # self.user_agent = utils.get_user_agent()
+         self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"

async def start(self) -> None:
playwright_proxy_format, httpx_proxy_format = None, None
@@ -110,18 +111,23 @@ async def search(self) -> None:
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
)
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
-             if(not notes_res or not notes_res.get('has_more', False)):
+             if not notes_res or not notes_res.get('has_more', False):
utils.logger.info("No more content!")
break
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail(post_item.get("id"), semaphore)
self.get_note_detail(
note_id=post_item.get("id"),
xsec_source=post_item.get("xsec_source"),
xsec_token=post_item.get("xsec_token"),
semaphore=semaphore
)
for post_item in notes_res.get("items", {})
if post_item.get('model_type') not in ('rec_query', 'hot_query')
]
note_details = await asyncio.gather(*task_list)
for note_detail in note_details:
-                 if note_detail is not None:
+                 if note_detail:
await xhs_store.update_xhs_note(note_detail)
await self.get_notice_media(note_detail)
note_id_list.append(note_detail.get("note_id"))
@@ -157,32 +163,42 @@ async def fetch_creator_notes_detail(self, note_list: List[Dict]):
"""
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail(post_item.get("note_id"), semaphore) for post_item in note_list
self.get_note_detail(
note_id=post_item.get("id"),
xsec_source=post_item.get("xsec_source"),
xsec_token=post_item.get("xsec_token"),
semaphore=semaphore
)
for post_item in note_list
]

note_details = await asyncio.gather(*task_list)
for note_detail in note_details:
-             if note_detail is not None:
+             if note_detail:
await xhs_store.update_xhs_note(note_detail)

async def get_specified_notes(self):
"""Get the information and comments of the specified post"""
-         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-         task_list = [
-             self.get_note_detail(note_id=note_id, semaphore=semaphore) for note_id in config.XHS_SPECIFIED_ID_LIST
-         ]
-         note_details = await asyncio.gather(*task_list)
-         for note_detail in note_details:
-             if note_detail is not None:
-                 await xhs_store.update_xhs_note(note_detail)
-                 await self.get_notice_media(note_detail)
-         await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
-
-     async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
+         # todo: crawling specified notes is temporarily broken; xhs updated the note-detail request parameters to require xsec_token, which so far can only be obtained in the search scenario
+         raise Exception(
+             "Crawling specified notes is temporarily broken: xhs updated the note-detail request parameters to require xsec_token, which so far can only be obtained via search")
+         # semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+         # task_list = [
+         #     self.get_note_detail(note_id=note_id, xsec_token="", semaphore=semaphore) for note_id in config.XHS_SPECIFIED_ID_LIST
+         # ]
+         # note_details = await asyncio.gather(*task_list)
+         # for note_detail in note_details:
+         #     if note_detail is not None:
+         #         await xhs_store.update_xhs_note(note_detail)
+         #         await self.get_notice_media(note_detail)
+         # await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)

+     async def get_note_detail(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \
+             Optional[Dict]:
"""Get note detail"""
async with semaphore:
try:
-                 return await self.xhs_client.get_note_by_id(note_id)
+                 return await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
except DataFetchError as ex:
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}")
return None
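
Taken together, the xsec fields now thread through three layers: search() reads them off each result item, get_note_detail() forwards them, and the client places them in the POST body for /api/sns/web/v1/feed. An illustrative trace under those assumptions (the values are made up):

    # One keyword-search hit carries the token the detail endpoint now requires.
    item = {"id": "abc123", "xsec_source": "pc_search", "xsec_token": "example-token"}

    # What get_note_by_id ultimately posts to /api/sns/web/v1/feed:
    data = {
        "source_note_id": item["id"],
        "image_formats": ["jpg", "webp", "avif"],
        "extra": {"need_body_topic": 1},
        "xsec_source": item["xsec_source"] or "pc_search",
        "xsec_token": item["xsec_token"],
    }

This is also why the specified-note mode now raises: config.XHS_SPECIFIED_ID_LIST supplies bare note IDs with no accompanying xsec_token.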
10 changes: 5 additions & 5 deletions media_platform/xhs/help.py
@@ -10,19 +10,19 @@ def sign(a1="", b1="", x_s="", x_t=""):
takes in a URI (uniform resource identifier), an optional data dictionary, and an optional ctime parameter. It returns a dictionary containing two keys: "x-s" and "x-t".
"""
common = {
"s0": 5, # getPlatformCode
"s0": 3, # getPlatformCode
"s1": "",
"x0": "1", # localStorage.getItem("b1b1")
"x1": "3.3.0", # version
"x2": "Windows",
"x1": "3.7.8-2", # version
"x2": "Mac OS",
"x3": "xhs-pc-web",
"x4": "1.4.4",
"x4": "4.27.2",
"x5": a1, # cookie of a1
"x6": x_t,
"x7": x_s,
"x8": b1, # localStorage.getItem("b1")
"x9": mrc(x_t + x_s + b1),
"x10": 1, # getSigCount
"x10": 154, # getSigCount
}
encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
x_s_common = b64Encode(encode_str)
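
For context, the dict above feeds the x-s-common request header: the surrounding lines of this function JSON-encode it without spaces and base64-encode the result. A rough standard-library equivalent, assuming encodeUtf8 and b64Encode in help.py mirror plain UTF-8 encoding and btoa-style base64 (an assumption, not verified against those helpers):

    import base64
    import json

    def build_x_s_common(common: dict) -> str:
        # Compact JSON matches json.dumps(common, separators=(',', ':'))
        # in the diff's context lines; the header value is its base64 form.
        compact = json.dumps(common, separators=(',', ':')).encode("utf-8")
        return base64.b64encode(compact).decode("ascii")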
1 change: 1 addition & 0 deletions store/bilibili/__init__.py
@@ -10,6 +10,7 @@
from .bilibili_store_impl import *
from .bilibilli_store_video import *


class BiliStoreFactory:
STORES = {
"csv": BiliCsvStoreImplement,
3 changes: 3 additions & 0 deletions store/weibo/weibo_store_impl.py
@@ -33,6 +33,9 @@ def calculate_number_of_files(file_store_path: str) -> int:


class WeiboCsvStoreImplement(AbstractStore):
+     async def store_creator(self, creator: Dict):
+         pass

csv_store_path: str = "data/weibo"
file_count:int=calculate_number_of_files(csv_store_path)

