From 9f11a802fedda2580aff7bd5b7a7b3ebb38087ea Mon Sep 17 00:00:00 2001
From: Samiya
Date: Thu, 7 Mar 2024 01:53:03 +0800
Subject: [PATCH] Optimize a large batch of things; proper commit message to follow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 TODO.md                                    |  8 ++-
 bot.py                                     | 79 ----------------------
 common/http_client/async_aiohttp_client.py |  4 +-
 common/http_client/async_httpx_client.py   |  4 +-
 common/utils/utils.py                      |  2 +-
 config/README.md                           |  0
 config/example.env                         | 14 ++--
 config/hoyoyo.toml                         | 15 ++++
 config/lashinbang.toml                     |  8 ++-
 config/mercari.toml                        | 12 ++--
 config/mercari_user.toml                   |  4 +-
 config/notify.toml                         |  6 +-
 config/rennigou.toml                       | 73 ++++++++++--------
 docker-compose.yaml                        |  3 +-
 docker-swarm.yaml                          | 42 ++++++++++++
 requirements.txt                           |  3 +-
 time_test.sh                               |  8 ---
 website/base/scraper.py                    |  7 --
 website/base/scraper_mercari.py            |  8 +--
 website/hoyoyo.py                          | 16 +++--
 website/jumpshop.py                        |  9 +--
 website/lashinbang.py                      | 22 ++++--
 website/mercari_items.py                   |  7 +-
 website/mercari_search.py                  | 26 +++----
 website/rennigou.py                        | 61 ++++++++-------
 website/suruga.py                          | 47 +++++++------
 26 files changed, 254 insertions(+), 234 deletions(-)
 delete mode 100644 bot.py
 delete mode 100644 config/README.md
 create mode 100644 config/hoyoyo.toml
 create mode 100644 docker-swarm.yaml

diff --git a/TODO.md b/TODO.md
index 75c884a..e30a543 100644
--- a/TODO.md
+++ b/TODO.md
@@ -9,4 +9,10 @@
 - [ ] Hot reloading of config files
 - [ ] Hook the monitoring system into the Telegram bot, so user config files can be added or user tasks stopped at any time
 - [ ] One-click DD script for Alpine
-- [ ] Mercari pushes arrive at different times; check which sort order each push belongs to
\ No newline at end of file
+- [ ] Mercari pushes arrive at different times; check which sort order each push belongs to
+- [ ] Mercari caching: 1. append a timestamp to the request URL; 2. check whether the Origin and Accept-Encoding request headers make any difference
+- [ ] https://api.mercari.jp/users/get_profile?user_id=193978404&_user_format=profile returns email and phone_number; does it only return them when logged in?
+- [ ] Map the Mercari filter values
+- [ ] The PayPay Flea Market and Fril filters are still not matched up
+- [x] Do request URLs have to be https?
+- [ ]
\ No newline at end of file
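Note on the caching TODO above: the timestamp idea is the standard cache-busting
trick, appending a throwaway query parameter so intermediate caches treat every
request as unique. A minimal sketch (hypothetical helper, not part of this patch):

    import time

    def cache_busted(url: str) -> str:
        # Append a millisecond timestamp as a dummy query parameter.
        sep = "&" if "?" in url else "?"
        return f"{url}{sep}_ts={int(time.time() * 1000)}"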
diff --git a/bot.py b/bot.py
deleted file mode 100644
index 5a8159c..0000000
--- a/bot.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from telebot.async_telebot import AsyncTeleBot
-import asyncio
-
-
-API_TOKEN = ""
-bot = AsyncTeleBot(API_TOKEN)
-
-# Depending on the project structure, the relevant modules may need to be imported
-# from your_project_module import user_task_manager, user_config_manager
-
-
-# Example command handlers
-@bot.message_handler(commands=["start", "help"])
-async def send_welcome(message):
-    await bot.reply_to(message, "您好!这是您的帮助信息。")
-
-
-@bot.message_handler(commands=["status"])
-async def send_status(message):
-    # Fetch the task status of every user
-    # e.g. status = user_task_manager.get_all_user_status()
-    status = "这里是所有用户的任务状态"  # placeholder status
-    await bot.reply_to(message, status)
-
-
-@bot.message_handler(commands=["stop"])
-async def stop_user_task(message):
-    # Stop a specific user's task
-    username = message.text.split()[1]  # assumes the command format /stop username
-    # e.g. result = user_task_manager.stop_user_task(username)
-    result = f"{username}的任务已停止"  # placeholder result
-    await bot.reply_to(message, result)
-
-
-@bot.message_handler(commands=["adduser"])
-async def add_user(message):
-    # Add a new user
-    user_details = message.text.split()[1:]  # assumes the command format /adduser details
-    # e.g. result = user_config_manager.add_user(user_details)
-    result = "新用户已添加"  # placeholder result
-    await bot.reply_to(message, result)
-
-
-@bot.message_handler(commands=["edituser"])
-async def edit_user(message):
-    # Edit a user's configuration
-    user_info = message.text.split()[1:]  # assumes the command format /edituser username new_details
-    # e.g. result = user_config_manager.edit_user(user_info)
-    result = "用户配置已修改"  # placeholder result
-    await bot.reply_to(message, result)
-
-
-@bot.message_handler(commands=["deleteuser"])
-async def delete_user(message):
-    # Delete a user
-    username = message.text.split()[1]  # assumes the command format /deleteuser username
-    # e.g. result = user_config_manager.delete_user(username)
-    result = f"{username}的用户配置已删除"  # placeholder result
-    await bot.reply_to(message, result)
-
-
-async def bot_polling():
-    # Run bot.polling in an endless loop
-    while True:
-        try:
-            print("Bot started")
-            await bot.polling(none_stop=True, interval=0)
-        except Exception as e:
-            print(f"Bot polling failed, retrying in 5 seconds. Error: {e}")
-            await asyncio.sleep(5)  # retry after 5 seconds on failure
-
-
-async def main():
-    # Start bot polling
-    await bot_polling()
-
-
-if __name__ == "__main__":
-    asyncio.run(main())
diff --git a/common/http_client/async_aiohttp_client.py b/common/http_client/async_aiohttp_client.py
index ce218f4..5ac45e5 100644
--- a/common/http_client/async_aiohttp_client.py
+++ b/common/http_client/async_aiohttp_client.py
@@ -6,9 +6,7 @@
     retry,
     wait_fixed,
     stop_after_attempt,
-    retry_if_exception_type,
-    before_sleep_log,
-)
+    retry_if_exception_type)
 from loguru import logger
 
 
diff --git a/common/http_client/async_httpx_client.py b/common/http_client/async_httpx_client.py
index 2acb00d..cdbd784 100644
--- a/common/http_client/async_httpx_client.py
+++ b/common/http_client/async_httpx_client.py
@@ -4,9 +4,7 @@
     retry,
     wait_fixed,
     stop_after_attempt,
-    retry_if_exception_type,
-    before_sleep_log,
-)
+    retry_if_exception_type)
 
 
 # Custom callback invoked before each retry sleeps
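Note: both import blocks above drop tenacity's before_sleep_log in favour of a
custom callback (the comment after the httpx imports announces one). A minimal
sketch of such a callback, assuming loguru as used elsewhere in the project;
the function name is hypothetical:

    from loguru import logger

    def log_before_retry(retry_state):
        # retry_state is a tenacity RetryCallState
        exc = retry_state.outcome.exception() if retry_state.outcome else None
        logger.warning(f"Retry attempt {retry_state.attempt_number} after error: {exc}")

    # usage: @retry(..., before_sleep=log_before_retry)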
diff --git a/common/utils/utils.py b/common/utils/utils.py
index 34572b9..66a831f 100644
--- a/common/utils/utils.py
+++ b/common/utils/utils.py
@@ -14,7 +14,7 @@ def extract_keyword_from_url(keyword):
     query_params = parse_qs(parsed_url.query)
 
     # Check the known keyword parameters and return the matching value
-    for key in ["q", "search_word", "query"]:
+    for key in ["q", "search_word", "query", "keyword"]:
         if key in query_params:
             # The parameter is usually a list; return its first value
             return query_params[key][0]
diff --git a/config/README.md b/config/README.md
deleted file mode 100644
index e69de29..0000000
diff --git a/config/example.env b/config/example.env
index 14286b7..a883893 100644
--- a/config/example.env
+++ b/config/example.env
@@ -1,13 +1,15 @@
 # Debug mode
+# When enabled, logs are printed in a verbose format that includes file name and line number
 # DEBUG=True
 
-# HTTP client type to use, defaults to aiohttp
+# HTTP client type to use (aiohttp/httpx), defaults to aiohttp
 # HTTP_CLIENT = "aiohttp"
 
-# HTTP proxy
+# HTTP proxy, for endpoints such as Mercari and Telegram that are unreachable from mainland China
 # HTTP_PROXY="http://127.0.0.1:7890"
 
 # Telegram bot tokens
+# Several can be defined; just increment the suffix
 # https://t.me/Samiya310Bot
 TELEGRAM_BOT_TOKEN_1=""
 # https://t.me/Lihahadear_bot
@@ -19,7 +21,6 @@ WECOM_CORP_SECRET=""
 # AgentID of the WeCom application
 WECOM_AGENT_ID_1=""
 
-
 # Rennigou
 # Account name
 RENNIGOU_MAIL = ""
@@ -27,4 +28,9 @@ RENNIGOU_MAIL = ""
 RENNIGOU_PASS = ""
 
 # Telegram reverse-proxy server
-TELEGRAM_API_URL = "https://api.telegram.org/bot{0}/{1}"
\ No newline at end of file
+# Official endpoint
+# TELEGRAM_API_URL = "https://api.telegram.org/bot{0}/{1}"
+# Alibaba Cloud Singapore
+# TELEGRAM_API_URL = "http://8.222.130.125:8878/bot{0}/{1}"
+# Cloudflare reverse proxy
+# TELEGRAM_API_URL = "https://tg.samiya.pro/bot{0}/{1}"
\ No newline at end of file
diff --git a/config/hoyoyo.toml b/config/hoyoyo.toml
new file mode 100644
index 0000000..bd6b1df
--- /dev/null
+++ b/config/hoyoyo.toml
@@ -0,0 +1,15 @@
+# HOYOYO configuration
+# Official site: https://cn.hoyoyo.com
+
+# Search settings for the HOYOYO site (Suruga-ya only)
+[[websites.hoyoyo.searches]]
+# Simply copy the URL straight from the website
+# Required, str
+keyword = "https://cn.hoyoyo.com/suruga~search.html?keyword=%E5%AE%B6%E5%BA%AD%E6%95%99%E5%B8%AB%E3%83%92%E3%83%83%E3%83%88%E3%83%9E%E3%83%B3REBORN&keys=%E5%AE%B6%E5%BA%AD%E6%95%99%E5%B8%AB%E3%83%92%E3%83%83%E3%83%88%E3%83%9E%E3%83%B3REBORN&lang=ja&category_id=&fykeyid=34416"
+
+# Notification route
+# from * to *
+# Required
+# First element: @Samiya310Bot(1), @Lihahadear_bot(2), WeCom Samiya(3)
+# Second element: determined by the ids configured in notify.toml
+notify = [1,1]
\ No newline at end of file
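Note: with "keyword" added to the recognized query keys, a pasted HOYOYO search
URL such as the one in config/hoyoyo.toml above now resolves to its search term.
A usage sketch (URL shortened for illustration):

    from common.utils.utils import extract_keyword_from_url

    url = "https://cn.hoyoyo.com/suruga~search.html?keyword=REBORN&lang=ja"
    print(extract_keyword_from_url(url))  # -> "REBORN"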
diff --git a/config/lashinbang.toml b/config/lashinbang.toml
index e3cbb3d..0ec5336 100644
--- a/config/lashinbang.toml
+++ b/config/lashinbang.toml
@@ -15,5 +15,9 @@ keyword = "呪術廻戦"
 notify = [1,2]
 
 [websites.lashinbang.searches.filter]
-# Sort order, optional, defaults to newest arrivals
-sort = ""
\ No newline at end of file
+# Sort order, optional, defaults to 関連順 (relevance)
+# 関連順 (relevance)                Score,Number18
+# 入荷日(降順) (arrival date, desc)  Score,Number7
+# 更新日(降順) (update date, desc)   Number7,Score
+# 発売日(降順) (release date, desc)  Number3,Score
+sort = ""
diff --git a/config/mercari.toml b/config/mercari.toml
index 8c01ae0..5e494db 100644
--- a/config/mercari.toml
+++ b/config/mercari.toml
@@ -17,13 +17,15 @@ notify = [1,2]
 [websites.mercari.searches.filter]
 # Excluded keywords, optional
 exclude_keyword = ""
+# Seller ID, optional
+sellerId = ""
 # Item status, optional, defaults to on_sale and trading
 status = ""
+# Category, optional
+categoryId = ""
+# Brand, optional
+brandId = ""
 # Minimum price, optional
 price_min = ""
 # Maximum price, optional
-price_max = ""
-# Category, optional
-category = ""
-# Brand, optional
-brandId = ""
\ No newline at end of file
+price_max = ""
\ No newline at end of file
diff --git a/config/mercari_user.toml b/config/mercari_user.toml
index a2498b5..2536972 100644
--- a/config/mercari_user.toml
+++ b/config/mercari_user.toml
@@ -16,5 +16,5 @@ keyword = '110551852'
 notify = [1,2]
 
 [websites.mercari_user.searches.filter]
-# Item status, optional, defaults to on_sale and trading
-status = ""
\ No newline at end of file
+# Item status (on_sale,trading,sold_out), optional, defaults to (on_sale,trading)
+status = "on_sale,trading,sold_out"
\ No newline at end of file
diff --git a/config/notify.toml b/config/notify.toml
index b1f047e..dd6fed9 100644
--- a/config/notify.toml
+++ b/config/notify.toml
@@ -8,7 +8,7 @@ user = "Default User"
 # Telegram settings
 # Telegram user ID; at least one of this and the WeCom user ID is required
 # Obtain it by sending a message to https://t.me/username_to_id_bot
-telegram_chat_ids = [123456,789123]
+telegram_chat_ids = ["123456","789123"]
 # Telegram message type
 # Optional: 1 = plain text, 2 = text and image sent separately, 3 = text and image together, defaults to 3
 tg_send_type = 3
@@ -37,13 +37,13 @@ user_max_pages = 20
 # Optional, defaults to 0.049
 exchange_rate = 0.049
 
-# Maximum concurrency per search task
+# Maximum concurrency per search task, i.e. the number of pages searched concurrently
 # Optional, defaults to 10
 max_concurrency = 10
 
 # Custom push-message template
 # Optional, a default template is provided
-# Available parameters
+# Available placeholders:
 # id          item ID
 # imageURL    item image URL
 # productName item name
diff --git a/config/rennigou.toml b/config/rennigou.toml
index 8fc5a2b..b7c9904 100644
--- a/config/rennigou.toml
+++ b/config/rennigou.toml
@@ -7,8 +7,8 @@
 keyword = "家庭教師ヒットマンREBORN"
 
 # Site to search, required
-# all, mercari, yahooauction, rakuma, amazon, surugaya
-websiteType = "mercari"
+# all = everything, mercari = Mercari, yahooauction = Yahoo! Auctions, rakuma = Rakuten Rakuma, amazon = Amazon, surugaya = Suruga-ya
+websiteType = "surugaya"
 
 # Notification route
 # from * to *
@@ -19,37 +19,52 @@ notify = [1,2]
 
 [websites.rennigou.searches.filter]
 
-# Common filters
-# Sort order
-# mercari: empty = recommended, price_asc = price low to high, price_desc = price high to low, new = newest listings first, like_desc = most liked first
-# surugaya: empty = default, price_asc = price low to high, price_desc = price high to low, new_publish = newly listed, publish_time_asc = release date newest first, publish_time_desc = release date oldest first
-sortOrder = "new"
-
-# Price
-priceMin = ""
-priceMax = ""
-
-# Mercari
-# 1 = on-sale items only, empty = everything
-statusOnSale = ""
-# Condition: 1 (new, unused) / 2 (nearly unused) / 3 (no obvious scratches or dirt) / 4 (some scratches and stains) / 5 (scratched and dirty) / 6 (poor overall condition), empty = everything
-conditionIds = ""
-# 2 = free shipping within Japan, empty = everything
+# >>>>>>>>>>>>>> Mercari filters >>>>>>>>>>>>>>>>>>>>>>>
+# Recommended sort = empty, price_desc = price high to low, price_asc = price low to high, new = newest listings first, like_desc = most liked first
+sortOrder = ""
+
+# On-sale items only: 1 when ticked, empty otherwise
+statusOnSale = "1"
+
+# In-stock items only: 1 when ticked, empty otherwise
+haveStock = "1"
+
+# Free shipping within Japan: 2 when ticked, empty otherwise
 shippingPayer = ""
-category = ""
-brand_ids = ""
 
+# Condition: 1 = new and unused, 2 = nearly unused, 3 = no obvious scratches or dirt, 4 = some scratches and stains, 5 = scratched and dirty, 6 = poor overall condition, empty = everything
+conditionIds = ""
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+# >>>>>>>>>>>>>> Suruga-ya filters >>>>>>>>>>>>>>>>>>>
+# Default = empty, price_asc = price low to high, price_desc = price high to low, new_publish = newly listed, publish_time_asc = release date newest first, publish_time_desc = release date oldest first
+# NOTE: sortOrder and haveStock are shared with the Mercari block above; TOML does
+# not allow the same key to be assigned twice in one table, so set them only there.
+# sortOrder = ""
 
-# Suruga-ya
-# 1 = in-stock items only
-haveStock = ""
-# 2 = newly listed
+# In-stock items only: 1 when ticked, empty otherwise (see the NOTE above)
+# haveStock = "1"
+
+# Newly listed: 2 when ticked, empty otherwise
 newPublish = ""
+
+# Discount: 1 = unrestricted, then increasing up to 7, where 7 means >100%
+discountId = "1"
+
+# Category, e.g. 1029 = badges
+category = ""
+
 # 1 = unrestricted, 2 = new, 3 = secondhand, 4 = preorder
-conditionId = ""
-# 1/2/3/4/5/6/7
-discountId = ""
-# 1029 = badges
-category = ""
\ No newline at end of file
+conditionId = "1"
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+
+# >>>>>>>>>>>>>> Yahoo! Auctions filters >>>>>>>>>>>>>>>>>>>>>
+# TODO
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+# >>>>>>>>>>>>>>> Rakuma filters >>>>>>>>>>>>>>>>>>>>
+# TODO
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+
+# >>>>>>>>>>>>>>> Amazon filters >>>>>>>>>>>>>>>>>>>>>
+# TODO
+# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
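Note: the Mercari config files above take comma-separated, lowercase filter
values (mercari_user.toml: status = "on_sale,trading,sold_out"), while the API
payload built in website/mercari_search.py further down expects enum names such
as STATUS_ON_SALE. The TODO list still has "map the Mercari filter values" open;
a minimal sketch of that mapping step (hypothetical helper name):

    def to_status_enums(raw: str) -> list:
        # "on_sale,trading" -> ["STATUS_ON_SALE", "STATUS_TRADING"]
        return [f"STATUS_{part.strip().upper()}" for part in raw.split(",") if part.strip()]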
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 66a548e..76582bd 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -12,7 +12,6 @@ services:
     stdin_open: true
     tty: true
     command: /data -A
-# Scraper
   VintageVigil:
     image: samiya777/vintagevigil:latest
     container_name: vintagevigil
@@ -24,4 +23,4 @@ services:
         max-file: "3"
     volumes:
       - /root/VintageUser/.env:/root/VintageVigil/.env # environment variables
-      - /root/VintageUser/lx:/root/VintageVigil/user/lx # config files
\ No newline at end of file
+      - /root/VintageUser/user:/root/VintageVigil/user/user # config files
\ No newline at end of file
diff --git a/docker-swarm.yaml b/docker-swarm.yaml
new file mode 100644
index 0000000..8067b24
--- /dev/null
+++ b/docker-swarm.yaml
@@ -0,0 +1,42 @@
+version: '3.7'
+
+services:
+  vintagevigil:
+    image: samiya777/vintagevigil:alpine
+    environment:
+      # Remote repository path for the config files (if omitted, mount config into the container yourself)
+      - CONFIG_PATH=https://github.com/Samiya321/VintageUser/tree/main/<path>
+      # Polling interval for config-file changes (defaults to 60)
+      # - CHECK_INTERVAL=60
+      # Whether to use the built-in GitHub token for requests; without it a single IP gets only 60 requests per minute (defaults to true)
+      # - USE_TOKEN=true
+    deploy:
+      mode: replicated
+      replicas: 1
+      placement:
+        constraints:
+          - node.hostname==<node-name>
+      update_config:
+        parallelism: 1
+        failure_action: pause
+        monitor: 5s
+        max_failure_ratio: 0
+        order: stop-first
+      rollback_config:
+        parallelism: 1
+        failure_action: pause
+        monitor: 5s
+        max_failure_ratio: 0
+        order: stop-first
+    configs:
+      - source: env
+        target: /root/VintageVigil/.env
+    logging:
+      driver: json-file
+      options:
+        max-size: 20m
+        max-file: "3"
+
+configs:
+  env:
+    external: true
diff --git a/requirements.txt b/requirements.txt
index 3c60474..a616467 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,4 +7,5 @@ telebot==0.0.5
 toml==0.10.2
 python-dotenv==1.0.1
 python_jose==3.3.0
-ecdsa==0.18.0
\ No newline at end of file
+ecdsa==0.18.0
+brotli==1.1.0
\ No newline at end of file
diff --git a/time_test.sh b/time_test.sh
index 0eec3eb..904e2c3 100644
--- a/time_test.sh
+++ b/time_test.sh
@@ -27,8 +27,6 @@ for site_name in "${!urls[@]}"; do
    total_connect_time=0
    total_dns_time=0
    total_tls_time=0
-   total_redirect_time=0
-   total_pretransfer_time=0
    total_starttransfer_time=0
    total_total_time=0
 
@@ -40,8 +38,6 @@ for site_name in "${!urls[@]}"; do
        connect_time=$(echo "$result" | grep "连接时间" | awk '{print $NF}')
        dns_time=$(echo "$result" | grep "DNS解析时间" | awk '{print $NF}')
        tls_time=$(echo "$result" | grep "TLS握手时间" | awk '{print $NF}')
-       redirect_time=$(echo "$result" | grep "重定向时间" | awk '{print $NF}')
-       pretransfer_time=$(echo "$result" | grep "准备传输时间" | awk '{print $NF}')
        starttransfer_time=$(echo "$result" | grep "传输开始时间" | awk '{print $NF}')
        total_time=$(echo "$result" | grep "总时间" | awk '{print $NF}')
 
@@ -49,8 +45,6 @@ for site_name in "${!urls[@]}"; do
        total_connect_time=$(echo "$total_connect_time + $connect_time" | bc)
        total_dns_time=$(echo "$total_dns_time + $dns_time" | bc)
        total_tls_time=$(echo "$total_tls_time + $tls_time" | bc)
-       total_redirect_time=$(echo "$total_redirect_time + $redirect_time" | bc)
-       total_pretransfer_time=$(echo "$total_pretransfer_time + $pretransfer_time" | bc)
        total_starttransfer_time=$(echo "$total_starttransfer_time + $starttransfer_time" | bc)
        total_total_time=$(echo "$total_total_time + $total_time" | bc)
 
@@ -70,8 +64,6 @@ for site_name in "${!urls[@]}"; do
    printf "连接时间 平均值: %.4f 秒\n" $average_connect_time
    printf "DNS解析时间 平均值: %.4f 秒\n" $average_dns_time
    printf "TLS握手时间 平均值: %.4f 秒\n" $average_tls_time
-   printf "重定向时间 平均值: %.4f 秒\n" $average_redirect_time
-   printf "准备传输时间 平均值: %.4f 秒\n" $average_pretransfer_time
    printf "传输开始时间 平均值: %.4f 秒\n" $average_starttransfer_time
    printf "总时间 平均值: %.4f 秒\n" $average_total_time
    echo "-----------------------------------------"
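Note: docker-swarm.yaml above declares the env config as external, so it has to
exist before the stack is deployed. Assuming the stack is named vintagevigil,
the expected workflow is roughly "docker config create env .env" followed by
"docker stack deploy -c docker-swarm.yaml vintagevigil".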
diff --git a/website/base/scraper.py b/website/base/scraper.py
index 14adccc..5a3db92 100644
--- a/website/base/scraper.py
+++ b/website/base/scraper.py
@@ -31,7 +31,6 @@ async def search(
         if max_pages == 0:
             return  # Return immediately without running any tasks
 
-        # Cap the number of concurrent pages
         # Make sure the concurrency exceeds neither MAX_CONCURRENT_PAGES nor max_pages
         concurrent_pages = min(search_term["max_concurrency"], max_pages)
 
@@ -45,12 +44,6 @@ async def fetch_with_semaphore(page):
         tasks = [fetch_with_semaphore(page) for page in range(1, max_pages + 1)]
         pages_content = await asyncio.gather(*tasks, return_exceptions=True)
 
-        # Fully concurrent handling of every page
-        # tasks = [
-        #     self.fetch_products(search_term, page) for page in range(1, max_pages + 1)
-        # ]
-        # pages_content = await asyncio.gather(*tasks, return_exceptions=True)
-
         # Iterate over the results of every page
         for page_products in pages_content:
             # Handle or log exceptions, skip empty lists
diff --git a/website/base/scraper_mercari.py b/website/base/scraper_mercari.py
index e6a01e7..881a54c 100644
--- a/website/base/scraper_mercari.py
+++ b/website/base/scraper_mercari.py
@@ -64,9 +64,6 @@ async def get_response(self, method, data=None, params=None):
             self.root_url, params=params, headers=headers
         )
         response.raise_for_status()
-        # res_headers = response._response.headers.get("Cf-Cache-Status")
-        # if res_headers != "DYNAMIC":
-        #     pass
         await response.close()
 
         return await response.json()
@@ -79,11 +76,12 @@ def create_headers(self, method):
             "DPoP": self.create_headers_dpop(method),
             "X-Platform": "web",  # mercari requires this header
             "Accept": "application/json, text/plain, */*",
-            "Accept-Encoding": "deflate, gzip",
+            "Accept-Encoding": "gzip, deflate, br",
+            "Origin": "https://jp.mercari.com",
             "Content-Type": "application/json; charset=utf-8",
             # courtesy header since they're blocking python-requests (returns 0 results)
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
-            "Cache-Control": "no-cache",
+            "Cache-Control": "no-cache, no-store, must-revalidate",
             "Pragma": "no-cache",
         }
         return headers
diff --git a/website/hoyoyo.py b/website/hoyoyo.py
index af75065..adee05d 100644
--- a/website/hoyoyo.py
+++ b/website/hoyoyo.py
@@ -7,20 +7,26 @@ def __init__(self, http_client):
         headers = {
             "x-requested-with": "XMLHttpRequest",
             "Host": "cn.hoyoyo.com",
+            "Cache-Control": "no-cache",
+            "Pragma": "no-cache",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
+            "Accept": "application/json, text/javascript, */*; q=0.01",
+            "Accept-Encoding": "gzip, deflate, br",
         }
         super().__init__(
-            base_url="https://cn.hoyoyo.com/suruga~search.html",
+            base_url="http://cn.hoyoyo.com/suruga~search.html",
             page_size=24,
             http_client=http_client,
             method="GET",
             headers=headers,
         )
 
+    async def create_request_url(self, params):
+        # params is already a fully built URL string, so return it with no extra query dict
+        return params, None
+
-    async def create_search_params(self, search, page: int) -> dict:
-        return {
-            "keyword": search["keyword"],
-            "page": page,
-        }
+    async def create_search_params(self, search, page: int) -> str:
+        # The configured keyword is a complete search URL, so only the page number is appended
+        search_url = search["keyword"]
+        return f"{search_url}&page={page}"
 
     async def get_max_pages(self, search) -> int:
         response = await self.get_response(search, 1)
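Note: create_headers_dpop() itself is not shown in this patch. Purely for
orientation, a hypothetical sketch of an RFC 9449 DPoP proof built with the
python_jose and ecdsa packages already pinned in requirements.txt; the claim
names follow the RFC, everything else here is assumed:

    import time
    import uuid

    from ecdsa import NIST256p, SigningKey
    from jose import jwt

    def create_dpop(method: str, url: str, key: SigningKey) -> str:
        claims = {
            "iat": int(time.time()),   # issued-at
            "jti": str(uuid.uuid4()),  # unique token id
            "htu": url,                # HTTP target URI
            "htm": method.upper(),     # HTTP method
        }
        # A complete proof also embeds the public key as a "jwk" header
        # parameter; omitted here for brevity.
        return jwt.encode(
            claims,
            key.to_pem().decode(),
            algorithm="ES256",
            headers={"typ": "dpop+jwt"},
        )

    # key = SigningKey.generate(curve=NIST256p)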
diff --git a/website/jumpshop.py b/website/jumpshop.py
index bbae5b6..895cde0 100644
--- a/website/jumpshop.py
+++ b/website/jumpshop.py
@@ -7,12 +7,13 @@
 class JumpShop(BaseScrapy):
     def __init__(self, http_client):
         headers = {
-            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
-            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.76",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
+            "Accept-Encoding": "gzip, deflate, br",
         }
         super().__init__(
-            base_url="https://jumpshop-online.com/search",
+            base_url="http://jumpshop-online.com/search",
             page_size=20,
             headers=headers,
             http_client=http_client,
diff --git a/website/lashinbang.py b/website/lashinbang.py
index 07ef849..259b8e4 100644
--- a/website/lashinbang.py
+++ b/website/lashinbang.py
@@ -4,6 +4,13 @@
 class Lashinbang(BaseScrapy):
     def __init__(self, http_client):
+        headers = {
+            "Cache-Control": "no-cache",
+            "Pragma": "no-cache",
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
+            "Accept": "*/*",
+            "Accept-Encoding": "gzip, deflate, br",
+        }
         super().__init__(
             base_url="http://lashinbang-f-s.snva.jp",
             page_size=100,
@@ -13,13 +20,14 @@ def __init__(self, http_client):
 
     async def create_search_params(self, search, page: int) -> dict:
         return {
-            "q": search["keyword"],
-            "s6o": 1,
-            "pl": 1,
-            "sort": getattr(search["filter"], "sort", "Number18%2CScore"),
-            "limit": self.page_size,
-            "o": (page - 1) * self.page_size,  # Offset calculation for pagination
-            "n6l": 1,
+            "q": search["keyword"],  # search keyword
+            "sort": getattr(search["filter"], "sort", "Number18%2CScore"),  # sort order
+            "limit": self.page_size,  # number of items returned per page
+            "o": (page - 1) * self.page_size,  # offset, used for pagination
+            "s6o": 1,  # TODO
+            "pl": 1,  # TODO
+            "n6l": 1,  # only show items in stock; 0 shows everything, including sold-out items
+            "s1": 2,  # toggle for the all-ages restriction
             "callback": "callback",
             "controller": "lashinbang_front",
         }
diff --git a/website/mercari_items.py b/website/mercari_items.py
index 76d5ac5..816a47a 100644
--- a/website/mercari_items.py
+++ b/website/mercari_items.py
@@ -52,8 +52,5 @@ async def get_item_site(self, item):
         return "mercari_user"
 
     async def get_item_status(self, item):
-        if item.get("status") == "on_sale":
-            status = 1
-        else:
-            status = 0
-        return status
+        status = 1 if item.get("status") == "on_sale" else 0
+        return status
\ No newline at end of file
diff --git a/website/mercari_search.py b/website/mercari_search.py
index 5fb49dc..922c2d3 100644
--- a/website/mercari_search.py
+++ b/website/mercari_search.py
@@ -84,17 +84,20 @@ def create_data(self, search, page, sort_type):
             # this is hardcoded in their frontend currently, so leaving it
             "indexRouting": "INDEX_ROUTING_UNSPECIFIED",
             "searchCondition": {
-                "keyword": search["keyword"],
-                "excludeKeyword": getattr(search["filter"], "exclude_keyword", ""),
-                "sort": sort_type,
-                "order": "ORDER_DESC",
+                "keyword": search["keyword"],  # search keyword
+                "excludeKeyword": getattr(
+                    search["filter"], "exclude_keyword", ""
+                ),  # excluded keywords
+                "sort": sort_type,  # sort field
+                "order": "ORDER_DESC",  # sort direction
+                "sellerId": getattr(search["filter"], "sellerId", []),  # seller ID
                 "status": getattr(
                     search["filter"], "status", ["STATUS_ON_SALE", "STATUS_TRADING"]
-                ),
-                "categoryId": getattr(search["filter"], "category", []),
-                "brandId": getattr(search["filter"], "brandId", []),
-                "priceMin": getattr(search["filter"], "price_min", 0),
-                "priceMax": getattr(search["filter"], "price_max", 0),
+                ),  # item sale status
+                "categoryId": getattr(search["filter"], "categoryId", []),  # category
+                "brandId": getattr(search["filter"], "brandId", []),  # brand
+                "priceMin": getattr(search["filter"], "price_min", 0),  # minimum price
+                "priceMax": getattr(search["filter"], "price_max", 0),  # maximum price
             },
             # I'm not certain what these are, but I believe it's what mercari queries against
             # this is the default in their site, so leaving it as these 2
@@ -105,8 +108,5 @@ async def get_item_site(self, item):
         return "mercari"
 
     async def get_item_status(self, item):
-        if item.get("status") == "ITEM_STATUS_ON_SALE":
-            status = 1
-        else:
-            status = 0
+        status = 1 if item.get("status") == "ITEM_STATUS_ON_SALE" else 0
         return status
diff --git a/website/rennigou.py b/website/rennigou.py
index 6e0a3f3..b8f32ee 100644
--- a/website/rennigou.py
+++ b/website/rennigou.py
@@ -26,39 +26,50 @@ async def async_init(self):
     async def search(
         self, search_term, iteration_count, user_max_pages
     ) -> AsyncGenerator[SearchResultItem, None]:
-        current_page = 1
+        max_concurrency = search_term.get(
+            "max_concurrency", 20
+        )  # maximum concurrency taken from search_term, defaults to 20
+        semaphore = asyncio.Semaphore(max_concurrency)  # semaphore capping the number of concurrent fetches
 
-        # Cap the number of concurrent pages
-        concurrent_pages = search_term["max_concurrency"]
+        async def fetch_page(page_number):
+            async with semaphore:
+                return await self.fetch_products(search_term, page_number)
 
-        while True:
-            tasks = []
-            for _ in range(concurrent_pages):
-                # Decide from iteration_count and user_max_pages whether to keep adding tasks
-                if (
-                    iteration_count != 0 and current_page > user_max_pages
-                ) or not self.has_next:
-                    break
-                tasks.append(self.fetch_products(search_term, current_page))
-                current_page += 1
-
-            if not tasks:  # Exit the loop when there is nothing left to run
-                break
+        current_page = 1
+        tasks = []
 
-            current_page_contents = await asyncio.gather(*tasks, return_exceptions=True)
+        # Keep creating tasks while has_next holds and the page limit implied by iteration_count is not reached
+        while self.has_next and (
+            iteration_count == 0 or current_page <= user_max_pages
+        ):
+            # Create tasks until the concurrency limit is hit or no more pages are needed
+            while (
+                len(tasks) < max_concurrency
+                and (iteration_count == 0 or current_page <= user_max_pages)
+                and self.has_next
+            ):
+                tasks.append(fetch_page(current_page))
+                current_page += 1
 
-            for page_content in current_page_contents:
-                if isinstance(page_content, (Exception, BaseException)):
-                    continue  # Handle or log the exception
+            # Wait for the whole batch to finish with asyncio.gather
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+            tasks = []  # reset the task list for the next batch
 
-                if page_content is None or not self.has_next:
+            # Handle the results
+            for page_content in results:
+                if isinstance(page_content, BaseException):
+                    continue  # skip failed pages; iterating the exception itself would raise
+                if page_content is None:
                     self.has_next = False
-                    break  # Stop scraping when there is no next page
-
+                    break
                 for product in page_content:
                     yield product
 
-            if not self.has_next:
+            # Decide whether to keep creating new tasks
+            if not self.has_next or (
+                iteration_count != 0 and current_page > user_max_pages
+            ):
                 break
 
         self.has_next = True  # reset has_next for the next search run
@@ -87,6 +98,8 @@ def create_headers(self):
             "Authorization": f"Bearer {self.create_jwt_token()}",
             "uid": self.uid,
             "token": self.token,
+            "Accept-Encoding": "gzip, deflate, br",
+            "Accept": "application/json, text/plain, */*",
         }
         return True
diff --git a/website/suruga.py b/website/suruga.py
index 3138b1e..31cd875 100644
--- a/website/suruga.py
+++ b/website/suruga.py
@@ -1,4 +1,5 @@
 from parsel import Selector
+from urllib.parse import urlparse, parse_qs
 
 from .base.common_imports import *
 from .base.scraper import BaseScrapy
@@ -10,8 +11,9 @@ def __init__(self, http_client):
         headers = {
             "Cache-Control": "no-cache",
             "Pragma": "no-cache",
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0",
-            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7"
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+            "Accept-Encoding": "gzip, deflate, br",
         }
         super().__init__(
             base_url="http://www.suruga-ya.jp/search",
@@ -28,26 +30,29 @@ async def create_request_url(self, params):
 
     async def create_search_params(self, search, page: int) -> dict:
         # Check whether the search keyword is a URL
         is_url = "https" in search["keyword"]
-        get_param = (
-            (
-                lambda param, default="": self.get_param_value(search["keyword"], param)
-                or default
-            )
-            if is_url
-            else lambda param, default="": default
-        )
 
-        return {
-            "category": get_param("category") if is_url else "",  # category
-            "search_word": get_param("search_word") if is_url else search["keyword"],
-            "rankBy": get_param("rankBy", "modificationTime:descending")
-            if is_url
-            else "modificationTime:descending",  # sort order
-            "hendou": get_param("hendou") if is_url else "",  # 変動 (price change)
-            "page": page,
-            "adult_s": get_param("adult_s", 1) if is_url else 1,  # safe search
-            "inStock": get_param("inStock", "Off") if is_url else "Off",  # out of stock
-        }
+        if is_url:
+            # Parse the URL and extract its query parameters
+            parsed_url = urlparse(search["keyword"])
+            query_params = parse_qs(parsed_url.query)
+
+            # Flatten the query dict, keeping the first value of each parameter
+            params = {k: v[0] for k, v in query_params.items()}
+
+            # Add or override the page number
+            params["page"] = page
+
+            return params
+        else:
+            # Fall back to the default dict, filling in only search_word and page
+            return {
+                "category": "",  # category
+                "search_word": search["keyword"],  # search keyword
+                "rankBy": "modificationTime:descending",  # sort order
+                "hendou": "",  # 変動 (price change)
+                "page": page,  # page number
+                "adult_s": 1,  # whether adult listings are enabled
+                "inStock": "Off",  # whether to show out-of-stock items, hidden by default
+            }
 
     async def get_max_pages(self, search) -> int:
         res = await self.get_response(search, 1)
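Note: the URL branch of create_search_params above flattens the lists returned
by parse_qs, keeping the first value of each parameter. A usage sketch with a
hypothetical Suruga-ya search URL:

    from urllib.parse import parse_qs, urlparse

    url = "https://www.suruga-ya.jp/search?search_word=REBORN&category=5&inStock=On"
    query = parse_qs(urlparse(url).query)         # {"search_word": ["REBORN"], ...}
    params = {k: v[0] for k, v in query.items()}  # {"search_word": "REBORN", ...}
    params["page"] = 2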