From a11f65ef11496b81f6d30990979cb31cdc8df542 Mon Sep 17 00:00:00 2001
From: Yuukiy <76897913+Yuukiy@users.noreply.github.com>
Date: Mon, 2 Oct 2023 14:31:25 +0800
Subject: [PATCH] =?UTF-8?q?javbus:=20=E5=BA=94=E5=AF=B9=E6=8A=93=E5=8F=96?=
 =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=97=B6=E8=A2=AB=E9=87=8D=E5=AE=9A=E5=90=91?=
 =?UTF-8?q?=E5=88=B0=E7=99=BB=E5=BD=95=E9=A1=B5=E7=9A=84=E6=83=85=E5=BD=A2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 web/javbus.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/web/javbus.py b/web/javbus.py
index 2f2759cd3..3d64ac4df 100644
--- a/web/javbus.py
+++ b/web/javbus.py
@@ -31,9 +31,17 @@ def parse_data(movie: MovieInfo):
     if resp.status_code == 404:
         raise MovieNotFoundError(__name__, movie.dvdid)
     resp.raise_for_status()
-    html = resp2html(resp)
+    # 疑似JavBus检测到类似爬虫的行为时会要求登录,不过发现目前不需要登录也可以从重定向前的网页中提取信息
+    if resp.history and resp.history[0].status_code == 302:
+        html = resp2html(resp.history[0])
+    else:
+        html = resp2html(resp)
+    # 引入登录验证后状态码不再准确,因此还要额外通过检测标题来确认是否发生了404
+    page_title = html.xpath('/html/head/title/text()')
+    if page_title and page_title[0].startswith('404 Page Not Found!'):
+        raise MovieNotFoundError(__name__, movie.dvdid)
 
-    container = html.xpath("/html/body/div[@class='container']")[0]
+    container = html.xpath("//div[@class='container']")[0]
     title = container.xpath("h3/text()")[0]
     cover = container.xpath("//a[@class='bigImage']/img/@src")[0]
     preview_pics = container.xpath("//div[@id='sample-waterfall']/a/@href")