-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_playwright.py
144 lines (122 loc) · 4.99 KB
/
get_playwright.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import asyncio
from playwright.async_api import async_playwright
# Timeout handed to page.goto() by the scraping functions in this file.
# Playwright expects milliseconds: 3 minutes * 60 s * 1000 ms.
timeout = 3 * 60 * 1000
async def click_button(el, n):
    """Click the element whose text is the page number ``n``.

    Args:
        el: Iterable of element handles; each must expose awaitable
            ``text_content()`` and ``click()`` methods.
        n: Page number to look for.

    Returns:
        True if an element whose stripped text parses to ``n`` was found
        and clicked, otherwise False.
    """
    for item in el:
        text = await item.text_content()
        if text is None:
            # No text to parse, so this cannot be a numbered button.
            continue
        try:
            page_number = int(text.strip())
        except ValueError:
            # Entries whose text is not an integer are skipped.
            continue
        if page_number == n:
            # The click is outside the try so its own errors are not swallowed.
            await item.click()
            return True
    return False
async def callback(page):
    """Scrape the favourite-video entries present on the current page.

    Args:
        page: Object exposing an awaitable ``query_selector_all`` (a
            Playwright page in this file's callers).

    Returns:
        A list with one dict per ``#list_videos_favourite_videos div.detail``
        element, holding the keys ``title``, ``url``, ``av_id``, ``view``
        and ``count``.

    Raises:
        AttributeError: If an entry lacks the ``.title a`` or ``.sub-title``
            element (the lookup yields None).
        IndexError: If the sub-title text has fewer than two lines.
    """
    entries = await page.query_selector_all(
        "#list_videos_favourite_videos div.detail"
    )
    data = []
    for entry in entries:
        title_el = await entry.query_selector(".title a")
        sub_title_el = await entry.query_selector(".sub-title")
        title = await title_el.text_content()
        url = await title_el.get_attribute("href")
        # Drop the query string, then take the second-to-last "/" piece,
        # which is the final path segment when the URL ends with a slash.
        av_id = url.split("?")[0].split("/")[-2]
        text = await sub_title_el.text_content()
        # First line of the sub-title is stored as "view", second as "count".
        lines = text.strip().split("\n")
        view = lines[0].replace(" ", "")
        count = lines[1].replace(" ", "")
        data.append(
            {
                "title": title,
                "url": url,
                "av_id": av_id,
                # String keys: the original used the scraped values as keys.
                "view": view,
                "count": count,
            }
        )
    return data
async def recursion_find_button(page, n=1, sleep=3500, callback=callback, data=None):
    """Walk the pagination from page ``n`` onward, collecting each page's rows.

    On every page the ``callback`` result is appended to the accumulated
    list, then the ``.page-item`` element numbered ``n + 1`` is clicked.
    The walk ends when no such element exists.

    Despite the name, this is implemented as a loop, so the number of pages
    is not bounded by the interpreter's recursion limit.

    Args:
        page: Playwright page positioned on the first page to scrape.
        n: Number of the page currently displayed.
        sleep: Milliseconds to wait after each click before scraping again.
        callback: Awaitable callable taking ``page`` and returning an
            iterable of rows for the current page.
        data: Rows collected so far; never mutated. None means empty.

    Returns:
        A new list holding ``data`` followed by the rows of every page visited.
    """
    collected = [] if data is None else list(data)
    while True:
        collected.extend(await callback(page))
        buttons = await page.query_selector_all(".page-item")
        if not await click_button(buttons, n + 1):
            return collected
        # The same sleep and callback now apply to every page; the original
        # recursive call silently fell back to the defaults after page one.
        await page.wait_for_timeout(sleep)
        n += 1
async def jable_favourite_playwright(
    url="https://jable.tv/members/297827/", localCount=None, headless=True
):
    """Collect every favourite-video row from a member page.

    Args:
        url: Member page to open.
        localCount: Number of favourites already known locally. When it
            equals the number read from the page's ``.count`` element, the
            scrape is skipped.
        headless: Whether to launch the browser without a window.

    Returns:
        The list produced by ``recursion_find_button``, or None when
        ``localCount`` matches the page's count.

    Raises:
        AssertionError: If the number of rows scraped differs from the
            count read from the page.
    """
    async with async_playwright() as p:
        # Firefox only: per the original author's note, Chromium gets
        # detected by the site while Firefox does not.
        browser = await p.firefox.launch(headless=headless)
        try:
            page = await browser.new_page()
            await page.goto(url, timeout=timeout)
            count_el = await page.wait_for_selector(".count", state="attached")
            count = int(await count_el.text_content())
            if localCount is not None and int(localCount) == count:
                print(f"favourite,检测到数目一致,已跳过,{count}")
                return None
            data = await recursion_find_button(page)
            # Raised explicitly (not via `assert`) so the check survives -O
            # while callers still see the same exception type.
            if count != len(data):
                raise AssertionError(
                    f"数目不对,请检查,预期数目:{count}, 实际数目:{len(data)}"
                )
            return data
        finally:
            # Release the browser on every exit path.
            await browser.close()
async def operate_jable_playwright(url, headless=False):
    """Scrape the metadata of a single video page.

    Args:
        url: Video page to open, e.g. ``https://jable.tv/videos/ipx-252-c/``.
        headless: Whether to launch the browser without a window.

    Returns:
        A dict with the keys ``title``, ``av_id``, ``url``, ``hsl``,
        ``count``, ``view``, ``models`` (list of ``{"href", "title"}``
        dicts) and ``tags`` (list of ``{"tag", "href"}`` dicts).

    Raises:
        AttributeError: If an expected element is missing from the page
            (the corresponding lookup yields None).
        IndexError: If fewer than two ``span.mr-3`` elements are present.
    """
    async with async_playwright() as p:
        # Firefox only: per the original author's note, Chromium gets
        # detected by the site while Firefox does not.
        browser = await p.firefox.launch(headless=headless)
        try:
            page = await browser.new_page()
            await page.goto(url, timeout=timeout)
            # The leftover pdb.set_trace() that used to sit here was removed:
            # it blocked every call waiting for debugger input.
            header = await page.query_selector("div.info-header")
            title_el = await header.query_selector(".header-left h4")
            count_el = await page.query_selector(".count")
            view_els = await header.query_selector_all("span.mr-3")
            model_els = await header.query_selector_all(".models .model")
            tag_els = await page.query_selector_all(".tags a")

            title = (await title_el.text_content()).strip()
            count = (await count_el.text_content()).strip()
            # The second span.mr-3 is the one stored as "view".
            view = (await view_els[1].text_content()).replace(" ", "").strip()

            models = []
            for model_el in model_els:
                href = await model_el.get_attribute("href")
                # A model is rendered either as an <img> or as a <span>.
                name_el = await model_el.query_selector("img")
                if name_el is None:
                    name_el = await model_el.query_selector("span")
                name = await name_el.get_attribute("data-original-title")
                models.append({"href": href, "title": name})

            tags = [
                {
                    "tag": await tag_el.text_content(),
                    "href": await tag_el.get_attribute("href"),
                }
                for tag_el in tag_els
            ]

            # Evaluates the JavaScript expression `hlsUrl` in the page.
            # NOTE(review): presumably a global holding the stream URL — confirm.
            hsl = await page.evaluate("hlsUrl")

            return {
                "title": title,
                # Fifth "/"-separated piece of the URL without query string,
                # e.g. "ipx-252-c" for https://jable.tv/videos/ipx-252-c/.
                "av_id": url.split("?")[0].split("/")[4],
                "url": url,
                "hsl": hsl,
                "count": count,
                "view": view,
                "models": models,
                "tags": tags,
            }
        finally:
            # Release the browser on every exit path.
            await browser.close()
# Script entry point: scrape one sample video page headlessly and print the
# resulting metadata dict.
if __name__ == "__main__":
    # NOTE(review): the string below is a no-op expression; it appears to
    # record the packages/versions the script was written against — confirm.
    """
    "pycryptodome",
    "cloudscraper==1.2.58",
    "playwright-stealth==1.0.5",
    "playwright==1.32.1",
    """
    sample_url = "https://jable.tv/videos/ipx-252-c/"
    scraped = asyncio.run(operate_jable_playwright(sample_url, headless=True))
    print(scraped)