-
Notifications
You must be signed in to change notification settings - Fork 0
/
SpiderTest.py
49 lines (44 loc) · 1.59 KB
/
SpiderTest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import urllib.request
import urllib.parse
import re
import os
# Request headers. Referer is mandatory (the site answers 403 without it) and
# User-Agent makes the request look like it comes from a regular browser.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'referer':'https://image.baidu.com'
}
# Search-page URL template; {word} receives the percent-encoded keyword.
# The "&copyright=" parameter had been mangled into a "©" sign (decoded
# "&copy" HTML entity), which corrupted the query string.
url = "https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1577271115837_R&pv=&ic=0&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&sid=&word={word}"

# Directory the downloaded pictures are written to.
SAVE_DIR = "D://test_pic"
# Result offset step between two consecutive search pages.
PAGE_SIZE = 30
# Offsets 0, 30, ..., 270 are requested (ten pages, as before).
MAX_OFFSET = 300
# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 30

# Captures the thumbnail URL up to ".jpg". Quotes are excluded so that a
# non-jpg thumbnail cannot make the match run across JSON fields into the
# next entry.
THUMB_URL_PATTERN = re.compile(r'thumbURL":"([^"]*?\.jpg)')


def build_search_url(keyword, offset):
    """Return the search URL for *keyword* starting at result *offset*.

    The keyword is percent-encoded as UTF-8 with no "safe" characters, so
    '/', '&' and '=' typed by the user cannot alter the query string.
    """
    # quote()'s second positional parameter is `safe`, not the encoding;
    # pass both by keyword to say what is actually meant.
    encoded = urllib.parse.quote(keyword, safe='', encoding='utf-8')
    # NOTE(review): `pn` is assumed to be the result-offset parameter that
    # the endpoint honors -- confirm against the live site.
    return "{base}&pn={pn}".format(base=url.format(word=encoded), pn=offset)


def extract_thumb_urls(html):
    """Return every thumbnail ``.jpg`` URL found in *html*, in page order."""
    return THUMB_URL_PATTERN.findall(html)


def fetch_page(page_url):
    """Download *page_url* with the browser-like headers; return decoded text.

    Raises OSError (URLError/HTTPError/timeouts) on network failure and
    UnicodeDecodeError when the body is not valid UTF-8.
    """
    request = urllib.request.Request(page_url, headers=header)
    # The context manager closes the connection even if read/decode fails.
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        return response.read().decode('utf-8')


def download_image(image_url, dest_path):
    """Fetch *image_url* with the required headers and write it to *dest_path*.

    The body is read completely before the file is opened, so a failed
    transfer does not leave a truncated picture on disk.
    """
    request = urllib.request.Request(image_url, headers=header)
    with urllib.request.urlopen(request, timeout=REQUEST_TIMEOUT) as response:
        payload = response.read()
    with open(dest_path, "wb") as out:
        out.write(payload)


def main():
    """Ask for a keyword, walk the result pages and save every new thumbnail."""
    keyword = input("请输入关键字:")
    os.makedirs(SAVE_DIR, exist_ok=True)

    seen = set()  # URLs already handled, so no picture is saved twice
    saved = 0
    for offset in range(0, MAX_OFFSET, PAGE_SIZE):
        try:
            html = fetch_page(build_search_url(keyword, offset))
        except (OSError, UnicodeDecodeError):
            # Report the failing page and carry on with the next one instead
            # of falling through with a missing/stale page body.
            print("出错了!")
            print("出错页数:" + str(offset))
            continue

        for image_url in extract_thumb_urls(html):
            if image_url in seen:
                continue
            seen.add(image_url)
            print(image_url)
            target = "{folder}/pic{num}.jpg".format(folder=SAVE_DIR, num=saved + 1)
            try:
                download_image(image_url, target)
            except (OSError, ValueError):
                # One broken or malformed image URL must not abort the run.
                print("出错了!")
                continue
            saved += 1

    print("总共图片:" + str(saved))


if __name__ == "__main__":
    main()