crawler_JWC(XXMH).py
# You need to download ChromeDriver in a version that matches the locally installed Chrome browser,
# and place it both in the Chrome installation directory and in the directory this Python script runs from.
# Remaining improvement: the automated login currently has to open a visible browser window
# (a headless-driver sketch is included after the imports below).
import sqlite3
import requests
import re
from bs4 import BeautifulSoup
import bs4
from random import randint
import time
import schedule
from selenium import webdriver
from selenium.webdriver import ChromeOptions
from selenium.webdriver.chrome.options import Options
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
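
# The note at the top flags that the automated login opens a visible browser window. Below is a
# minimal sketch of how that could be avoided with the already-imported Options class; it is an
# illustration only -- the helper is not called by doSth(), and it assumes a ChromeDriver binary
# that supports headless mode is available on the PATH.
def make_headless_driver():
    # Build a Chrome driver that runs without opening a window.
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    return webdriver.Chrome(options=options)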
def doSth():
    # Set up the proxy, headers, and randomly rotated user agents to get around anti-crawling measures
    appKey = "R0owSVhtV1pNaVBhZFNDUDpOVGhydzdyWEJqRnhsemNI"
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'webvpn.bupt.edu.cn',
        'Referer': 'https://webvpn.bupt.edu.cn/http/77726476706e69737468656265737421fdee0f9e32207c1e7b0c9ce29b5b/index.portal',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }
    headers2 = {
        'Proxy-Authorization': 'Basic ' + appKey,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'webvpn.bupt.edu.cn',
        'Referer': 'https://webvpn.bupt.edu.cn/http/77726476706e69737468656265737421fdee0f9e32207c1e7b0c9ce29b5b/index.portal',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }
    USER_AGENTS = [
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
        "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
        "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
        "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
        "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
        "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
        "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
        "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
        "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
        "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
    ]
    # Log in automatically with selenium; after reaching the information portal, return valid cookies
    # response = webdriver.Chrome(executable_path='C:\Program Files (x86)\Google\Chrome\Application\chromedriver')
    response = webdriver.Chrome()
    response.get('https://webvpn.bupt.edu.cn/login')
    response.find_element_by_name('username').send_keys('**********')
    response.find_element_by_name('password').send_keys('******')
    response.find_element_by_class_name('el-button-login').click()
    response.implicitly_wait(10)
    # If the account is already logged in elsewhere, the site shows a dialog asking whether to kick
    # the other session and continue, so check whether that dialog has appeared
    from selenium.common.exceptions import NoSuchElementException
    try:
        element = response.find_element_by_id('layui-layer1')
    except NoSuchElementException:  # no dialog appeared
        pass
    else:  # dialog appeared
        response.find_element_by_xpath("//a[@class='layui-layer-btn0']").click()
    # After login a new page opens in a new window; point the driver at that window
    response.implicitly_wait(10)
    response.switch_to_window(response.window_handles[-1])
    # Click the information-portal button
    element2 = response.find_element_by_class_name('vpn-content-block-panel__content').click()
    # On the information-portal login page, fill in the username and password
    response.implicitly_wait(10)
    response.switch_to_window(response.window_handles[-1])
    response.find_element_by_name('username').send_keys('**********')
    response.find_element_by_name('password').send_keys('******')
    response.find_element_by_xpath("//input[@type='submit']").click()
    # On the portal page, extract the cookies
    response.implicitly_wait(10)
    response.switch_to_window(response.window_handles[-1])
    webCookies = response.get_cookies()
    # Close the browser window
    response.close()
    # Convert the extracted cookies into a dict to get the cookie set we need
    webVpncookies = {}
    for item in webCookies:
        webVpncookies[item['name']] = item['value']
    cookies = {'refresh': '1'}
    cookies.update(webVpncookies)
    print(cookies)
    # Open the "latest notices" page of the information portal
    url = 'https://webvpn.bupt.edu.cn/http/77726476706e69737468656265737421fdee0f9e32207c1e7b0c9ce29b5b/index.portal?.pn=p1778'
    response2 = requests.get(url, headers=headers, cookies=cookies)  # send the request
    # print(response2.status_code)  # print the HTTP status code; 200 means the request succeeded
    # print(response2.text)
    # Extract the page content with BeautifulSoup
    soup = BeautifulSoup(response2.text, 'lxml')
    i = 1  # counts the extracted notices; each one was originally saved as "i.txt"
    # Connect to the database and refresh its contents
    db = sqlite3.connect('test.db')
    cursor = db.cursor()
    # cursor.execute('create table news_JWC(title varchar(200) primary key,url varchar(200))')
    cursor.execute('delete from news_JWC')
    # Find the parent tag that holds the notice entries
    for t in soup.find('ul', class_="newslist list-unstyled").children:
        # Only handle real tags
        if isinstance(t, bs4.element.Tag):
            # The notice sits inside <a>...</a>
            s = t.find('a')
            # Store the link and title in a file
            file = 'test.txt'
            with open(file, 'a', encoding="utf-8") as data:
                data.write(str(i))
                data.write('\n')
                data.write(s.string)
                data.write('\n')
                data.write('https://webvpn.bupt.edu.cn/http/77726476706e69737468656265737421fdee0f9e32207c1e7b0c9ce29b5b/')
                data.write(s.get('href'))
                data.write('\n')
            # The path to each individual notice is stored in the href attribute
            print(s.get('href'))
            # Fetch the ten newest notices
            if i < 11:
                url_new = "https://webvpn.bupt.edu.cn/http/77726476706e69737468656265737421fdee0f9e32207c1e7b0c9ce29b5b/" + s.get('href')
                headers2['Connection'] = 'keep-alive'
                # Pick a random user agent for the request headers
                random_agent = USER_AGENTS[randint(0, len(USER_AGENTS) - 1)]
                headers2['User-Agent'] = random_agent
                # Address of the IP proxy server
                ip_port = 'secondtransfer.moguproxy.com:9001'
                proxy = {"http": "http://" + ip_port, "https": "https://" + ip_port}
                while True:  # keep retrying until the site is reached
                    try:
                        # The except clauses below catch requests exceptions and wait for the
                        # network to recover, so the program keeps running without interruption
                        r = requests.get(url_new, headers=headers2, timeout=None, proxies=proxy, verify=False,
                                         allow_redirects=False)
                        break
                    except requests.exceptions.ProxyError:
                        print('ProxyError -- please wait 3 seconds')
                        time.sleep(3)
                    except requests.exceptions.ChunkedEncodingError:
                        print('ChunkedEncodingError -- please wait 3 seconds')
                        time.sleep(3)
                    except:
                        print('Unfortunately -- an unknown error happened, please wait 3 seconds')
                        time.sleep(3)
                while r.status_code != 200:
                    time.sleep(3)
                    r = requests.get(url_new, headers=headers2, cookies=cookies, timeout=None, proxies=proxy,
                                     verify=False, allow_redirects=False)
                '''# Create the file "i.txt"
                i_str = str(i)
                filename = i_str + '.txt'
                # open the file with utf-8 encoding
                f = open(filename, 'w', encoding="utf-8")
                f.write(r.text)
                f.close()'''
                r.encoding = 'utf-8'
                print(r)
                # Extract the title and the publication date
                title_begin = r.text.find("<h1 class")
                title_end = r.text.find("</h1>", title_begin)
                title = r.text[title_begin + 24: title_end]
                date_begin = r.text.find("发布时间")
                date_end = r.text.find("</span>", date_begin)
                date = r.text[date_begin + 6: date_end - 5]
                title_new = title + "[" + date + "]"
                # Write the record to the database (parameterized so quotes in the title cannot break the SQL)
                sql = "INSERT INTO news_JWC(title, url) VALUES (?, ?)"
                cursor.execute(sql, (title_new, url_new))
                db.commit()
                # Drop the connection to the notice page and pause to avoid triggering the anti-crawler
                headers2['Connection'] = 'false'
                time.sleep(10)
            i = i + 1
    db.close()
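
# The CREATE TABLE statement inside doSth() is commented out, so the news_JWC table must already exist
# before the DELETE/INSERT cycle runs. Below is a minimal sketch of a one-time setup step; the helper
# name and the idea of calling it once before the scheduling loop are assumptions, but the column
# definitions are taken from the commented-out statement above.
def ensure_table():
    # Create the news_JWC table if it is not already there.
    db = sqlite3.connect('test.db')
    db.execute('create table if not exists news_JWC(title varchar(200) primary key, url varchar(200))')
    db.commit()
    db.close()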
schedule.every().day.at("08:00").do(doSth)
while True:
    schedule.run_pending()
    # doSth()
    time.sleep(60)