-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathspider.py
204 lines (184 loc) · 5.97 KB
/
spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import os
import requests
import time
from requests import RequestException
import json
from pyquery import PyQuery as pq
from pymongo import MongoClient
from config import *
from multiprocessing.pool import Pool
# Shared MongoDB handle used by save_to_mongodb(); connection settings come
# from the config module.
# NOTE(review): the client is created at import time, i.e. before the worker
# Pool is started in the __main__ section -- confirm this is safe with the
# PyMongo version in use.
client = MongoClient(MONGODB_HOST, MONGODB_PORT)
db = client[MONGODB_DB]

# Notice category number -> name of the collection that stores that category.
switch_col = dict(zip(
    (1, 2, 3),
    (MONGODB_COLLECTION1, MONGODB_COLLECTION2, MONGODB_COLLECTION3),
))

# Browser-like request headers sent with every HTTP call in this module.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/68.0.3440.106 Safari/537.36 '
    ),
}
def get__page_index(page, _type, timeout=10):
    """
    Fetch one page of the notice list and return the raw JSON text.

    :param page: page number, sent as the ``pi`` query parameter.
    :param _type: category key (1, 2 or 3) selecting which comma-separated
        notice type codes are sent as the ``type`` query parameter.
    :param timeout: seconds to wait for the server before giving up.
    :return: the response body as text on HTTP 200, otherwise None
        (non-200 status, connection error or timeout).
    """
    base_url = "https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable"
    switch = {
        1: '100,200,201,202,203,204,205,206,207,309,400,401,402,3091,4001',
        2: '301,303',
        3: '300,302,304,3041,305,306,307,308'
    }
    params = {
        'pi': page,
        'ps': 20,
        'timestamp': round(time.time() * 1000),
        'type': switch.get(_type)
    }
    try:
        # The request itself has to be inside the try: this call is what
        # raises RequestException on connection errors and timeouts.
        response = requests.get(base_url, params=params, headers=headers,
                                timeout=timeout)
    except RequestException:
        print("Request list json error")
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(text):
    """
    Parse the list-page JSON and yield the id of every notice it contains.

    :param text: JSON text of a list page; expected to be an object with a
        ``notices`` array of objects.
    :return: generator of each notice's ``id`` value (None for a notice
        without one). Yields nothing when the payload is not an object or
        when ``notices`` is missing, null or empty.
    :raises ValueError: if ``text`` is not valid JSON.
    """
    data = json.loads(text)
    if not isinstance(data, dict):
        return
    # ``or []`` also covers a "notices" key that is present but null.
    for item in data.get('notices') or []:
        yield item.get('id')
def get_page_detail(_id, timeout=10):
    """
    Fetch the JSON of a single notice.

    :param _id: notice id, appended to the notice API URL.
    :param timeout: seconds to wait for the server before giving up.
    :return: the response body as text on HTTP 200, otherwise None
        (missing id, non-200 status, connection error or timeout).
    """
    if _id is None:
        # A list entry without an id cannot be looked up.
        return None
    # str() keeps the URL building from failing on a non-string id.
    url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable/' + str(_id)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except RequestException:
        print('Request notification page error')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_detail(text):
    """
    Parse the JSON of a single notice into a flat record.

    :param text: JSON text of a notice page; expected to be an object with a
        ``notice`` object inside.
    :return: dict with the selected notice fields plus ``content`` (the
        notice HTML reduced to text with ``<style>`` elements removed), or
        None when the payload has no usable ``notice`` object.
    :raises ValueError: if ``text`` is not valid JSON.
    """
    data = json.loads(text)
    notice = data.get('notice') if isinstance(data, dict) else None
    if not isinstance(notice, dict):
        return None

    html = notice.get('html')
    if html:
        doc = pq(html)
        doc.find('style').remove()
        content = doc.text()
    else:
        # Nothing to parse; avoid handing None or '' to PyQuery.
        content = ''

    return {
        'agentName': notice.get('agentName'),
        'bidBeginTime': notice.get('bidBeginTime'),
        # The end time is read from the API's 'openBidTime' field.
        'bidEndTime': notice.get('openBidTime'),
        'buyerName': notice.get('buyerName'),
        'creatorOrgName': notice.get('creatorOrgName'),
        'districtName': notice.get('districtName'),
        'issueTime': notice.get('issueTime'),
        'projectBudget': notice.get('projectBudget'),
        'projectDirectoryName': notice.get('projectDirectoryName'),  # category
        'projectName': notice.get('projectName'),
        'projectPurchaseWayName': notice.get('projectPurchaseWayName'),
        'title': notice.get('title'),
        'content': content,
        'attachments': notice.get('attachments')  # not always present
    }
def save_attachment(attachment):
    """
    Save one attachment under ``<cwd>/attachments/`` unless it is already there.

    :param attachment: dict with ``name`` (file name passed to the download
        request) and ``value`` (server-side file path, reused as the local
        path below the attachments directory).
    :return: None. Progress and failures are reported with print().
    """
    name = attachment.get('name')
    value = attachment.get('value')
    if not value:
        print(name, "has no file path, skipped")
        return

    root = os.getcwd() + '/attachments'
    file_path = root + '/' + value
    # 'value' comes from the remote API: refuse anything (e.g. '../..') that
    # would resolve to a location outside the attachments directory.
    if not os.path.normpath(file_path).startswith(os.path.normpath(root) + os.sep):
        print(name, "has an unsafe path, skipped:", value)
        return

    # exist_ok avoids the check-then-create race between pool workers.
    os.makedirs(os.path.split(file_path)[0], exist_ok=True)
    try:
        if not os.path.exists(file_path):  # only download what is missing
            content = download_attachment(name, value)
            if content:
                with open(file_path, 'wb') as f:
                    f.write(content)
                print(name, "was successfully saved in", file_path)
        else:
            print(name, "has already been saved in", file_path)
    except IOError:
        print("IOError")
def download_attachment(name, value, timeout=30):
    """
    Request an attachment from the file download endpoint.

    :param name: file name, sent as the ``fileName`` query parameter.
    :param value: server-side file path, sent as the ``filePath`` query
        parameter.
    :param timeout: seconds to wait for the server before giving up.
    :return: the response body as bytes on HTTP 200, otherwise None
        (non-200 status, connection error or timeout).
    """
    print("Downloading attachment", name)
    base_url = 'https://www.cqgp.gov.cn/gwebsite/files'
    params = {
        'filePath': value,
        'fileName': name
    }
    try:
        response = requests.get(base_url, params=params, headers=headers,
                                timeout=timeout)
    except RequestException as error:
        # No response object exists when the request itself fails, so report
        # the endpoint and the error instead of response.url.
        print(name, "Request attachment failed", base_url, error)
        return None
    if response.status_code == 200:
        return response.content
    print(name, "Request attachment failed", response.url)
    return None
def save_to_mongodb(_type, result):
    """
    Insert a notice record into the collection for its category.

    :param _type: category key used to look up the collection name in
        ``switch_col``.
    :param result: notice record to insert; its ``_id`` identifies the
        notice and ``title`` is used in the duplicate message.
    :return: None. The outcome is reported with print().
    """
    # Imported here so the block does not depend on a file-level import change.
    from pymongo.errors import DuplicateKeyError

    col = db[switch_col[_type]]
    try:
        # Insert directly and let the unique _id index detect duplicates: a
        # separate find_one() check is not atomic, so two workers could both
        # pass it and one insert would then raise.
        inserted_id = col.insert_one(result).inserted_id
    except DuplicateKeyError:
        print(result['title'], "is already in mongodb")
        return
    if inserted_id:
        print('Successfully Saved to mongodb', result)
    else:
        print('save to mongodb failed')
def _parse_attachments(raw):
    """Decode the API's ``attachments`` value into a list, or None if unreadable."""
    import ast

    if isinstance(raw, list):
        return raw  # already decoded
    if not isinstance(raw, str):
        return None
    # SECURITY: this text comes from the remote server, so it must never be
    # passed to eval(). literal_eval accepts the same Python literals without
    # executing code; json.loads covers JSON-only tokens (true/false/null).
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError, TypeError):
        try:
            parsed = json.loads(raw)
        except ValueError:
            return None
    return parsed if isinstance(parsed, list) else None


def main(params):
    """
    Crawl one list page: fetch every notice on it, download the attachments
    and store the record in MongoDB.

    :param params: two-item sequence ``[_type, page]`` -- the category key
        and the list page number.
    :return: None.
    """
    _type = params[0]
    page = params[1]
    index_text = get__page_index(page, _type)
    if not index_text:
        return
    for _id in parse_page_index(index_text):
        detail_text = get_page_detail(_id)
        if not detail_text:
            continue
        result = parse_page_detail(detail_text)
        if not result:
            # The detail payload had no usable notice; nothing to store.
            continue
        raw_attachments = result.get('attachments')
        if raw_attachments:
            attachments = _parse_attachments(raw_attachments)
            if attachments is None:
                # Keep the raw value in the record, just skip the downloads.
                print(_id, "attachments could not be parsed, skipped")
            else:
                result['attachments'] = attachments
                for attachment in attachments:
                    if isinstance(attachment, dict):
                        save_attachment(attachment)
        result['_id'] = _id
        save_to_mongodb(_type, result)
if __name__ == '__main__':
    # One [type, page] work item for every category / page combination.
    groups = [
        [type_no, page_no]
        for type_no in range(1, TYPE_COUNT + 1)
        for page_no in range(1, MAX_PAGE + 1)
    ]
    print(groups)

    # Spread the work items over a pool of worker processes and wait for
    # all of them to finish.
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()