-
Notifications
You must be signed in to change notification settings - Fork 0
/
page_parsing.py
54 lines (40 loc) · 1.5 KB
/
page_parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
''' Author- FGV587 '''
from bs4 import BeautifulSoup
import requests
import time
import pymongo
import re
# MongoDB handles shared by the spiders below.
# Client for the MongoDB server on localhost, port 27017.
client = pymongo.MongoClient('localhost',27017)
# Database named 'ceshi' on that server.
ceshi = client['ceshi']
# Collection 'url_list4': receives {'url': ...} documents from get_links_from.
url_list = ceshi['url_list4']
# Collection 'item_info4': receives the item records built by get_item_info.
item_info= ceshi['item_info4']
# spider 1
def get_links_from(channel, pages, who_sells=0):
    """Scrape one listing page of a channel and store the item links found.

    The listing URL is built as ``{channel}{who_sells}/pn{pages}/``. For every
    ``td.t a.t`` anchor on that page, the href (with its query string
    removed) is inserted into the ``url_list`` collection as
    ``{'url': link}`` and printed, provided it contains
    ``http://bj.58.com/``. Pages without any ``td.t`` cell are ignored.

    Args:
        channel: Base URL of the channel; the seller type and page number
            are appended directly to it.
        pages: Page number placed after ``pn`` in the URL.
        who_sells: Path segment placed before the page number (default 0).

    Returns:
        None. Results are written to ``url_list`` and echoed to stdout.

    Raises:
        requests.RequestException: If the request fails or exceeds the
            10 second timeout.
    """
    list_view = '{}{}/pn{}/'.format(channel, who_sells, pages)
    # A timeout keeps one stalled connection from hanging the whole crawl.
    web_data = requests.get(list_view, timeout=10)
    # Pause after every request (presumably to go easy on the site).
    time.sleep(0.5)
    soup = BeautifulSoup(web_data.text, 'lxml')

    if not soup.find('td', 't'):
        # No listing cells on this page: nothing to collect.
        return

    for link in soup.select('td.t a.t'):
        href = link.get('href')
        if not href:
            # Anchors without an href would otherwise crash on .split().
            continue
        item_link = href.split('?')[0]
        # Literal substring test: the previous regex left the dots
        # unescaped, so they matched any character.
        if 'http://bj.58.com/' not in item_link:
            continue
        url_list.insert_one({'url': item_link})
        print(item_link)
def get_item_info(url):
    """Fetch one item page, extract its details and store them.

    The page is skipped when the ``src`` of its first
    ``<script type="text/javascript">`` contains a ``404`` path segment.
    Otherwise a record with the keys ``title``, ``price``, ``date``,
    ``area`` and ``url`` is inserted into the ``item_info`` collection and
    printed. Any field whose element is missing from the page is stored as
    ``None``.

    Args:
        url: Address of the item page to fetch.

    Returns:
        None. Results are written to ``item_info`` and echoed to stdout.

    Raises:
        requests.RequestException: If the request fails or exceeds the
            10 second timeout.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    web_data = requests.get(url, timeout=10)
    soup = BeautifulSoup(web_data.text, 'lxml')

    # The script tag or its src attribute may be absent (e.g. inline
    # scripts); treat that as "not a 404 page" instead of crashing.
    script = soup.find('script', type="text/javascript")
    script_src = script.get('src') if script is not None else None
    if script_src and '404' in script_src.split('/'):
        return

    title_tag = soup.title
    price_tags = soup.select('span.price.c_f50')
    date_tags = soup.select('.time')
    area_links = soup.select('.c_25d a')

    # Materialise stripped_strings into a list: it is a generator, which
    # cannot be BSON-encoded and prints as an opaque object.
    if soup.find_all('span', 'c_25d') and len(area_links) > 1:
        area = list(area_links[1].stripped_strings)
    else:
        area = None

    record = {
        'title': title_tag.text if title_tag is not None else None,
        'price': price_tags[0].text if price_tags else None,
        'date': date_tags[0].text if date_tags else None,
        'area': area,
        'url': url,
    }
    # insert_one adds an '_id' key to the dict it is given; insert a copy so
    # the printed record keeps exactly the five fields above.
    item_info.insert_one(dict(record))
    print(record)