forked from linpingta/lianjia-eroom-analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
/
eroom_finder.py
171 lines (150 loc) · 6.04 KB
/
eroom_finder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# -*- coding: utf-8 -*-
# vim: set bg=dark noet ts=4 sw=4 fdm=indent :
""" DESCRIPTION OF WORK"""
__author__ = "linpingta"
import argparse
import logging
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import math
import requests
import lxml
import re
import time
import datetime
import traceback
def re_match(re_pattern, string, errif=None):
    """Return the first match of *re_pattern* in *string*, stripped.

    Falls back to *errif* (default None) when the pattern matches nothing.
    """
    matches = re.findall(re_pattern, string)
    if matches:
        return matches[0].strip()
    return errif
def get_house_info(start_url, sess):
    """Fetch *start_url* via *sess* and return the listed house count.

    Scrapes the "共找到 N 套 ... 二手房" banner; returns the count as a
    stripped string. Raises IndexError if the banner is absent.
    """
    page_text = sess.get(start_url).text
    counts = re.findall('共找到<span> (.*?) </span>套.*二手房', page_text)
    return counts[0].strip()
def get_info_dic(info, area, city_name):
    """Parse one "info clear" listing card into a flat attribute dict.

    Parameters:
        info: bs4 tag for a single listing card (serialized and regex-parsed).
        area: human-readable area name, stored under the 'area' key.
        city_name: lianjia subdomain used to build the detail-page link.

    Raises (IndexError/ValueError/TypeError) on malformed cards — e.g.
    parking-spot listings with fewer icon fields; crawl_data catches and
    counts those as errors.
    """
    # Serialize the tag ONCE: str() on a bs4 tag re-renders its HTML every
    # call, and the original did it for every field.
    html = str(info)
    info_dic = {
        'area': area,
        'title': re_match('target="_blank">(.*?)</a><!--', html),
        'community': re_match('xiaoqu.*?target="_blank">(.*?)</a>', html),
        'position': re_match('<a href.*?target="_blank">(.*?)</a>.*?class="address">', html),
        'tax': re_match('class="taxfree">(.*?)</span>', html),
        'total_price': float(re_match('class="totalPrice totalPrice2"><i> </i><span class="">(.*?)</span><i>万', html)),
        'unit_price': float(re_match('data-price="(.*?)"', html)),
    }
    hhid = re.findall('data-housecode="(.*?)"', html)[0]
    info_dic.update({
        'hhid': hhid,
        'link': f'https://{city_name}.lianjia.com/ershoufang/{hhid}.html',
    })
    # Six '|'-separated icon fields expected:
    # type | size | direction | fitment | level | build time.
    icons = re.findall('class="houseIcon"></span>(.*?)</div>', html)[0].strip().split('|')
    info_dic.update({
        'hourseType': icons[0].strip(),
        'hourseSize': float(icons[1].replace('平米', '')),
        'direction': icons[2].strip(),
        'fitment': icons[3].strip(),
        'level': icons[4].strip(),
        'buildTime': icons[5].strip(),
    })
    return info_dic
# Extra URL segment appended to every listing URL (e.g. a price filter);
# empty string means "no filter". Read by crawl_data().
# NOTE(review): this name shadows the builtin `filter` — consider renaming
# (would require updating crawl_data in the same change).
filter = ''
def crawl_data(sess, real_dict, city_name):
    """Crawl every area in *real_dict* and return a list of listing dicts.

    real_dict maps human-readable area names to lianjia URL slugs.
    Cards that fail to parse (e.g. parking spots) are counted and skipped.
    """
    page_url_tpl = 'https://%s.lianjia.com/ershoufang/{}/pg{}{}/' % city_name
    start_url_tpl = 'https://%s.lianjia.com/ershoufang/{}/{}' % city_name
    results = []
    parsed_count = 0
    failed_count = 0
    for area_name, area_slug in real_dict.items():
        house_num = get_house_info(start_url_tpl.format(area_slug, filter), sess)
        print('{}: 二手房源共计「{}」套'.format(area_name, house_num))
        time.sleep(2)
        # Lianjia caps results at 3000 listings, 30 per page.
        page_total = int(math.ceil(min(3000, int(house_num)) / 30.0))
        for page_no in tqdm(range(1, page_total + 1), desc=area_name):
            page_html = sess.get(page_url_tpl.format(area_slug, page_no, filter)).text
            soup = BeautifulSoup(page_html, 'lxml')
            for card in soup.find_all(class_="info clear"):
                try:
                    results.append(get_info_dic(card, area_name, city_name))
                except Exception:
                    traceback.print_exc()
                    print("icons <= 5 means not house, but car position")
                    failed_count += 1
                parsed_count += 1
    print("after crawl, total_num[%s] err_num[%s]" % (parsed_count, failed_count))
    return results
def main(city_name, area_kind):
    """Crawl lianjia ershoufang listings for *city_name*; return a DataFrame.

    Parameters:
        city_name: lianjia subdomain ('bj', 'hz', 'su'); any other value
            aborts the process with exit code 1.
        area_kind: 'small' selects the fine-grained area dict, anything
            else the district-level one.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format='[%(filename)s:%(lineno)s - %(funcName)s %(asctime)s;%(levelname)s] %(message)s',
        datefmt='%a, %d %b %Y %H:%M:%S'
    )

    # Chinese area name -> lianjia URL slug, per city.
    area_dic = {}
    area_dic_small = {}
    if city_name == "bj":
        # all beijing
        area_dic = {
            '朝阳区': 'chaoyang',
            '海淀区': 'haidian',
            '西城区': 'xicheng',
        }
        area_dic_small = {
            '五道口': 'wudaokou',
        }
    elif city_name == "hz":
        area_dic = {
            '钱塘区': 'qiantangqu',
        }
        area_dic_small = {
            # define as real need
        }
    elif city_name == "su":
        area_dic = {
            # '相城': 'xiangcheng',
            '姑苏': 'gusu',
            # '园区': 'gongyeyuan',
            # '吴中': 'wuzhong'
        }
        area_dic_small = {
            # 吴中
            '城南': 'chengnan1',
            '郭巷': 'guoxiang',
            '尹山湖': 'yinshanhu',
            '越溪': 'yuexi',
            # 园区
            '娄葑': 'loufeng1',
            # 相城
            '元和': 'yuanhe',
            '渭塘': 'weitang',
            '高铁新城': 'gaotiexincheng'
        }
    else:
        print("no area dic defined in city:%s, fill it first" % city_name)
        # The `exit` builtin comes from the site module and is not guaranteed
        # in all run modes; SystemExit(1) aborts with the same exit code.
        raise SystemExit(1)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
        'Referer': 'https://%s.lianjia.com/ershoufang/' % city_name}
    sess = requests.session()
    # Warm-up request: acquire session cookies before crawling listing pages.
    sess.get('https://%s.lianjia.com/ershoufang/' % city_name, headers=headers)

    real_dict = area_dic_small if area_kind == 'small' else area_dic
    data_info_list = crawl_data(sess, real_dict, city_name)
    data = pd.DataFrame(data_info_list)
    # Uncomment to persist the crawl result:
    #data.to_csv("eroom_time__%s_detail__%s__area_%s.csv" % (datetime.datetime.now().strftime('%Y%m%d'), int(time.time()), len(area_dic.values())), encoding='utf-8-sig')
    return data
if __name__ == '__main__':
    # Fix: `example_word` was only ever a local variable of main(), so
    # referencing it here raised NameError before the parser was even built.
    # Define the epilog text at the point of use instead.
    example_word = """
    DESCRIBE ARGUMENT USAGE HERE
    python main.py --help
    """
    parser = argparse.ArgumentParser(prog=__file__, description='code description', epilog=example_word,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    # add parameter if needed
    parser.add_argument('-v', '--version', help='version of code', action='version', version='%(prog)s 1.0')
    parser.add_argument('--area_level', help='level of area to fetch', type=str, default='big')
    parser.add_argument('--city_name', help='city to fetch', type=str, default='bj')
    args = parser.parse_args()
    main(args.city_name, args.area_level)