scrape.py
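"""Scrape anitsayac.com: collect every entry link from the front page, fetch
and parse each detail page (using a local JSON cache to avoid re-downloading
entries seen before), and write the results to JSON and CSV."""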
from bs4 import BeautifulSoup
from datetime import datetime
from io_tools import outCsv
import requests
import json
import re
import time
URL = 'http://anitsayac.com/'
def main():
    date = datetime.strftime(datetime.now(), '%Y%m%d')
    cache_dict = get_cache('anitsayac_cache.json')

    # Collect the name and detail-page link for every entry on the front page.
    res = requests.get(URL)
    soup = BeautifulSoup(res.content, features='lxml')
    feminisits = []
    for feminisit in soup.findAll('span', {'class': 'xxy'}):
        a = feminisit.findChild('a')
        isim = a.getText()
        link = a.get('href')
        feminisits.append({'isim': isim, 'link': URL + link})
    with open('ref.json', 'w') as out:
        json.dump(feminisits, out, indent=2)

    # Fetch detail pages, skipping entries already present in the cache.
    feminisit_veri = []
    count = 0
    for f in feminisits:
        if f['link'] in cache_dict:
            veri = cache_dict[f['link']]
        else:
            print(f['isim'], 'not in cache')
            veri = get_data(f['link'])
            time.sleep(0.5)
        if veri:
            veri.update(f)
            feminisit_veri.append(veri)
        count += 1
        if count % 50 == 0:
            print(count, '/', len(feminisits))

    # Persist results: refresh the cache, keep a dated snapshot, export CSV.
    with open('anitsayac_cache.json', 'w') as out:
        json.dump(feminisit_veri, out, indent=2)
    with open('anitsayac_%s.json' % date, 'w') as out:
        json.dump(feminisit_veri, out, indent=2)
    keys = feminisit_veri[0].keys()
    outCsv(keys, feminisit_veri, 'anitsayac_%s.csv' % date)
def get_cache(filename):
    # Index previously scraped entries by link; start empty if no cache file exists yet.
    try:
        with open(filename) as f:
            cache = json.load(f)
    except FileNotFoundError:
        return {}
    cache_dict = {}
    for c in cache:
        cache_dict[c['link']] = c
    return cache_dict
def get_data(url):
    # Fetch and parse a single detail page; on failure, skip it instead of aborting the run.
    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.content, features='lxml')
    except Exception as e:
        print(url)
        print(e)
        return {}
    return parse_data(soup)
def parse_data(soup):
    # Detail pages list fields as "<b>Label:</b> value"; extract each label/value pair.
    body = soup.find('body')
    body_html = str(body)
    keys = [b.getText() for b in body.findAll('b')]
    q = r'(<b>%s</b>)(.+?)<'
    data = {}
    for key in keys:
        m = re.search(q % re.escape(key), body_html)
        if not m:
            print(body_html)
            return {}
        data[key.strip()] = m.groups()[1].strip()
    # For now, only a single source URL is captured.
    if body.find('a'):
        data['Kaynak:'] = body.find('a').get('href')
    if body.find('img'):
        data['Gorsel:'] = body.find('img').get('src')
    return data
if __name__ == "__main__":
    main()
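The local io_tools module is not included in this listing. The following is a hypothetical, minimal sketch of what its outCsv helper might look like, inferred only from the call outCsv(keys, feminisit_veri, 'anitsayac_%s.csv' % date) above: it is assumed to write a list of dicts to a CSV file, using the given keys as the header.

import csv

def outCsv(keys, rows, filename):
    # Write rows (a list of dicts) to filename, with keys as the column order;
    # missing fields are left blank and unexpected fields are ignored.
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        writer = csv.DictWriter(out, fieldnames=list(keys), restval='',
                                extrasaction='ignore')
        writer.writeheader()
        writer.writerows(rows)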