scraper.py
from bs4 import BeautifulSoup
from urllib import parse, request
from datetime import date
import re


class Scraper:
    def __init__(self, index='http://www.reachoutberlin.de/modules.php?op=modload&name=topics&file=index&cm=9&cb=8'):
        parsed_url = parse.urlparse(index)
        self.start = request.urlopen(index)
        self.base_url = parsed_url.scheme + "://" + parsed_url.netloc
        # Dates are a bit difficult; usually they're formatted like YYYY-MM-DD,
        # followed by a space character, but sometimes the day is missing or it's
        # followed by another character…
        self.date_matcher = re.compile(r'^(\d{4})-(\d{1,2})(-(\d{1,2}))?')
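        # Illustrative examples (hypothetical headlines in the format
        # described above):
        #   '2014-03-07 Berlin-Neukölln' -> year '2014', month '03', day '07'
        #   '2013-05 Berlin Pankow'      -> day group is None (month-only entry)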

    def get_next_page(self, document):
        # The second '.nav' element is the 'next page' arrow when one exists.
        nav_elem = document.select('.nav')[1]
        if nav_elem.get_text().strip() == '>':
            # Resolve the (possibly relative) link against the site root.
            href = parse.urljoin(self.base_url, nav_elem.get('href'))
            return BeautifulSoup(request.urlopen(href), 'html.parser')
        else:
            return None

    def get_articles_on_page(self, document):
        article_tables = document.select('table[width="98%"]')
        articles = []
        for table in article_tables:
            # Headlines are always 'YYYY-MM-DD? Berlin-DISTRICT'.
            # Sometimes they use Berlin followed by a space, usually by a dash;
            # additionally there may be some information such as a
            # train or bus station appended, but often there isn't.
            headline = table.select('tr:first-child')[0].get_text().strip()
            date_match = self.date_matcher.match(headline)
            year, month, day = date_match.group(1, 2, 4)
            # Everything after the first space is the place description.
            place = headline[headline.find(' ') + 1:]
            text = table.select('tr')[2].select('td')[1].get_text()
            article = {
                # When only YYYY-MM is given, fall back to the first of the month.
                'date': date(int(year), int(month), int(day) if day else 1),
                'month_only': day is None,
                'place': place.strip(),
                'description': text.strip()
            }
            articles.append(article)
        return articles

    def get_yearly_overviews(self):
        document = BeautifulSoup(self.start, 'html.parser')
        links = document.find_all('a')
        overviews = []
        for link in links:
            # Yearly overview pages are linked with text starting with 'Chronik'.
            if link.get_text().lower().startswith('chronik'):
                overview_link = link.get('href')
                overviews.append(parse.urljoin(self.base_url, overview_link))
        return overviews

    def scrape(self):
        overview_urls = self.get_yearly_overviews()
        articles = []
        for url in overview_urls:
            current_doc = BeautifulSoup(request.urlopen(url), 'html.parser')
            # Follow the pagination until there is no next page.
            while current_doc:
                new_articles = self.get_articles_on_page(current_doc)
                articles.extend(new_articles)
                current_doc = self.get_next_page(current_doc)
        return articles
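

# A minimal usage sketch (assumes the index page is reachable and still
# uses the markup the selectors above expect):
if __name__ == '__main__':
    scraper = Scraper()
    for article in scraper.scrape():
        print(article['date'], article['place'])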