-
Notifications
You must be signed in to change notification settings - Fork 0
/
websiteScrape_lab8.py
40 lines (33 loc) · 1.18 KB
/
websiteScrape_lab8.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
"""Summarise the AMNH Richard Gilder Graduate School seminar listing.

Downloads the seminars page, finds every line carrying a ``dd-Mon-yy`` date
and prints, for each such talk, the date, the speaker line and the title,
followed by a blank line.  After the last talk it prints the total number of
talks and how many speaker lines mention "American Museum".
"""
try:
    import urllib2
except ImportError:  # Python 3 moved urllib2's urlopen into urllib.request
    import urllib.request as urllib2
import re

SEMINAR_URL = "http://www.amnh.org/our-research/richard-gilder-graduate-school/academics-and-research/seminars-and-conferences"

# A talk is recognised by a date such as "07-Mar-16" anywhere on a line.
DATE_RE = re.compile(
    r"(\d\d)-(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)-(\d\d)"
)

# Two-digit years are printed as four digits ("16" -> "2016").
CENTURY_PREFIX = "20"

# Text of interest follows this fragment of the table-cell tag.
CELL_MARKER = 'top">'

# This scraper reads the speaker two lines below the date line and the
# title four lines below it.
SPEAKER_OFFSET = 2
TITLE_OFFSET = 4

AMNH_MARKER = "American Museum"


def _fetch_lines(url):
    """Return the lines of the page at *url* as native ``str`` objects."""
    page = urllib2.urlopen(url)
    try:
        raw_lines = page.readlines()
    finally:
        # Close the connection even when reading fails.
        page.close()
    # Python 3 yields bytes; Python 2 already yields str and passes through.
    return [
        raw if isinstance(raw, str) else raw.decode("utf-8", "replace")
        for raw in raw_lines
    ]


def _line_at(lines, index):
    """Return ``lines[index]``, or ``""`` when *index* is past the end."""
    if index < len(lines):
        return lines[index]
    return ""


def _cell_text(line):
    """Return the stripped text that follows ``top">`` in *line*.

    When the marker is absent the whole stripped line is returned instead of
    slicing from a bogus offset.
    """
    start = line.find(CELL_MARKER)
    if start == -1:
        return line.strip()
    return line[start + len(CELL_MARKER):].strip()


def _clean_title(text):
    """Drop ``<em>`` tags from *text* and turn quote entities into ``"``."""
    text = text.replace("<em>", "").replace("</em>", "")
    # NOTE(review): the pattern in the original source was garbled into a
    # bare quote character; "&quot;" is the presumed original - confirm.
    return text.replace("&quot;", '"')


def parse_talks(weblines):
    """Extract the talks listed in *weblines*.

    Args:
        weblines: sequence of text lines of the seminar page.

    Returns:
        A list of ``(date, speaker, title)`` string tuples, one per line that
        contains a ``dd-Mon-yy`` date, in page order.  Speaker or title is
        ``""`` when the page ends before the line that should hold it.
    """
    talks = []
    for index, line in enumerate(weblines):
        found = DATE_RE.search(line)
        if found is None:
            continue
        day, month, year = found.groups()
        when = "%s %s %s%s" % (day, month, CENTURY_PREFIX, year)
        speaker = _cell_text(_line_at(weblines, index + SPEAKER_OFFSET))
        title = _clean_title(
            _cell_text(_line_at(weblines, index + TITLE_OFFSET))
        )
        talks.append((when, speaker, title))
    return talks


def build_report(weblines):
    """Return the output lines of the script for the page *weblines*.

    Each talk contributes its date, speaker, title and an empty line; the
    last two entries are the ``Talks:`` and ``Amnh-Talks:`` totals.
    """
    talks = parse_talks(weblines)
    report = []
    amnh_talks = 0
    for when, speaker, title in talks:
        report.extend([when, speaker, title, ""])
        if AMNH_MARKER in speaker:
            amnh_talks += 1
    report.append("Talks: %d" % len(talks))
    report.append("Amnh-Talks: %d" % amnh_talks)
    return report


def main():
    """Fetch the seminar page and print the report to standard output."""
    for out_line in build_report(_fetch_lines(SEMINAR_URL)):
        # Single-argument print() behaves the same on Python 2 and 3.
        print(out_line)


if __name__ == "__main__":
    main()