forked from vikas0713/wikiCrawler
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.py
32 lines (28 loc) · 949 Bytes
/
crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import sys
import urllib2

from bs4 import BeautifulSoup
# Crawl the Wikipedia "List of diseases (A)" page: follow every link found in
# the list items of the main content area and print each linked article's
# name together with its first paragraph.

INDEX_URL = 'http://en.wikipedia.org/wiki/List_of_diseases_(A)'
SITE_ROOT = 'http://en.wikipedia.org'
CONTENT_ATTRS = {'id': 'mw-content-text'}
# Named explicitly so parsing does not depend on which optional parsers
# happen to be installed alongside Beautiful Soup.
HTML_PARSER = 'html.parser'


def format_record(name, description):
    """Return the output line for one article as UTF-8 encoded bytes.

    Both arguments are text (unicode) strings. Encoding happens once, here,
    so that non-ASCII titles can be printed regardless of the console's
    default encoding.
    """
    line = u"name: %s description: %s" % (name, description)
    return line.encode('utf-8')


def fetch_soup(url):
    """Download *url* and return its parsed BeautifulSoup document.

    Any error raised by ``urllib2.urlopen`` or by the parser propagates to
    the caller. The HTTP response is closed whether or not parsing succeeds.
    """
    response = urllib2.urlopen(url)
    try:
        return BeautifulSoup(response, HTML_PARSER)
    finally:
        response.close()


def crawl_link(anchor):
    """Fetch the article behind *anchor* and print its first paragraph.

    Anchors without an ``href`` are skipped. A page that cannot be fetched
    or parsed is reported on stderr and skipped. A page without the main
    content div prints ``there is error``; a page whose content div holds
    no paragraph prints nothing.
    """
    href = anchor.get('href')
    if not href:
        return

    try:
        page = fetch_soup(SITE_ROOT + href)
    except Exception as exc:
        # Per-link boundary of a best-effort crawl: one bad link must not
        # stop the run, but the failure is reported instead of hidden.
        # KeyboardInterrupt and SystemExit are not caught here.
        sys.stderr.write("skipping %r: %r\n" % (href, exc))
        return

    content = page.find('div', attrs=CONTENT_ATTRS)
    if content is None:
        print("there is error")
        return

    paragraph = content.find('p')
    if paragraph is not None:
        print(format_record(anchor.text, paragraph.text))


def main():
    """Crawl every link in the list items of the index page.

    Tables are removed from the index first, so links inside them are not
    followed. A failure to download the index page itself is not handled
    and terminates the script.
    """
    index = fetch_soup(INDEX_URL)
    for table in index.findAll('table'):
        table.extract()

    content = index.find('div', attrs=CONTENT_ATTRS)
    for item in content.findAll('li'):
        for anchor in item.findAll('a'):
            crawl_link(anchor)


if __name__ == '__main__':
    main()