# crawl_web.py -- simple web crawler and keyword index (Udacity CS101 example).
def record_user_click(index, keyword, url):
    """Increment the click count stored for url under keyword, if present.

    index entries have the shape [keyword, [[url, clicks], ...]]; a missing
    keyword or url is silently ignored.
    """
    entries = lookup(index, keyword)
    if not entries:
        return
    for pair in entries:
        if pair[0] == url:
            pair[1] += 1
def add_to_index(index, keyword, url):
    """Record url under keyword in index.

    index is a list of [keyword, [[url, clicks], ...]] entries. A url newly
    added under a keyword starts with 0 clicks; a (keyword, url) pair that is
    already present is left untouched.
    """
    for entry in index:
        if keyword == entry[0]:
            # BUG FIX: scan the [url, clicks] pairs in entry[1], not entry
            # itself. The original iterated entry, so element[0] was either
            # the keyword's first character or a whole [url, clicks] pair --
            # never equal to url -- and duplicate urls were appended.
            for element in entry[1]:
                if url == element[0]:
                    return
            entry[1].append([url, 0])
            return
    index.append([keyword, [[url, 0]]])
def get_page(url):
    """Return the canned HTML for url, or "" for an unknown url.

    This is offline test data (Udacity CS101) -- no network I/O happens.
    The page text, including its original typos, is preserved byte-for-byte.
    """
    # A dict lookup replaces the if/elif chain and the bare `except:` of the
    # original; the constant-string returns could never raise, so the bare
    # except only served to silently swallow future bugs.
    cached_pages = {
        "http://www.udacity.com/cs101x/index.html":
            '''<html> <body> This is a test page for learning to crawl! <p> It is a good idea to <a href="http://www.udacity.com/cs101x/crawling.html"> learn to crawl</a> before you try to <a href="http://www.udacity.com/cs101x/walking.html">walk</a> or <a href="http://www.udacity.com/cs101x/flying.html">fly</a>.</p></body></html>''',
        "http://www.udacity.com/cs101x/crawling.html":
            '''<html> <body> I have not learned to crawl yet, but I amquite good at <a href="http://www.udacity.com/cs101x/kicking.html">kicking</a>.</body> </html>''',
        "http://www.udacity.com/cs101x/walking.html":
            '''<html> <body> I cant get enougth <a href="http://udacity.com/cs101x/index.html">crawing</a>!</body></html>''',
        "http://www.udacity.com/cs101x/flying.html":
            '<html><body>The magic words are Squeamish Ossifrage!</body></html>',
    }
    return cached_pages.get(url, "")
def union(a, b):
    """Extend list a in place with every element of b not already in a.

    Order of a is preserved; new elements keep b's relative order. Returns
    None (in-place mutator).
    """
    for item in b:
        if item in a:
            continue
        a.append(item)
def get_next_target(page):
    """Return (url, end_pos) for the first '<a href=' link in page.

    end_pos is the index of the closing quote, suitable for slicing the rest
    of the page. Returns (None, 0) when no link marker is found.
    """
    anchor = page.find('<a href=')
    if anchor == -1:
        return None, 0
    open_quote = page.find('"', anchor)
    close_quote = page.find('"', open_quote + 1)
    return page[open_quote + 1:close_quote], close_quote
def get_all_links(page):
    """Return every href url found in page, in order of appearance."""
    found = []
    url, end = get_next_target(page)
    while url:
        found.append(url)
        page = page[end:]
        url, end = get_next_target(page)
    return found
def crawl_web(seed):
    """Crawl from seed, returning a keyword index of every page reached.

    Pages are fetched with get_page, indexed word-by-word, and their links
    queued; each url is processed at most once.
    """
    index = []
    crawled = []
    tocrawl = [seed]
    while tocrawl:
        url = tocrawl.pop()
        if url in crawled:
            continue
        content = get_page(url)
        add_page_to_index(index, url, content)
        union(tocrawl, get_all_links(content))
        crawled.append(url)
    return index
def add_page_to_index(index, url, content):
    """Index every whitespace-separated word of content under url."""
    for word in content.split():
        add_to_index(index, word, url)
def lookup(index, keyword):
    """Return keyword's [[url, clicks], ...] list, or None if absent."""
    for key, postings in index:
        if key == keyword:
            return postings
    return None
# Demo: build the index from the seed page, then show that recording a click
# bumps the stored click count for the word 'good'.
index = crawl_web('http://www.udacity.com/cs101x/index.html')
# BUG FIX: the original used the Python-2-only `print x` statement, a syntax
# error under Python 3. print(x) with a single argument behaves the same in
# both Python 2 and Python 3.
print(lookup(index, 'good'))
record_user_click(index, 'good', 'http://www.udacity.com/cs101x/crawling.html')
print(lookup(index, 'good'))