link_crawler.py
# -*- coding: utf-8 -*-
import sys
# Python 2: force UTF-8 as the default string encoding
reload(sys)
sys.setdefaultencoding('utf8')
import robotparser
from downloader import Downloader
import re
from lxml import etree
from purl import URL
import urlparse
# True if both URLs point at the same host
same_host = lambda url1, url2: URL(url1).host() == URL(url2).host()
# extract all <a href="..."> values from an HTML document
get_links = lambda html: etree.HTML(html).xpath('//a/@href')
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, user_agent='Safari',
                 proxies=None, num_retries=1, scrape_callback=None, cache=None, robots_url=None,
                 only_same_host=True, **kwargs):
    """
    Recursively crawl pages starting from the given seed_url.
    robots_url: if given, the rules in that robots.txt are obeyed; otherwise they are ignored.
    """
    crawl_queue = [seed_url]
    seen = {seed_url: 0}  # URLs seen so far, mapped to their crawl depth
    num_urls = 0  # number of URLs crawled
    rp = get_robots(robots_url)  # robots.txt rules
    # build the downloader
    D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, num_retries=num_retries,
                   cache=cache, save_cache=kwargs.get('save_cache', True),
                   timeout=kwargs.get('timeout', 60))
    while crawl_queue:
        url = crawl_queue.pop()
        depth = seen[url]
        if (not rp) or rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # collect links that match the given regex
                if link_regex:
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))
                # add qualifying links to the crawl queue
                for link in links:
                    link = normalize(seed_url, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        if only_same_host and same_host(seed_url, link):
                            crawl_queue.append(link)
                        elif not only_same_host:
                            crawl_queue.append(link)
            num_urls += 1
            # stop once the maximum number of URLs has been crawled
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt', url
def get_robots(robots_url):
    """
    Build a robots.txt parser for the given URL, or return None if no URL was given.
    """
    if robots_url:
        rp = robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp
    else:
        return None
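# A minimal sketch of how the parser above is used, assuming the site actually
# serves a robots.txt at this URL (the result of can_fetch depends on its rules):
#
#     rp = get_robots('http://example.webscraping.com/robots.txt')
#     rp.can_fetch('BadCrawler', 'http://example.webscraping.com/index')
#     # -> False if that robots.txt disallows the 'BadCrawler' user agent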
def normalize(seed_url, link):
    """
    Resolve a relative link against seed_url into an absolute URL
    and strip the fragment (hash) part.
    """
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)
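# For example (assuming the example.webscraping.com URL scheme used below):
#
#     normalize('http://example.webscraping.com/index', '/view/1#top')
#     # -> 'http://example.webscraping.com/view/1'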
if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0,
                 max_urls=15, num_retries=1, user_agent='BadCrawler')
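# A usage sketch with robots.txt checking and a scrape callback. This is a sketch
# only: 'print_title' is a hypothetical callback (not part of this module) and the
# robots.txt URL is assumed to exist on the target site:
#
#     def print_title(url, html):
#         # print each page's <title>; return no extra links to crawl
#         title = etree.HTML(html).xpath('//title/text()')
#         print url, title[0] if title else ''
#         return []
#
#     link_crawler('http://example.webscraping.com', '/(index|view)', delay=3,
#                  robots_url='http://example.webscraping.com/robots.txt',
#                  user_agent='GoodCrawler', scrape_callback=print_title)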