crawler.py
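"""A small single-host web crawler.

Starting from a seed URL, it downloads every reachable page on the same host
(within a configurable path depth), mirrors the pages and their assets under
the output directory, and writes any URLs that failed to fetch to failed.txt.

Example (URL is illustrative):
    python crawler.py https://example.com -o output/ -d 1
"""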
from urllib.parse import urlsplit, urljoin
from bs4 import BeautifulSoup
import os
import time
import requests
import argparse
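
# parse_uri drops the #fragment so anchor-only variants of a page collapse into
# one URI; get_uri_paths returns the directory components of a URL path,
# treating the last segment as a file name.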
parse_uri = lambda url: urlsplit(url)._replace(fragment='')
get_uri_paths = lambda uri: uri.path.strip('/').split('/')[:-1]


class WebCrawler:
    def __init__(self, args):
        self.args = args
        self.uri = parse_uri(self.args.url)
        self.base_paths = get_uri_paths(self.uri)
        self.queue = set()   # URIs waiting to be crawled
        self.cache = {}      # uri -> content bytes on success, False on failure
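
    # A link is worth following only if it uses http/https (or has no scheme,
    # e.g. a purely relative reference) and points at the same host as the
    # start URL.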
    def validate_uri(self, uri):
        return (not uri.scheme or uri.scheme in ['http', 'https']) \
            and uri.netloc == self.uri.netloc
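
    # Depth is counted in path directories relative to the start URL's directory.
    # With depth >= 0 a URL must live under that directory, at most `depth`
    # levels deeper; with depth < 0 anything under the ancestor directory
    # |depth| levels above it is accepted.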
    def check_depth(self, uri):
        test_paths = get_uri_paths(uri)
        base_level, test_level = len(self.base_paths), len(test_paths)
        if self.args.depth < 0:
            match_len = max(base_level + self.args.depth, 0)
            return self.base_paths[:match_len] == test_paths[:match_len]
        if test_level < base_level:
            # test url can't be shorter
            return False
        return test_paths[:base_level] == self.base_paths \
            and test_level - base_level <= self.args.depth
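
    # Download a resource, mirror it on disk under <output>/<netloc>/<path>, and
    # memoise it: successful fetches cache the raw bytes, failures cache False
    # so they can be listed in failed.txt at the end of the run.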
    def fetch_and_save(self, uri):
        if uri in self.cache and self.cache[uri]:
            return self.cache[uri]
        try:
            res = requests.get(uri.geturl(), timeout=self.args.timeout)
            content = res.content
            if res.ok:
                self.cache[uri] = content
                filename = os.path.normpath(uri.netloc + uri.path)
                filepath = os.path.join(self.args.output, filename)
                dirpath = os.path.dirname(filepath)
                os.makedirs(dirpath, exist_ok=True)
                with open(filepath, 'wb') as file:
                    file.write(res.content)
                return content
        except Exception as error:
            print(error)
        print(f'Requesting {uri.geturl()} failed...')
        self.cache[uri] = False
        return None
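
    # Parse a fetched page: queue same-host href links that pass the depth check,
    # fetch <link> targets (stylesheets, icons, ...) directly, and download every
    # src-based asset (images, scripts, ...) without queueing it for crawling.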
    def crawl(self, uri):
        html = self.fetch_and_save(uri)
        if not html:
            print(f'{uri.geturl()} has nothing to crawl...')
            return
        soup = BeautifulSoup(html, 'html.parser')
        # find all tags with an href attribute (anchors, <link>, ...)
        href_tags = soup.find_all(href=True)
        for href_tag in href_tags:
            href_url = urljoin(uri.geturl(), href_tag.get('href'))
            href_uri = parse_uri(href_url)
            if not self.validate_uri(href_uri):
                continue
            if href_tag.name == 'link':
                self.fetch_and_save(href_uri)
                continue
            if href_uri in self.cache or not self.check_depth(href_uri):
                continue
            self.queue.add(href_uri)
        # find all media (images, gifs, ...) and scripts via their src attribute
        media_tags = soup.find_all(src=True)
        for media_tag in media_tags:
            media_url = urljoin(uri.geturl(), media_tag.get('src'))
            media_uri = parse_uri(media_url)
            if not self.validate_uri(media_uri):
                continue
            self.fetch_and_save(media_uri)
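
    # Drain the queue one URI at a time, then write every URL whose fetch failed
    # to <output>/<netloc>/failed.txt and print a short summary.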
    def start(self):
        self.queue.add(self.uri)
        start_time = time.perf_counter()
        while self.queue:
            uri = self.queue.pop()
            print(f'Crawling {uri.geturl()}...')
            self.crawl(uri)
        failed_urls = [
            uri.geturl() for uri, ok in self.cache.items() if not ok
        ]
        failed_file = os.path.normpath(os.path.join(
            self.args.output,
            self.uri.netloc,
            'failed.txt'
        ))
        # make sure the report directory exists even if nothing was saved
        os.makedirs(os.path.dirname(failed_file), exist_ok=True)
        with open(failed_file, 'w') as file:
            file.write('\n'.join(failed_urls))
        time_elapsed = round(time.perf_counter() - start_time, 2)
        print(f'Done! Time elapsed: {time_elapsed}s')
        print(f'Failed URLs: {len(failed_urls)} (see {failed_file})')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('url', type=str, help='starting url')
    parser.add_argument(
        '-o', '--output', type=str, default='output/',
        help='output directory (default: output/)',
    )
    parser.add_argument(
        '-t', '--timeout', type=int, default=10,
        help='request\'s timeout in seconds (default: 10)',
    )
    parser.add_argument(
        '-d', '--depth', type=int, default=0,
        help='maximum depth of url (default: 0)',
    )
    args = parser.parse_args()
    crawler = WebCrawler(args)
    crawler.start()