scrap_forever.py
import requests
import re
import argparse
import concurrent.futures
import urllib3
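# verify=False is used in the requests below, so silence urllib3's InsecureRequestWarning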
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
parser = argparse.ArgumentParser(description='Requests.')
parser.add_argument('-t', dest='scrap_url', help='https://site.com.br', required=True)
parser.add_argument('-o', '--output', type=argparse.FileType(mode='w'), dest='scrap_out', help='Saves output data.',
                    required=True)
parser.add_argument('-p', '--proxy', dest='scrap_proxy', help='-p http://127.0.0.1:8080', required=False)
parser.add_argument('-a', '--auth', dest='scrap_auth', help='-a Basic aqsdiqjewqd==', required=False)
parser.add_argument('-c', '--cookie', dest='scrap_cookie', help='-c PHPSESSION=cookie', required=False)
parser.add_argument('--user-agent', dest='scrap_user_agent', help='--user-agent Mozilla/5.0', required=False)
parser.add_argument('-n', '--threads', dest='num_threads', type=int, default=10,
                    help='Number of threads (default: 10)')
args = parser.parse_args()
if len(args.scrap_url) <= 1:
    print('''
python3 scrap_forever.py -h/--help
-t              Target to web scrape (Ex. -t https://site.com)
-o              Output file (Ex. -o output.txt)
-p/--proxy      Set proxy for requests (Ex. -p http://127.0.0.1:8080)
-a/--auth       Set Authorization header (Ex. -a Basic aqsdiqjewqd==)
-c/--cookie     Set cookie (Ex. -c PHPSESSION=cookie)
--user-agent    Set User-Agent header (Ex. --user-agent Mozilla/5.0)
-n/--threads    Number of threads (default: 10)
Usage:
    ./scrap_forever.py -t https://site.com -o output.txt
''')
# global list of collected URLs
global_url = []
# base url for the requests
first_url = args.scrap_url
# request a url and return its html body
def resq_urls(url_x):
    # only send headers that were actually provided on the command line
    headers = {}
    if args.scrap_auth:
        headers['authorization'] = args.scrap_auth
    if args.scrap_cookie:
        headers['cookie'] = args.scrap_cookie
    if args.scrap_user_agent:
        headers['user-agent'] = args.scrap_user_agent
    if args.scrap_proxy is None:
        response = requests.get(url_x, verify=False, headers=headers)
    else:
        proxy = {'http': args.scrap_proxy, 'https': args.scrap_proxy}
        response = requests.get(url_x, proxies=proxy, verify=False, headers=headers)
    return response.text
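# e.g. resq_urls('https://site.com.br') returns the raw HTML of the page as text,
# fetched through the optional proxy and with any auth/cookie/user-agent headers supplied.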
# find absolute URLs in the page that belong to the target
def find_url(response_x):
    comp = re.compile(
        r"https?://(?:www\.)?[-a-zA-Z0-9@:%._+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_+.~#?&/=]*)")
    x = re.findall(comp, response_x)
    for found in x:
        if first_url in found and found not in global_url:
            global_url.append(found)
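# Illustrative match (hypothetical page content): from '<a href="https://site.com.br/blog?id=1">'
# the pattern above yields 'https://site.com.br/blog?id=1'; only matches containing first_url are kept.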
# collect href values and normalise them against the target url
def find_href(response_x):
    comp_href = re.compile(r'href=[\'"]?([^\'" >]+)')
    x = re.findall(comp_href, response_x)
    for href in x:
        if href.startswith('http'):
            # absolute link: keep it only when it points to the target
            if first_url in href and href not in global_url:
                global_url.append(href)
        else:
            # relative link: join it with the base url
            full_url = f'{first_url}/{href}'
            if full_url not in global_url:
                global_url.append(full_url)
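# Hypothetical examples, assuming -t https://site.com.br:
#   'https://site.com.br/contact' is kept as-is, 'https://other.com/x' is dropped,
#   and a relative 'about.html' becomes 'https://site.com.br/about.html'.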
# first request against the base url to seed the url list (fetch once, reuse the html)
first_html = resq_urls(first_url)
find_url(first_html)
find_href(first_html)
# collect all URLs from every page using multi-threading
def process_url(url):
    html = resq_urls(url)
    find_url(html)
    find_href(html)
    print(f'List URL: {url}', end='\r', flush=True)
    args.scrap_out.write(url + '\n')
with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_threads) as executor:
    urls_to_process = global_url.copy()
    future_to_url = {executor.submit(process_url, url): url for url in urls_to_process}
    concurrent.futures.wait(future_to_url)
# TODO: handle href values that contain http and https.
# TODO: consider keeping two lists, one with only the target domain and another with possible false positives.
for out in global_url:
    print(out)
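# Example invocation (illustrative values):
#   python3 scrap_forever.py -t https://site.com.br -o output.txt -p http://127.0.0.1:8080 -n 20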