Scrap_wayback.py
from wayback import WaybackClient
from bs4 import BeautifulSoup
from lib.imports import *
from sys import argv
import requests
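
# lib.imports is expected to provide the project helpers used below:
# getLastTimeStamps, getResponceFromRawUrl, getTableList, setJsonFile,
# setCsvFile, checkDatabaseError and getSnapshotFromTime.
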
def main():
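    """Scrape the Worldometers coronavirus table from Wayback Machine snapshots.

    Three modes, selected on the command line (see the usage text below):
    no arguments scrapes the last snapshot of every day; `-r`/`--raw-url`
    scrapes a single snapshot by its raw archive URL; `-d`/`--date` scrapes
    the first or last snapshot of a given date.
    """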
print("[!] Starting `Scrap_wayback.py`")
client = WaybackClient() # waybackmachine client
if len(argv) == 1: # python3 Scrap_wayback.py
print("[!] Getting all snapshots")
all_timestamps = list(client.search('https://www.worldometers.info/coronavirus'))
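        # client.search() yields one CDX record per capture, so this list can
        # contain several snapshots for the same day.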
print("[!] Getting all the last snapshots of each day")
last_timestamp_objects = getLastTimeStamps(all_timestamps)
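        # getLastTimeStamps (from lib.imports) keeps only the latest capture
        # of each day.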
        for i in last_timestamp_objects:
            print("[!] Getting the `file_name` of the old data")
            print("[!] `file_name`: " + "data" + i.timestamp.strftime('-%d-%m-%Y'))
            file_name = "data" + i.timestamp.strftime('-%d-%m-%Y')
            print("[!] Getting the `raw_url` of the old data")
            print("[!] `raw_url`: " + str(i.raw_url))
            print("[!] `view_url`: " + str(i.view_url))
            raw_url = i.raw_url
            print("[!] Sending the HTTP request to `raw_url`")
            response = getResponceFromRawUrl(raw_url)
            print("[!] Getting the `raw_html` from the response")
            raw_html = response.text
            print("[!] Checking for database 1040 error from the Wayback Machine")
            # The archive sometimes serves an error page instead of the capture;
            # log it and move on to the next snapshot.
            check_for_server_error = raw_html.find('Database connection failed: Too many connections (1040)')
            if check_for_server_error >= 0:
                with open('scrap_error.txt', 'a') as f:
                    f.write(f"[!] Couldn't fetch {raw_url} because of database error 1040 (too many connections)\n")
                print("[!] 1040 Database Error")
                print("[!] Continuing without scraping this `raw_url`")
                continue
print("[!] Souping the `raw_html`")
soup = BeautifulSoup(raw_html, 'html.parser')
print("[!] Getting the table")
titles, data = getTableList(soup)
print("[!] Saving the files into .json formate")
setJsonFile(file_name, data)
print("[!] Saving the files into .csv formate")
setCsvFile(file_name, titles, data)
print("[!] Completed")
    elif (len(argv) == 3) and (argv[1] == '-r' or argv[1] == '--raw-url'):
        print("[!] Using the `raw_url` passed via `--raw-url` or `-r`")
        raw_url = argv[2]
        print("[!] Getting all snapshots")
        all_timestamps = list(client.search('https://www.worldometers.info/coronavirus'))
        print("[!] Finding the date and `view_url` of `raw_url` among the snapshots")
        date = view_url = None
        for i in all_timestamps:
            if i.raw_url == raw_url:
                date = i.timestamp.strftime('-%d-%m-%Y')
                view_url = i.view_url
                break
        if date is None:
            print("[!] No snapshot matches the given `raw_url`")
            return
        print("[!] `file_name`: " + "data" + date)
        file_name = "data" + date
        print("[!] `raw_url`: " + str(raw_url))
        print("[!] `view_url`: " + str(view_url))
print("[!] Getting the http request from `raw_url`")
response = getResponceFromRawUrl(raw_url)
print("[!] Getting the `raw_html` from the response")
raw_html = response.text
print("[!] Checking for database 1040 error from WayBackMachine")
checkDatabaseError(raw_html, raw_url)
print("[!] Souping the `raw_html`")
soup = BeautifulSoup(raw_html, 'html.parser')
print("[!] Getting the table")
titles, data = getTableList(soup)
print("[!] Saving the files into .json formate")
setJsonFile(file_name, data)
print("[!] Saving the files into .csv formate")
setCsvFile(file_name, titles, data)
print("[!] Completed")
    elif (len(argv) == 4) and (argv[1] == '-d' or argv[1] == '--date') and (argv[2] == 'last-snapshot' or argv[2] == 'first-snapshot'):
        print("[!] Using the `date` and `snapshot_time` from `--date` or `-d` and `last-snapshot` or `first-snapshot`")
        snapshot_time = argv[2]
        date = argv[3]
        print("[!] Getting all snapshots")
        all_timestamps = list(client.search('https://www.worldometers.info/coronavirus'))
        print(f"[!] Getting the `{snapshot_time}` of date `{date}`")
        raw_url, view_url = getSnapshotFromTime(snapshot_time, all_timestamps, date)
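        # getSnapshotFromTime (from lib.imports) returns the raw and view URLs
        # of the first or last capture taken on `date`.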
        date = '-' + date
        print("[!] `file_name`: " + "data" + date)
        file_name = "data" + date
        print("[!] `raw_url`: " + str(raw_url))
        print("[!] `view_url`: " + str(view_url))
        print("[!] Sending the HTTP request to `raw_url`")
        response = getResponceFromRawUrl(raw_url)
        print("[!] Getting the `raw_html` from the response")
        raw_html = response.text
        print("[!] Checking for database 1040 error from the Wayback Machine")
        checkDatabaseError(raw_html, raw_url)
        print("[!] Souping the `raw_html`")
        soup = BeautifulSoup(raw_html, 'html.parser')
        print("[!] Getting the table")
        titles, data = getTableList(soup)
        print("[!] Saving the data in .json format")
        setJsonFile(file_name, data)
        print("[!] Saving the data in .csv format")
        setCsvFile(file_name, titles, data)
        print("[!] Completed")
    else:
        print('''
Usage:
    python3 Scrap_wayback.py
    python3 Scrap_wayback.py [--help|-h]
    python3 Scrap_wayback.py [--raw-url|-r] <raw_url>
    python3 Scrap_wayback.py [--date|-d] <last-snapshot|first-snapshot> <date, e.g. 29-01-2020>
''')

if __name__ == "__main__":
    main()
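
# Example invocations (output file names follow the `file_name` stem built in
# main(); where the files land depends on setJsonFile/setCsvFile in lib.imports):
#   python3 Scrap_wayback.py
#   python3 Scrap_wayback.py -d last-snapshot 29-01-2020   # -> data-29-01-2020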