"""
Fetch archival copies of a webpage from the internet archive.
First, fetch them with this file, then run "run_wayback" in get_us_fire_data to parse these files,
and put them into the datastore in json.
Note: I'm currently only downloading ones I don't have, but some of the ones I already have are california
only. I should delete those, and redownload. They can be spotted by being much smaller than the other files.
"""
import datetime
import json
import os
import urllib.parse
from time import sleep

import requests

import get_ca_fire_data
import get_us_fire_data
from data_store import DataStore


def fetch(url):
    """
    :param url: The URL to fetch data from.
    :return: The contents of the webpage, or None if the request failed.
    """
    try:
        response = requests.get(url)
        if response.status_code != 200:
            print("Request failed: ", response.status_code)
            return None
    except requests.exceptions.RequestException as e:
        print(F'Got an error: {e}')
        return None
    except Exception as e:
        print(F'Other exception: {e}')
        return None
    print(F"HTTP request succeeded. Response size={len(response.content)}")
    return response.content


def is_available(url):
    """
    Ask the Wayback Machine's availability API whether a snapshot of url exists, and print the response.
    :return: The raw JSON response body, or None if the request failed.
    """
    wayback_url = "http://archive.org/wayback/available?url="
    encoded = urllib.parse.quote_plus(url)
    full_url = wayback_url + encoded
    print(full_url)
    data = fetch(full_url)
    if data is not None:
        print(data)
        values = json.loads(data)
        print(json.dumps(values, indent=4))
    return data
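

# For reference, the availability API's JSON response typically looks roughly like this
# (field names per archive.org's "wayback/available" endpoint; the values below are only illustrative):
#   {"archived_snapshots": {"closest": {"available": true, "status": "200",
#       "url": "http://web.archive.org/web/20210603183442/https://inciweb.nwcg.gov/accessible-view/",
#       "timestamp": "20210603183442"}}}
# A minimal sketch, not part of the original workflow, for pulling out the closest snapshot URL:
def closest_snapshot_url(url):
    data = is_available(url)
    if data is None:
        return None
    values = json.loads(data)
    closest = values.get("archived_snapshots", {}).get("closest")
    return closest.get("url") if closest else None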


def run_parsing(archive_directory, data_store, parse):
    """
    Take already-downloaded snapshots of a webpage, extract the fire data from each one using parse(),
    and save it to the datastore, but only for dates that are not already present.
    :return: The number of snapshot files whose data was written to the datastore.
    """
    files = os.listdir(archive_directory)
    counter = 0
    count_nr = 0
    for filename in files:
        if not filename.endswith(".html"):
            continue
        # Filenames look like firedata_YYYY_MM_DD.html (see fetch_all_snapshots).
        parts = filename[:-len(".html")].split('_')
        assert len(parts) == 4
        year = int(parts[1])
        month = int(parts[2])
        day = int(parts[3])
        assert 1900 < year < 2100 and 1 <= month <= 12 and 1 <= day <= 31
        path = os.path.join(archive_directory, filename)
        with open(path) as f:
            archive = f.read()
        fire_data = parse(archive)
        # Tag each entry with some metadata, so we can find it again if we need to redo things.
        if isinstance(fire_data, dict):
            fire_data['_source'] = path
        elif isinstance(fire_data, list):
            # Problem: We use href as the unique ID for a fire. It turns out not all fires have a
            # unique href (some are "/incident//" instead of the usual "/incident/1234").
            for incident in fire_data:
                incident['_source'] = path
                # We use the href as a unique identifier. We want the original href, not the Internet Archive version.
                href: str = incident['href']
                assert href.startswith("/web/")
                index = href.find("/incident/")
                assert index != -1
                href = href[index:]
                incident['href'] = href
        # print(json.dumps(fire_data, indent=4))
        data_date = datetime.date(year, month, day)
        already_exists = data_store.does_data_exist(data_date)  # TODO Move this check up, before parsing.
        if already_exists:
            print(F"Skipping {path}")
        else:
            print(F"Writing from {filename}")
            data_store.save_date_data(data_date, fire_data)
            counter += 1
    print()
    return counter


def fetch_all_snapshots(archive_dir, wayback_filename, target_url):
    """
    Fetch all of the Wayback Machine snapshots of target_url listed in wayback_filename.
    :return: (pages_downloaded, pages_failed, pages_skipped)
    """
    # Read the list of snapshots.
    with open(wayback_filename) as f:
        data = f.read()
    url_template = "http://web.archive.org/web/{timestamp}/{target_url}"
    snapshots = data.split("\n")
    pages_downloaded = 0
    pages_failed = 0
    pages_skipped = 0
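    # Each line of the snapshot list is one record from the Wayback CDX API (see the curl command in
    # fetch_us). As far as I know, the default CDX output is space-separated roughly as follows
    # (the values here are only illustrative):
    #   gov,nwcg,inciweb)/accessible-view 20210603183442 https://inciweb.nwcg.gov/accessible-view/ text/html 200 DIGEST 12345
    # Only the second field (the 14-digit timestamp) is used below.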
    for snapshot in snapshots:
        fields = snapshot.split()
        if len(fields) < 2:
            print("Bad fields. End of data?")
            break
        date_string = fields[1]
        assert 14 == len(date_string)
        ymd = date_string[:8]
        year = int(date_string[:4])
        month = int(date_string[4:6])
        day = int(date_string[6:8])
        assert 1900 < year < 2100 and 1 <= month <= 12 and 1 <= day <= 31
        date_of_fire = datetime.date(year, month, day)
        filename = F"firedata_{year}_{month:02}_{day:02}.html"
        path = os.path.join(archive_dir, filename)
        if os.path.exists(path):
            print("Not replacing ", path)
            pages_skipped += 1
            continue
        else:
            print("Downloading for ", path)
        url = url_template.format(timestamp=date_string, target_url=target_url)
        print(url)
        page = fetch(url)
        if page is None:
            print("Fetching the above url failed.")
            pages_failed += 1
            continue
        pages_downloaded += 1
        with open(path, "wb") as f:
            f.write(page)
        print("Page saved")
        sleep(2)  # Be polite to the Wayback Machine between requests.
    return pages_downloaded, pages_failed, pages_skipped


def fetch_us():
    # I fetched the snapshot data manually and saved it to wayback_us_snapshots.txt:
    #   curl http://web.archive.org/cdx/search/cdx?url=inciweb.nwcg.gov/accessible-view/
    archive_dir = get_us_fire_data.get_archive_directory()
    pages, pages_failed, pages_skipped = fetch_all_snapshots(
        archive_dir, "wayback_us_snapshots.txt", target_url='https://inciweb.nwcg.gov/accessible-view/')
    print(F"{pages} pages downloaded, {pages_failed} pages failed to download. {pages_skipped} pages skipped. "
          F"Total: {pages + pages_failed + pages_skipped}.")
    return pages, pages_failed, pages_skipped
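

# A sketch (not part of the original workflow) of how the snapshot list could be refreshed from Python
# instead of with curl. The CDX URL mirrors the curl command in fetch_us(); the default output filename
# is an assumption chosen to match what fetch_us() reads.
def refresh_us_snapshot_list(filename="wayback_us_snapshots.txt"):
    cdx_url = "http://web.archive.org/cdx/search/cdx?url=inciweb.nwcg.gov/accessible-view/"
    data = fetch(cdx_url)
    if data is None:
        return False
    with open(filename, "wb") as f:
        f.write(data)
    return True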


def fetch_ca():
    archive_dir = get_ca_fire_data.get_archive_directory()
    pages, pages_failed, pages_skipped = fetch_all_snapshots(
        archive_dir, "wayback_ca_snapshots.txt", target_url='https://www.fire.ca.gov/umbraco/Api/IncidentApi/GetIncidents')
    print(F"{pages} pages downloaded, {pages_failed} pages failed to download. {pages_skipped} pages skipped. "
          F"Total: {pages + pages_failed + pages_skipped}.")
    return pages, pages_failed, pages_skipped


def update_from_archive_us():
    archive_dir = get_us_fire_data.get_archive_directory()
    data_store = get_us_fire_data.get_data_store()
    counter = run_parsing(archive_dir, data_store, get_us_fire_data.parse)
    print(F"Updated {counter} records.")


def update_from_archive_ca():
    archive_dir = get_ca_fire_data.get_archive_directory()
    data_store = get_ca_fire_data.get_data_store()
    counter = run_parsing(archive_dir, data_store, get_ca_fire_data.parse)
    print(F"Updated {counter} records.")


if __name__ == "__main__":
    fetch_us()
    update_from_archive_us()
    fetch_ca()
    # Note: data/data_cal/source/firedata_2021_04_03.html is in XML format. Don't download it;
    # I made a zero-length placeholder version of it instead.
    update_from_archive_ca()

    # one_snapshot_url = "http://web.archive.org/web/20210603183442/https://inciweb.nwcg.gov/accessible-view/"
    # page = fetch(one_snapshot_url)
    # print(page)