OPN-415: updates to url_database.py, bi-weekly generation #1137

Open · wants to merge 9 commits into base: master
1 change: 1 addition & 0 deletions bin/resource_management/data/.gitignore
@@ -0,0 +1 @@
*.csv
1 change: 1 addition & 0 deletions bin/resource_management/reports/.gitignore
@@ -0,0 +1 @@
*.csv
312 changes: 198 additions & 114 deletions bin/resource_management/url_database.py
@@ -5,8 +5,9 @@
Outputs status to 'url_database.csv'

Arguments:
fileinput - metadata file to be read ('od-do-canada.jsonl')
batch_size - maximum number of URL's to test in parallel
fileinput - metadata file to be read ('od-do-canada.jsonl') or retest 'N/A' responses from previous url_database.csv
batch_size - INT maximum number of URL's to test in parallel
filename - name of file to export
"""
import sys
import grequests
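
As an illustrative aside (not part of this diff), the three positional arguments documented in the docstring above can be supplied the same way the shell wrapper added below does; the metadata path, batch size, and output path here are copied from url_database_script.sh and are examples only.

import subprocess

# Run a fresh pass over the metadata dump; passing a previous url_database.csv
# as the first argument would instead retest its 'N/A' rows.
subprocess.check_call([
    "python", "url_database.py",
    "od-do-canada.jl",              # fileinput: metadata file, or a prior url_database.csv to retest
    "500",                          # batch_size: maximum number of URLs tested in parallel per batch
    "data/url_database_draft.csv",  # filename: CSV file the results are written to
])
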
@@ -20,19 +21,19 @@

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

file = sys.argv[1]
batch_size = int(sys.argv[2])
def create_database(filename):
with open(filename, "w") as f:
writer = csv.writer(f)
writer.writerow(("url", "date", "response",
"content-type", "content-length"))
f.close()

print(file, batch_size)

prev_i = 1
urls = set()
batch_urls = []
url_match = []
responses = []
date = []
ftp_urls = []
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
def write_to_database(filename, rows):
with open(filename, "a") as f:
writer = csv.writer(f)
for row in rows:
writer.writerow(row)
maxFredenburgh marked this conversation as resolved.
f.close()

def check_for_connection():
url = "https://www.google.ca"
@@ -45,110 +46,193 @@ def check_for_connection():
except (requests.ConnectionError, requests.Timeout) as exception:
print("No internet connection.")

print("Starting...")
print("Reading and testing URL's")

for i, dataset in enumerate(open(file), 1):
line = json.loads(dataset)
resources = line["resources"]
for l in range(len(resources)):
url = resources[l]["url"].encode('utf-8')
if url in urls:
def get_batch_response(batch_urls, i, prev_i):
sys.stderr.write("\r")
sys.stderr.write("Testing Datasets {0} - {1}".format(prev_i, i))
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
rs = (grequests.head(u, timeout=60, headers=headers, verify=False, allow_redirects=True, stream=False) for u in
batch_urls)

Review comment (Member): verify=False?
Reply (@maxFredenburgh, Contributor Author, Sep 15, 2020): To prevent request.exceptions.SSLError
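
As the reply notes, verify=False skips TLS certificate validation so hosts with expired or self-signed certificates still return a status code instead of raising requests.exceptions.SSLError; the disable_warnings() call at the top of the module then silences the InsecureRequestWarning that urllib3 emits for every unverified request. A minimal standalone sketch of that pattern (illustrative only; the sample URLs are placeholders and grequests must be installed):

import grequests  # import before requests so gevent can patch sockets first
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Silence the warning urllib3 raises for every request made with verify=False.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

sample_urls = ["https://self-signed.badssl.com/", "https://open.canada.ca/"]  # placeholder URLs
pending = (grequests.head(u, timeout=60, verify=False, allow_redirects=True) for u in sample_urls)
for response in grequests.map(pending):
    # grequests.map() returns None for requests that failed outright (DNS errors, timeouts, ...)
    print("N/A" if response is None else response.status_code)
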
batch_response = grequests.map(rs)
for j, r in enumerate(batch_response):
if not r is None:
r.close()
else:
batch_response[j] = "N/A"
return batch_response

def get_batch_response_ftp(batch_urls):
ftp_responses = []
ftp_dates = []
requests_ftp.monkeypatch_session()
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
maxFredenburgh marked this conversation as resolved.
for i, url in enumerate(batch_urls):
sys.stderr.write("\r")
sys.stderr.write("Testing FTP {0} of {1}".format(i, len(batch_urls)))
s = requests.Session()
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d %H:%M:%S")
ftp_dates.append(dt_string.encode('utf-8'))
try:
resp = s.head(url, timeout=120, headers=headers, verify=False, allow_redirects=True, stream=False)

if not resp is None:
s.close()
else:
resp = "N/A"
ftp_responses.append(resp)
except requests.exceptions.RequestException as e:
ftp_responses.append("N/A")
continue
except UnicodeEncodeError as e:
ftp_responses.append("N/A")
continue
elif 'ftp://' in url:
ftp_urls.append(url)
return ftp_responses, ftp_dates

def get_batch_content(batch_response):
content_lengths = []
content_types = []
for r in batch_response:
if "N/A" in r:
content_lengths.append('N/A')
content_types.append('N/A')
else:
urls.add(url)
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d %H:%M:%S")
date.append(dt_string.encode('utf-8'))
batch_urls.append(url)

cl = r.headers.get("Content-Length")
ct = r.headers.get("Content-Type")
if cl is None:
cl = 'N/A'
if ct is None:
ct = 'N/A'
content_lengths.append(cl.encode('utf-8'))
content_types.append(ct.encode('utf-8'))
return content_lengths, content_types

def main(import_file, batch_size, filename):
# set local vars
prev_i = 1
all_urls = set()
ftp_urls = []
# batch lists
batch_urls = []
batch_dates = []
# create url_database file
create_database(filename)
# Open JSONL and retrieve urls
for i, dataset in enumerate(open(import_file), 1):
line = json.loads(dataset, "utf-8")
resources = line["resources"]
for l in range(len(resources)):
# append urls to temp list
url = resources[l]["url"]
url = str(url.encode('utf-8'))
if url in all_urls:
continue
elif 'ftp://' in url:
all_urls.add(url)
ftp_urls.append(url)
else:
all_urls.add(url)
batch_urls.append(url)
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d %H:%M:%S")
batch_dates.append(dt_string.encode('utf-8'))
# when len(list) == batch_size
if len(batch_urls) == batch_size:
url_match.append(batch_urls)
sys.stderr.write("\r")
sys.stderr.write("Testing Datasets {0} - {1}"
.format(prev_i, i))
check_for_connection()
# get batch_responses as list
batch_response = get_batch_response(batch_urls, i, prev_i)
prev_i = i
rs = (grequests.head(u, timeout=120, headers=headers, verify=False, allow_redirects=True, stream=False) for u in batch_urls)
batch_response = grequests.map(rs)
responses.append(batch_response)
for r in batch_response:
if not r is None:
r.close()
# get content-length and content-type from responses list
batch_cl , batch_ct = get_batch_content(batch_response)
# zip urls, dates, responses, content-type, content-length
rows = zip(batch_urls, batch_dates, batch_response, batch_ct, batch_cl)
# write rows to csv
write_to_database(filename, rows)
# clear lists and repeat
batch_urls = []
check_for_connection()

# Check last urls not covered in loop
url_match.append(batch_urls)
sys.stderr.write("\r")
sys.stderr.write("Testing Datasets {0} - {1}".format(prev_i, i))
rs = (grequests.head(u, timeout=120, headers=headers, verify=False, allow_redirects=True, stream=False) for u in batch_urls)
batch_response = grequests.map(rs)
responses.append(batch_response)
for r in batch_response:
if not r is None:
r.close()
batch_dates = []

# Get response info for last URLS not included in final batch
check_for_connection()
batch_response = get_batch_response(batch_urls, i, prev_i)
batch_cl, batch_ct = get_batch_content(batch_response)
rows = zip(batch_urls, batch_dates, batch_response, batch_ct, batch_cl)
write_to_database(filename, rows)
batch_urls = []
batch_dates = []

# get response info for FTP links
check_for_connection()
ftp_batch_response, ftp_dates = get_batch_response_ftp(ftp_urls)
batch_cl, batch_ct = get_batch_content(ftp_batch_response)
rows = zip(ftp_urls, ftp_dates, ftp_batch_response, batch_ct, batch_cl)
write_to_database(filename, rows)

def retest(import_file, batch_size, filename):
create_database(filename)
with open(import_file, "r") as csvfile:
reader = csv.reader(csvfile)
next(reader)
prev_i = 1
ftp_urls = []
# batch lists
batch_urls = []
batch_dates = []
for i, row in enumerate(reader):
url = row[0]
date = row[1]
response = row[2]
content_length = row[3]
content_type = row[4]
if 'N/A' in response:
if 'ftp://' in url:
ftp_urls.append(url)
else:
batch_urls.append(url)
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d %H:%M:%S")
batch_dates.append(dt_string.encode('utf-8'))
# when len(list) == batch_size
if len(batch_urls) == batch_size:
check_for_connection()
# get batch_responses as list
batch_response = get_batch_response(batch_urls, i, prev_i)
prev_i = i
# get content-length and content-type from responses list
batch_cl, batch_ct = get_batch_content(batch_response)
# zip urls, dates, responses, content-type, content-length
rows = zip(batch_urls, batch_dates, batch_response, batch_ct, batch_cl)
# write rows to csv
write_to_database(filename, rows)
# clear lists and repeat
batch_urls = []
batch_dates = []
else:
write_to_database(filename,[row])
csvfile.close()
# Get response info for last URLS not included in final batch
check_for_connection()
batch_response = get_batch_response(batch_urls, i, prev_i)
batch_cl, batch_ct = get_batch_content(batch_response)
rows = zip(batch_urls, batch_dates, batch_response, batch_ct, batch_cl)
write_to_database(filename, rows)
batch_urls = []
batch_dates = []

# get response info for FTP links
check_for_connection()
ftp_batch_response, ftp_dates = get_batch_response_ftp(ftp_urls)
batch_cl, batch_ct = get_batch_content(ftp_batch_response)
rows = zip(ftp_urls, ftp_dates, ftp_batch_response, batch_ct, batch_cl)
write_to_database(filename, rows)

if __name__ == '__main__':
import_file = sys.argv[1]
batch_size = int(sys.argv[2])
filename = sys.argv[3]
print(import_file, batch_size, filename)
if 'jsonl' in import_file or '.jl' in import_file:
main(import_file, batch_size, filename)
elif '.csv' in import_file:
retest(import_file, batch_size, filename)

#Testing FTP urls
ftp_batch = []
ftp_response = []

requests_ftp.monkeypatch_session()

for i, url in enumerate(ftp_urls):
sys.stderr.write("\r")
sys.stderr.write("Testing FTP {0} of {1}".format(i, len(ftp_urls)))
s = requests.Session()
try:
resp = s.head(url,timeout=120, headers=headers, verify=False, allow_redirects=True, stream=False)
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d %H:%M:%S")
date.append(dt_string.encode('utf-8'))
ftp_batch.append(url)
ftp_response.append(resp)
if not resp is None:
s.close()
if i%batch_size == 0:
check_for_connection()
except requests.exceptions.RequestException as e:
print str(e)
ftp_batch.append(url)
ftp_response.append(None)
continue

responses.append(ftp_response)
url_match.append(ftp_batch)

print("Fetching content data...")

responses = sum(responses, [])
url_match = sum(url_match, [])
content_lengths = []
content_types = []
for z, r in enumerate(responses):
if r is None:
content_lengths.append('N/A')
content_types.append('N/A')
responses[z] = 'N/A'
else:
cl = r.headers.get("Content-Length")
ct = r.headers.get("Content-Type")
if cl is None:
cl = 'N/A'
if ct is None:
ct = 'N/A'
content_lengths.append(cl.encode('utf-8'))
content_types.append(ct.encode('utf-8'))

print("Exporting to csv...")
rows = zip(url_match, date, responses, content_types, content_lengths)

with open('url_database.csv', "w") as f:
writer = csv.writer(f)
writer.writerow(("url", "date", "response", "content-type",
"content-length"))
for row in rows:
writer.writerow(row)
f.close()
print("Done.")
8 changes: 8 additions & 0 deletions bin/resource_management/url_database_script.sh
@@ -0,0 +1,8 @@
#!/bin/bash
wget http://open.canada.ca/static/od-do-canada.jl.gz
gzip -d od-do-canada.jl.gz
python url_database.py od-do-canada.jl 500 data/url_database_draft.csv
python url_database.py data/url_database_draft.csv 200 data/url_database.csv
rm data/url_database_draft.csv
python url_metadata_match.py od-do-canada.jl data/url_database.csv
rm od-do-canada.jl
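
The wrapper runs url_database.py twice: the first pass builds data/url_database_draft.csv from the metadata dump, and the second pass retests the rows whose response came back as 'N/A' before writing the final data/url_database.csv. A small illustrative sketch (not part of this PR) of inspecting the draft between the two passes, assuming the column order written by create_database() (url, date, response, content-type, content-length):

import csv

# Count the rows the second pass will retest, i.e. those whose response is 'N/A'.
with open("data/url_database_draft.csv", "r") as f:
    reader = csv.reader(f)
    next(reader)  # skip the header row
    retest_rows = [row for row in reader if "N/A" in row[2]]

print("{0} URLs had no response and will be retested".format(len(retest_rows)))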