Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OPN-415: updates to url_database.py, bi-weekly generation #1137

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions bin/resource_management/data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.csv
1 change: 1 addition & 0 deletions bin/resource_management/email_message.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Automated email of the OGP Broken_Resource_Links report.
2 changes: 2 additions & 0 deletions bin/resource_management/recipients.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
recipients=''
venv=''
1 change: 1 addition & 0 deletions bin/resource_management/reports/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.csv
310 changes: 197 additions & 113 deletions bin/resource_management/url_database.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
Outputs status to 'url_database.csv'

Arguments:
fileinput - metadata file to be read ('od-do-canada.jsonl')
batch_size - maximum number of URL's to test in parallel
fileinput - metadata file to be read ('od-do-canada.jsonl'), or a previous url_database.csv whose 'N/A' responses should be retested
batch_size - INT, maximum number of URLs to test in parallel
filename - name of the CSV file to export
"""
import sys
import grequests
Expand All @@ -20,19 +21,21 @@

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

file = sys.argv[1]
batch_size = int(sys.argv[2])
# use a generic user agent to detect services that render information pages instead of the actual data when a web browser user visits
GENERIC_WEB_CLIENT_AGENT = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}

print(file, batch_size)
def create_database(filename):
    """Create (or truncate) the CSV database and write its header row.

    Arguments:
    filename - path of the CSV file to create
    """
    header = ("url", "date", "response", "content-type", "content-length")
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filename, "w") as f:
        csv.writer(f).writerow(header)

prev_i = 1
urls = set()
batch_urls = []
url_match = []
responses = []
date = []
ftp_urls = []
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36"}
def write_to_database(filename, rows):
    """Append rows to the CSV database.

    Arguments:
    filename - path of the CSV file to append to
    rows - iterable of row sequences, one per CSV line
    """
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(filename, "a") as f:
        csv.writer(f).writerows(rows)

def check_for_connection():
url = "https://www.google.ca"
Expand All @@ -45,110 +48,191 @@ def check_for_connection():
except (requests.ConnectionError, requests.Timeout) as exception:
print("No internet connection.")

print("Starting...")
print("Reading and testing URL's")

for i, dataset in enumerate(open(file), 1):
line = json.loads(dataset)
resources = line["resources"]
for l in range(len(resources)):
url = resources[l]["url"].encode('utf-8')
if url in urls:
def get_batch_response(batch_urls, i, prev_i):
    """Send HEAD requests for a batch of URLs through grequests.

    Arguments:
    batch_urls - URLs to test
    i - current dataset index (used for the progress message only)
    prev_i - dataset index of the previous batch (progress message only)

    Returns a list with one entry per URL: the response object (already
    closed), or the string "N/A" where grequests.map produced None.
    """
    sys.stderr.write("\r")
    sys.stderr.write("Testing Datasets {0} - {1}".format(prev_i, i))
    pending = (
        grequests.head(u, timeout=60, headers=GENERIC_WEB_CLIENT_AGENT,
                       verify=False, allow_redirects=True, stream=False)
        for u in batch_urls
    )
    results = []
    for resp in grequests.map(pending):
        if resp is None:
            results.append("N/A")
        else:
            resp.close()
            results.append(resp)
    return results

def get_batch_response_ftp(batch_urls):
    """Send a HEAD request to each FTP URL, one at a time.

    Arguments:
    batch_urls - FTP URLs to test

    Returns (ftp_responses, ftp_dates), two lists parallel to batch_urls:
    the response object or the string "N/A" when the request failed, and
    the UTF-8 encoded timestamp taken just before each request.
    """
    ftp_responses = []
    ftp_dates = []
    # NOTE(review): presumably this patches requests.Session so that it
    # accepts ftp:// URLs - confirm against the requests_ftp documentation.
    requests_ftp.monkeypatch_session()
    total = len(batch_urls)
    # Count from 1 so the progress line ends at "N of N" instead of "N-1 of N".
    for position, url in enumerate(batch_urls, 1):
        sys.stderr.write("\r")
        sys.stderr.write("Testing FTP {0} of {1}".format(position, total))
        dt_string = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        ftp_dates.append(dt_string.encode('utf-8'))
        s = requests.Session()
        try:
            resp = s.head(url, timeout=120, headers=GENERIC_WEB_CLIENT_AGENT,
                          verify=False, allow_redirects=True, stream=False)
        except (requests.exceptions.RequestException, UnicodeEncodeError):
            resp = None
        finally:
            # Close the session on failure too; previously it was only
            # closed after a successful request.
            s.close()
        ftp_responses.append("N/A" if resp is None else resp)
    return ftp_responses, ftp_dates

def get_batch_content(batch_response):
    """Collect Content-Length and Content-Type for a batch of responses.

    Arguments:
    batch_response - list of response objects exposing a `headers` mapping,
                     or the string "N/A" for URLs that gave no response

    Returns (content_lengths, content_types), two lists parallel to
    batch_response. Entries are 'N/A' when there was no response, and the
    UTF-8 encoded header value (or encoded 'N/A' if the header is absent)
    otherwise.
    """
    content_lengths = []
    content_types = []
    for r in batch_response:
        # Compare against the sentinel by equality. A containment test
        # (`"N/A" in r`) on a response object iterates the object instead
        # of checking whether it is the placeholder.
        if r == "N/A":
            content_lengths.append('N/A')
            content_types.append('N/A')
            continue
        cl = r.headers.get("Content-Length")
        ct = r.headers.get("Content-Type")
        if cl is None:
            cl = 'N/A'
        if ct is None:
            ct = 'N/A'
        content_lengths.append(cl.encode('utf-8'))
        content_types.append(ct.encode('utf-8'))
    return content_lengths, content_types

def main(import_file, batch_size, filename):
    """Test every distinct resource URL in a JSONL metadata file.

    Arguments:
    import_file - JSONL file with one dataset per line; each dataset has a
                  "resources" list whose items carry a "url"
    batch_size - number of HTTP URLs to collect before testing them together
    filename - CSV file to create and fill with the results

    HTTP URLs are tested in batches of batch_size and written to the CSV
    as each batch completes. FTP URLs are collected and tested last.
    """
    prev_i = 1
    # Pre-set so the progress arguments below exist even for an empty file.
    i = 0
    all_urls = set()
    ftp_urls = []
    batch_urls = []
    batch_dates = []

    create_database(filename)

    # The with-block guarantees the metadata file is closed when done.
    with open(import_file) as datasets:
        for i, dataset in enumerate(datasets, 1):
            line = json.loads(dataset, "utf-8")
            for resource in line["resources"]:
                url = str(resource["url"].encode('utf-8'))
                if url in all_urls:
                    continue
                all_urls.add(url)
                if 'ftp://' in url:
                    ftp_urls.append(url)
                    continue
                batch_urls.append(url)
                dt_string = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                batch_dates.append(dt_string.encode('utf-8'))
                if len(batch_urls) == batch_size:
                    check_for_connection()
                    batch_response = get_batch_response(batch_urls, i, prev_i)
                    prev_i = i
                    batch_cl, batch_ct = get_batch_content(batch_response)
                    # Column order matches the header: url, date, response,
                    # content-type, content-length.
                    rows = zip(batch_urls, batch_dates, batch_response,
                               batch_ct, batch_cl)
                    write_to_database(filename, rows)
                    batch_urls = []
                    batch_dates = []

    # Test the URLs left over after the last full batch.
    if batch_urls:
        check_for_connection()
        batch_response = get_batch_response(batch_urls, i, prev_i)
        batch_cl, batch_ct = get_batch_content(batch_response)
        rows = zip(batch_urls, batch_dates, batch_response, batch_ct, batch_cl)
        write_to_database(filename, rows)

    # Test the FTP links.
    if ftp_urls:
        check_for_connection()
        ftp_batch_response, ftp_dates = get_batch_response_ftp(ftp_urls)
        batch_cl, batch_ct = get_batch_content(ftp_batch_response)
        rows = zip(ftp_urls, ftp_dates, ftp_batch_response, batch_ct, batch_cl)
        write_to_database(filename, rows)

def retest(import_file, batch_size, filename):
    """Re-test the URLs recorded with an 'N/A' response in an earlier run.

    Arguments:
    import_file - CSV produced earlier (columns: url, date, response,
                  content-type, content-length)
    batch_size - number of HTTP URLs to collect before testing them together
    filename - CSV file to create and fill with the results

    Rows whose response column does not contain 'N/A' are copied to the new
    file unchanged. The other URLs are tested again and written with fresh
    results, HTTP URLs in batches and FTP URLs last.
    """
    prev_i = 1
    # Pre-set so the progress arguments below exist even without data rows.
    i = 0
    ftp_urls = []
    batch_urls = []
    batch_dates = []

    create_database(filename)

    with open(import_file, "r") as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header row. The default stops an empty file from
        # raising StopIteration.
        next(reader, None)
        for i, row in enumerate(reader, 1):
            if not row:
                # Blank line in the CSV: nothing to copy or test.
                continue
            url = row[0]
            response = row[2]
            if 'N/A' not in response:
                write_to_database(filename, [row])
                continue
            if 'ftp://' in url:
                ftp_urls.append(url)
                continue
            batch_urls.append(url)
            dt_string = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            batch_dates.append(dt_string.encode('utf-8'))
            if len(batch_urls) == batch_size:
                check_for_connection()
                batch_response = get_batch_response(batch_urls, i, prev_i)
                prev_i = i
                batch_cl, batch_ct = get_batch_content(batch_response)
                # Column order matches the header: url, date, response,
                # content-type, content-length.
                rows = zip(batch_urls, batch_dates, batch_response,
                           batch_ct, batch_cl)
                write_to_database(filename, rows)
                batch_urls = []
                batch_dates = []

    # Test the URLs left over after the last full batch.
    if batch_urls:
        check_for_connection()
        batch_response = get_batch_response(batch_urls, i, prev_i)
        batch_cl, batch_ct = get_batch_content(batch_response)
        rows = zip(batch_urls, batch_dates, batch_response, batch_ct, batch_cl)
        write_to_database(filename, rows)

    # Test the FTP links.
    if ftp_urls:
        check_for_connection()
        ftp_batch_response, ftp_dates = get_batch_response_ftp(ftp_urls)
        batch_cl, batch_ct = get_batch_content(ftp_batch_response)
        rows = zip(ftp_urls, ftp_dates, ftp_batch_response, batch_ct, batch_cl)
        write_to_database(filename, rows)

if __name__ == '__main__':
    # Expected arguments: <import_file> <batch_size> <filename>
    if len(sys.argv) < 4:
        sys.stderr.write(
            "Usage: python url_database.py <import_file> <batch_size> <filename>\n")
        sys.exit(1)
    import_file = sys.argv[1]
    batch_size = int(sys.argv[2])
    filename = sys.argv[3]
    print(import_file, batch_size, filename)
    # A JSONL metadata file starts a full run; a CSV from an earlier run
    # triggers a retest of its 'N/A' rows.
    if 'jsonl' in import_file or '.jl' in import_file:
        main(import_file, batch_size, filename)
    elif '.csv' in import_file:
        retest(import_file, batch_size, filename)
    else:
        # Previously an unrecognised input type silently did nothing.
        sys.stderr.write(
            "Unsupported import file (expected .jsonl, .jl or .csv): {0}\n".format(import_file))
        sys.exit(1)

requests_ftp.monkeypatch_session()

for i, url in enumerate(ftp_urls):
sys.stderr.write("\r")
sys.stderr.write("Testing FTP {0} of {1}".format(i, len(ftp_urls)))
s = requests.Session()
try:
resp = s.head(url,timeout=120, headers=headers, verify=False, allow_redirects=True, stream=False)
now = datetime.now()
dt_string = now.strftime("%Y-%m-%d %H:%M:%S")
date.append(dt_string.encode('utf-8'))
ftp_batch.append(url)
ftp_response.append(resp)
if not resp is None:
s.close()
if i%batch_size == 0:
check_for_connection()
except requests.exceptions.RequestException as e:
print str(e)
ftp_batch.append(url)
ftp_response.append(None)
continue

responses.append(ftp_response)
url_match.append(ftp_batch)

print("Fetching content data...")

responses = sum(responses, [])
url_match = sum(url_match, [])
content_lengths = []
content_types = []
for z, r in enumerate(responses):
if r is None:
content_lengths.append('N/A')
content_types.append('N/A')
responses[z] = 'N/A'
else:
cl = r.headers.get("Content-Length")
ct = r.headers.get("Content-Type")
if cl is None:
cl = 'N/A'
if ct is None:
ct = 'N/A'
content_lengths.append(cl.encode('utf-8'))
content_types.append(ct.encode('utf-8'))

print("Exporting to csv...")
rows = zip(url_match, date, responses, content_types, content_lengths)

with open('url_database.csv', "w") as f:
writer = csv.writer(f)
writer.writerow(("url", "date", "response", "content-type",
"content-length"))
for row in rows:
writer.writerow(row)
f.close()
print("Done.")
14 changes: 14 additions & 0 deletions bin/resource_management/url_database_script.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Downloads the od-do-canada metadata, tests every resource URL, retests the
# 'N/A' responses, builds the broken-links report and mails it.
#
# Usage: url_database_script.sh <dir>
#   <dir> - directory holding url_database.py, url_metadata_match.py,
#           recipients.config and email_message.txt
dir=$1
if [ -z "$dir" ]; then
    echo "Usage: $0 <dir>" >&2
    exit 1
fi
# Stop if the directory is wrong: the rm calls below must not run elsewhere.
cd "$dir" || exit 1
# recipients.config sets $recipients (mail addressees) and $venv (file to source).
. ./recipients.config || exit 1
# $venv may be left empty in recipients.config; only source it when set.
if [ -n "$venv" ]; then
    source "$venv" || exit 1
fi
# From here on, abort at the first failing step so that no report is mailed
# after a failed download or test run.
set -e
# -O overwrites a stale download instead of saving to od-do-canada.jl.gz.1
wget -O od-do-canada.jl.gz http://open.canada.ca/static/od-do-canada.jl.gz
# -f overwrites an od-do-canada.jl left behind by an aborted earlier run
gzip -df od-do-canada.jl.gz
python url_database.py od-do-canada.jl 500 data/url_database_draft.csv
python url_database.py data/url_database_draft.csv 200 data/url_database.csv
rm data/url_database_draft.csv
python url_metadata_match.py od-do-canada.jl data/url_database.csv
rm od-do-canada.jl
the_date=$(date +"%Y-%m-%d")
# $recipients is intentionally unquoted so several addresses split into separate arguments.
mail -s "Broken_Resource_Links-$the_date.xlsx" -a "reports/Broken_Resource_Links-$the_date.xlsx" $recipients < email_message.txt
Loading