Skip to content

Commit

Permalink
Handle different types of links
Browse files Browse the repository at this point in the history
  • Loading branch information
hancush committed Oct 25, 2021
1 parent 443ff63 commit 938e7ba
Show file tree
Hide file tree
Showing 4 changed files with 59 additions and 11 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
.DS_Store
scripts/__pycache__
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@ services:
- .:/app
environment:
S3_BUCKET_NAME: datamade-metro-pdf-merger-testing
attachment_links: '["https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID=7916&GUID=LATEST&Title=Board+Report", "http://metro.legistar1.com/metro/attachments/d368424c-b80c-4f9a-aa1d-d353194ee733.pdf", "http://metro.legistar1.com/metro/attachments/f4031730-38c1-48a3-a789-09a3f5c5862a.pdf", "http://metro.legistar1.com/metro/attachments/53d3670b-3aa3-4823-ac17-51e032395641.pdf", "http://metro.legistar1.com/metro/attachments/53985307-4ce2-4688-83e0-42c4c7a17f0e.pdf", "http://metro.legistar1.com/metro/attachments/c96860a8-a26d-4022-9b6c-ca010c3d165e.docx"]'
attachment_links: '["https://metro.legistar1.com/metro/meetings/2021/10/2100_A_Operations%2C_Safety%2C_and_Customer_Experience_Committee_21-10-21_Agenda.pdf", "https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID=7916&GUID=LATEST&Title=Board+Report", "http://metro.legistar1.com/metro/attachments/d368424c-b80c-4f9a-aa1d-d353194ee733.pdf", "http://metro.legistar1.com/metro/attachments/f4031730-38c1-48a3-a789-09a3f5c5862a.pdf", "http://metro.legistar1.com/metro/attachments/53d3670b-3aa3-4823-ac17-51e032395641.pdf", "http://metro.legistar1.com/metro/attachments/53985307-4ce2-4688-83e0-42c4c7a17f0e.pdf", "http://metro.legistar1.com/metro/attachments/c96860a8-a26d-4022-9b6c-ca010c3d165e.docx"]'
command: make merged/2021-0530.pdf
52 changes: 52 additions & 0 deletions scripts/document_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os
import re
from urllib.parse import urlparse, parse_qs

import scrapelib


def Link(url):
    '''
    Build the link object appropriate for the given URL.

    URLs that begin with the Legistar "ViewReport.ashx" endpoint are wrapped
    in a BoardReportLink; every other URL is wrapped in a plain DocumentLink.
    '''
    board_report_prefix = 'https://metro.legistar.com/ViewReport.ashx'

    if url.startswith(board_report_prefix):
        link_class = BoardReportLink
    else:
        link_class = DocumentLink

    return link_class(url)


class DocumentLink(object):
    '''
    A document that can be downloaded from a URL and saved under a filename
    derived from that URL. Example URLs:

    https://metro.legistar1.com/metro/meetings/2021/10/2100_A_Operations%2C_Safety%2C_and_Customer_Experience_Committee_21-10-21_Agenda.pdf
    http://metro.legistar1.com/metro/attachments/6484cdc7-3c2a-4598-abc5-9d846771158e.pdf
    '''
    def __init__(self, url):
        self.url = url

    @property
    def content(self):
        '''
        Fetch the URL and return the ``content`` of the response. A new
        scraper (with retry_attempts=2) is created, and a new request is
        made, on every access.
        '''
        s = scrapelib.Scraper(retry_attempts=2)
        response = s.get(self.url)
        return response.content

    @property
    def filename(self):
        '''
        Return the filename from the URL, less any percent signs from escaped
        characters (commas, spaces, etc.), as these will confuse Make.

        Only the path component of the URL is considered, so a query string
        or fragment never ends up in the filename.
        '''
        # Taking the basename of the raw URL would keep "?query" / "#fragment"
        # text (or pick the wrong segment if the query contains a slash).
        url_path = urlparse(self.url).path
        return os.path.basename(url_path).replace('%', '')


class BoardReportLink(DocumentLink):
    '''
    A board report served by the Legistar report viewer, e.g.:

    https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID=7936&GUID=LATEST&Title=Board+Report
    '''
    @property
    def filename(self):
        '''
        Build a unique filename from the report ID found in the "ID" query
        parameter of the URL.

        Raises KeyError when the URL has no "ID" parameter, and ValueError
        when the parameter does not have exactly one value.
        '''
        query_string = urlparse(self.url).query
        query_params = parse_qs(query_string)

        # Unpacking into a one-element target insists on exactly one ID value.
        [board_report_id] = query_params['ID']

        return f'board_report_{board_report_id}.pdf'
15 changes: 5 additions & 10 deletions scripts/download_attachments.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,19 @@
import os
import sys

import scrapelib
from document_link import Link


s = scrapelib.Scraper(retry_attempts=2)
filenames = []

for attachment_link in json.loads(
os.environ['attachment_links'].replace('\'', '"')
):
attachment = s.get(attachment_link)
link = Link(attachment_link)

if 'https://metro.legistar.com/ViewReport.ashx' in attachment_link:
filename = 'root.pdf'
else:
filename = os.path.basename(attachment_link)
with open(os.path.join('attachments', filename), 'wb') as file:
file.write(attachment.content)
with open(os.path.join('attachments', link.filename), 'wb') as file:
file.write(link.content)

filenames.append(os.path.join('attachments', filename))
filenames.append(os.path.join('attachments', link.filename))

sys.stdout.write(' '.join(filenames))

0 comments on commit 938e7ba

Please sign in to comment.