Skip to content

Commit

Permalink
Omit attachments that are not of the expected filetypes
Browse files Browse the repository at this point in the history
  • Loading branch information
hancush committed Nov 5, 2021
1 parent f5078d7 commit 6e84626
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 6 deletions.
24 changes: 18 additions & 6 deletions scripts/document_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,15 @@


def Link(url):
    '''
    Return the link wrapper that matches the given attachment URL.

    URLs that start with https://metro.legistar.com/ViewReport.ashx are
    wrapped in a BoardReportLink; every other URL is wrapped in a
    DocumentLink.
    '''
    is_board_report = url.startswith('https://metro.legistar.com/ViewReport.ashx')
    link_class = BoardReportLink if is_board_report else DocumentLink
    return link_class(url)


class DocumentLink(object):
'''
https://metro.legistar1.com/metro/meetings/2021/10/2100_A_Operations%2C_Safety%2C_and_Customer_Experience_Committee_21-10-21_Agenda.pdf
http://metro.legistar1.com/metro/attachments/6484cdc7-3c2a-4598-abc5-9d846771158e.pdf
'''
class _Link(object):

def __init__(self, url):
    '''
    Store the URL this link object wraps.
    '''
    self.url = url

Expand All @@ -26,6 +24,20 @@ def content(self):
response = s.get(self.url, retry_on_404=True)
return response.content

@property
def filename(self):
    '''
    Placeholder for the name of the linked file.

    This implementation always raises NotImplementedError, and
    filetype reads this property to derive the extension.
    '''
    raise NotImplementedError()

@property
def filetype(self):
    '''
    Return the lower-cased extension of self.filename, including the
    leading dot (e.g. '.pdf'), or '' when the filename has no extension.

    The extension is lower-cased so that a file named 'Agenda.PDF' is
    matched by an exact comparison against lowercase extensions such
    as '.pdf' instead of being silently treated as a different type.
    '''
    extension = os.path.splitext(self.filename)[1]
    return extension.lower()


class DocumentLink(_Link):
'''
https://metro.legistar1.com/metro/meetings/2021/10/2100_A_Operations%2C_Safety%2C_and_Customer_Experience_Committee_21-10-21_Agenda.pdf
http://metro.legistar1.com/metro/attachments/6484cdc7-3c2a-4598-abc5-9d846771158e.pdf
'''
@property
def filename(self):
'''
Expand All @@ -35,7 +47,7 @@ def filename(self):
return re.sub(r'%', '', os.path.basename(self.url))


class BoardReportLink(DocumentLink):
class BoardReportLink(_Link):
'''
https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID=7936&GUID=LATEST&Title=Board+Report
'''
Expand Down
13 changes: 13 additions & 0 deletions scripts/download_attachments.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,26 @@


if __name__ == '__main__':
ACCEPTED_FILETYPES = (
'.pdf',
'.xlsx',
'.doc',
'.docx',
'.ppt',
'.pptx',
'.rtf',
)

filenames = []

for attachment_link in json.loads(
os.environ['attachment_links'].replace('\'', '"')
):
link = Link(attachment_link)

if link.filetype not in ACCEPTED_FILETYPES:
continue

with open(os.path.join('attachments', link.filename), 'wb') as file:
file.write(link.content)

Expand Down

0 comments on commit 6e84626

Please sign in to comment.