Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
hancush committed Oct 21, 2021
0 parents commit 7e7aa6d
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.DS_Store
23 changes: 23 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
FROM python:3.9-slim

# key=value form; the space-separated LABEL syntax is a legacy format.
LABEL maintainer="DataMade <[email protected]>"

# System tools used by the project's Makefile and by the unoconv install step
# below: make drives the build, poppler-utils provides pdfunite, wget fetches
# the unoconv script. libreoffice is presumably the conversion backend that
# unoconv drives.
RUN apt-get update && \
    apt-get install -y make libreoffice poppler-utils wget && \
    rm -rf /var/lib/apt/lists/*

# WORKDIR creates /app when it does not exist, so no separate mkdir is needed.
WORKDIR /app

# Install unoconv only when it is not already on the PATH. The script is
# fetched from the upstream master branch (unpinned), its shebang is rewritten
# to /usr/bin/python3, and it is symlinked into /usr/bin.
RUN which unoconv || ( \
        UNOCONV_PATH=/unoconv && \
        wget -P $UNOCONV_PATH https://raw.githubusercontent.com/dagwieers/unoconv/master/unoconv && \
        chmod 755 $UNOCONV_PATH/unoconv && \
        sed -i 's;#!/usr/bin/env python;#!/usr/bin/python3;' $UNOCONV_PATH/unoconv && \
        ln -s $UNOCONV_PATH/unoconv /usr/bin/unoconv \
    )

# Copy the dependency manifest on its own first so the pip layer stays cached
# until requirements.txt changes.
COPY ./requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY . /app
29 changes: 29 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Download the attachments at parse time and capture the space-delimited list
# of downloaded file paths printed by the script. Skipped when `clean` is the
# only requested goal, so cleaning does not first re-download everything.
ifneq ($(MAKECMDGOALS),clean)
ATTACHMENTS := $(shell python scripts/download_attachments.py)
endif

# `clean` names an action, not a file; declare it phony so a stray file called
# "clean" can never make the target look up to date.
.PHONY : clean
clean :
	find attachments -type f -not -name .gitkeep -delete
	find merged -type f -not -name .gitkeep -delete

# NOTE(review): the recipe below is `aws s3` with no subcommand or arguments;
# it looks like an unfinished placeholder for the upload step.
upload_% : merged/%.pdf
	aws s3

# Merge the PDF version of every attachment into one document. Each attachment
# path has its extension swapped for .pdf, so non-PDF attachments are converted
# first by the pattern rules below.
merged/%.pdf : $(addsuffix .pdf,$(basename $(ATTACHMENTS)))
	pdfunite $^ $@

# Convert office documents to PDF with unoconv, one pattern rule per supported
# source extension.
attachments/%.pdf : attachments/%.xlsx
	unoconv -f pdf $<

attachments/%.pdf : attachments/%.doc
	unoconv -f pdf $<

attachments/%.pdf : attachments/%.docx
	unoconv -f pdf $<

attachments/%.pdf : attachments/%.ppt
	unoconv -f pdf $<

attachments/%.pdf : attachments/%.pptx
	unoconv -f pdf $<

attachments/%.pdf : attachments/%.rtf
	unoconv -f pdf $<
Empty file added attachments/.gitkeep
Empty file.
14 changes: 14 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Compose definition for the document merger: builds the image from the local
# Dockerfile, mounts the working tree at /app, and runs a single make target.
version: "2.4"

services:
  merger:
    build: .
    image: councilmatic-document-merger
    container_name: councilmatic-document-merger
    # Keep an interactive terminal attached to the container.
    tty: true
    stdin_open: true
    volumes:
      - .:/app
    environment:
      # JSON-encoded list of attachment URLs read by
      # scripts/download_attachments.py.
      attachment_links: '["https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID=7916&GUID=LATEST&Title=Board+Report", "http://metro.legistar1.com/metro/attachments/d368424c-b80c-4f9a-aa1d-d353194ee733.pdf", "http://metro.legistar1.com/metro/attachments/f4031730-38c1-48a3-a789-09a3f5c5862a.pdf", "http://metro.legistar1.com/metro/attachments/53d3670b-3aa3-4823-ac17-51e032395641.pdf", "http://metro.legistar1.com/metro/attachments/53985307-4ce2-4688-83e0-42c4c7a17f0e.pdf", "http://metro.legistar1.com/metro/attachments/c96860a8-a26d-4022-9b6c-ca010c3d165e.docx"]'
    command: make merged/2021-0530.pdf
Empty file added merged/.gitkeep
Empty file.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
scrapelib
29 changes: 29 additions & 0 deletions scripts/download_attachments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
'''
Download specified attachments into the attachments/ directory and return
space-delimited list of attachment filenames.
'''
import json
import os
import sys

import scrapelib


ROOT_REPORT_URL = 'https://metro.legistar.com/ViewReport.ashx'
ATTACHMENTS_DIR = 'attachments'


def attachment_filename(attachment_link):
    '''
    Return the local filename to use for the given attachment URL.

    Links containing the Legistar ViewReport URL are saved as root.pdf. Every
    other link is saved under the last segment of its path, with any query
    string or fragment removed.
    '''
    if ROOT_REPORT_URL in attachment_link:
        return 'root.pdf'

    # A "?query" or "#fragment" is not part of the file name, and leaving it in
    # would defeat extension-based handling of the downloaded file.
    path = attachment_link.split('#', 1)[0].split('?', 1)[0]
    return os.path.basename(path)


def main():
    '''
    Download every link listed in the attachment_links environment variable
    into the attachments/ directory and write the space-delimited list of
    saved paths to stdout.
    '''
    raw_links = os.environ.get('attachment_links')

    if raw_links is None:
        # Report the problem without a traceback and emit an empty list, so
        # invocations that do not need attachments still get clean output.
        sys.stderr.write('attachment_links is not set; nothing to download\n')
        raw_links = '[]'

    # The value may arrive with single-quoted strings; normalise to valid JSON.
    attachment_links = json.loads(raw_links.replace('\'', '"'))

    os.makedirs(ATTACHMENTS_DIR, exist_ok=True)

    s = scrapelib.Scraper(retry_attempts=1)
    filenames = []

    for attachment_link in attachment_links:
        attachment = s.get(attachment_link)
        path = os.path.join(ATTACHMENTS_DIR, attachment_filename(attachment_link))

        with open(path, 'wb') as file:
            file.write(attachment.content)

        filenames.append(path)

    sys.stdout.write(' '.join(filenames))


if __name__ == '__main__':
    main()

0 comments on commit 7e7aa6d

Please sign in to comment.