From 7e7aa6d772d9164f686f6a56861b8f8835b069c5 Mon Sep 17 00:00:00 2001
From: hancush
Date: Thu, 21 Oct 2021 14:44:52 -0500
Subject: [PATCH] Initial commit

---
 .gitignore                      |  1 +
 Dockerfile                      | 23 +++++++++++++++++++++++
 Makefile                        | 30 ++++++++++++++++++++++++++++++
 attachments/.gitkeep            |  0
 docker-compose.yml              | 14 ++++++++++++++
 merged/.gitkeep                 |  0
 requirements.txt                |  1 +
 scripts/download_attachments.py | 36 ++++++++++++++++++++++++++++++++++++
 8 files changed, 105 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 Makefile
 create mode 100644 attachments/.gitkeep
 create mode 100644 docker-compose.yml
 create mode 100644 merged/.gitkeep
 create mode 100644 requirements.txt
 create mode 100644 scripts/download_attachments.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..e43b0f9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.DS_Store
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..a3c142f
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,23 @@
+FROM python:3.9-slim
+
+LABEL maintainer "DataMade "
+
+RUN apt-get update && \
+    apt-get install -y make libreoffice poppler-utils wget && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN mkdir /app
+WORKDIR /app
+
+RUN which unoconv || ( \
+    UNOCONV_PATH=/unoconv && \
+    wget -P $UNOCONV_PATH https://raw.githubusercontent.com/dagwieers/unoconv/master/unoconv && \
+    chmod 755 $UNOCONV_PATH/unoconv && \
+    sed -i 's;#!/usr/bin/env python;#!/usr/bin/python3;' $UNOCONV_PATH/unoconv && \
+    ln -s $UNOCONV_PATH/unoconv /usr/bin/unoconv \
+)
+
+COPY ./requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY . /app
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..3d8fb64
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,30 @@
+ATTACHMENTS := $(shell python scripts/download_attachments.py)
+
+.PHONY : clean
+clean :
+	find attachments -type f -not -name .gitkeep -delete
+	find merged -type f -not -name .gitkeep -delete
+
+upload_% : merged/%.pdf
+	aws s3
+
+merged/%.pdf : $(addsuffix .pdf,$(basename $(ATTACHMENTS)))
+	pdfunite $^ $@
+
+attachments/%.pdf : attachments/%.xlsx
+	unoconv -f pdf $<
+
+attachments/%.pdf : attachments/%.doc
+	unoconv -f pdf $<
+
+attachments/%.pdf : attachments/%.docx
+	unoconv -f pdf $<
+
+attachments/%.pdf : attachments/%.ppt
+	unoconv -f pdf $<
+
+attachments/%.pdf : attachments/%.pptx
+	unoconv -f pdf $<
+
+attachments/%.pdf : attachments/%.rtf
+	unoconv -f pdf $<
diff --git a/attachments/.gitkeep b/attachments/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..32892d7
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,14 @@
+version: '2.4'
+
+services:
+  merger:
+    image: councilmatic-document-merger
+    container_name: councilmatic-document-merger
+    build: .
+    stdin_open: true
+    tty: true
+    volumes:
+      - .:/app
+    environment:
+      attachment_links: '["https://metro.legistar.com/ViewReport.ashx?M=R&N=TextL5&GID=557&ID=7916&GUID=LATEST&Title=Board+Report", "http://metro.legistar1.com/metro/attachments/d368424c-b80c-4f9a-aa1d-d353194ee733.pdf", "http://metro.legistar1.com/metro/attachments/f4031730-38c1-48a3-a789-09a3f5c5862a.pdf", "http://metro.legistar1.com/metro/attachments/53d3670b-3aa3-4823-ac17-51e032395641.pdf", "http://metro.legistar1.com/metro/attachments/53985307-4ce2-4688-83e0-42c4c7a17f0e.pdf", "http://metro.legistar1.com/metro/attachments/c96860a8-a26d-4022-9b6c-ca010c3d165e.docx"]'
+    command: make merged/2021-0530.pdf
\ No newline at end of file
diff --git a/merged/.gitkeep b/merged/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..3beea6b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+scrapelib
diff --git a/scripts/download_attachments.py b/scripts/download_attachments.py
new file mode 100644
index 0000000..49f8891
--- /dev/null
+++ b/scripts/download_attachments.py
@@ -0,0 +1,36 @@
+'''
+Download the attachments listed in the attachment_links environment variable
+into the attachments/ directory and write a space-delimited list of the
+downloaded file paths to stdout.
+'''
+import json
+import os
+import sys
+from urllib.parse import urlparse
+
+import scrapelib
+
+
+s = scrapelib.Scraper(retry_attempts=1)
+filenames = []
+
+# attachment_links holds a JSON array of URLs; single quotes are swapped for
+# double quotes so a single-quoted list parses as well.
+for attachment_link in json.loads(
+    os.environ['attachment_links'].replace('\'', '"')
+):
+    attachment = s.get(attachment_link)
+
+    if 'https://metro.legistar.com/ViewReport.ashx' in attachment_link:
+        filename = 'root.pdf'
+    else:
+        # Use only the URL path so a query string or fragment never leaks
+        # into the filename.
+        filename = os.path.basename(urlparse(attachment_link).path)
+
+    with open(os.path.join('attachments', filename), 'wb') as file:
+        file.write(attachment.content)
+
+    filenames.append(os.path.join('attachments', filename))
+
+sys.stdout.write(' '.join(filenames))