Skip to content

Commit

Permalink
Merge pull request #13 from nationalarchives/754-no-overwrite-custom-pdf
Browse files Browse the repository at this point in the history
Check first that there isn't a custom-pdf that would be overwritten
  • Loading branch information
dragon-dxw authored Apr 14, 2023
2 parents 2cae495 + ee0627c commit 35dee56
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 54 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ repos:
- id: check-yaml

- repo: https://github.com/psf/black
rev: 23.1.0
rev: 23.3.0
hooks:
- id: black

Expand Down
20 changes: 19 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,24 @@ file for you, if you want to remake every PDF that's backed by a docx file.
## Local setup

1. From ds-caselaw-ingester, run `docker-compose up` to launch the Localstack container
2. From ds-caselaw-pdfconversion, run `scripts/setup-localstack` to set up the queues etc.
2. From ds-caselaw-pdfconversion, run `scripts/setup-localstack.sh` to set up the queues etc.
3. From ds-caselaw-pdfconversion, run `docker-compose up --build` to launch the LibreOffice container
(`--build` will ensure the converter script is in the docker container)

### Local testing

`pytest queue_listener/tests.py` will run unit tests.

Manual integration tests, once you have completed the Local setup steps above:

You should see output like:
```
Downloading judgment.docx
...
Uploaded judgment.pdf
```
on startup.

Running `scripts/upload_custom_file.sh` will do nothing visible, but running `scripts/upload_file.sh` afterwards should not upload a PDF; instead it should display the message:

`existing 'judgment.pdf' is from custom-pdfs, pdf-conversion is not overwriting it`
150 changes: 98 additions & 52 deletions queue_listener/queue_listener.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,16 @@
import subprocess

import boto3
import botocore
import dotenv
import rollbar

dotenv.load_dotenv()

rollbar.init(os.getenv("ROLLBAR_ACCESS_TOKEN"))
rollbar.init(
os.getenv("ROLLBAR_ACCESS_TOKEN"),
environment=os.getenv("ROLLBAR_ENV", default="unknown"),
)
QUEUE_URL = os.getenv("QUEUE_URL")
# should be UNSET whenever using actual AWS
# but set if we're using localstack
Expand All @@ -26,60 +30,102 @@
endpoint_url=ENDPOINT_URL,
)

while True:
print("Polling")

def would_replace_custom_pdf(bucket_name, upload_key):
    """
    Report whether uploading to `upload_key` would overwrite a custom PDF.

    If a PDF file with the target name already exists, and has metadata of
    'custom-pdfs', we should not overwrite the file with an automatically
    generated PDF.

    Args:
        bucket_name: name of the S3 bucket to look in.
        upload_key: object key the generated PDF would be written to.

    Returns:
        True if an object exists at `upload_key` and its `pdfsource` metadata
        is "custom-pdfs"; False if the object does not exist or has any other
        (or no) `pdfsource`.

    Raises:
        botocore.exceptions.ClientError: for any S3 error other than the
            object not existing.
    """
    # get the metadata from S3
    try:
        response = s3_client.head_object(Bucket=bucket_name, Key=upload_key)
    except botocore.exceptions.ClientError as exception:
        error = exception.response.get("Error", {})
        # We expect in most instances that the file won't exist, and that's OK.
        # A missing object is recognised by the "404" error code as well as by
        # the "Not Found" message, so we don't depend on the wording alone.
        if error.get("Code") != "404" and error.get("Message") != "Not Found":
            raise
        return False

    # get the source of the document from the S3 metadata
    headers = response.get("ResponseMetadata", {}).get("HTTPHeaders", {})
    source = headers.get("x-amz-meta-pdfsource")
    if source is None:
        # boto3 also returns user metadata (without the "x-amz-meta-" prefix)
        # under "Metadata"; fall back to it if the raw header is absent.
        source = response.get("Metadata", {}).get("pdfsource")

    return source == "custom-pdfs"


def handle_message(message):
    """
    Convert each document named in one SQS message to PDF, then delete the message.

    The message body is parsed as JSON; for every entry in its "Records" list
    the referenced S3 object is downloaded to /tmp, converted with LibreOffice
    (`soffice`), and the resulting PDF is uploaded next to the original with
    the extension replaced by ".pdf". A record is skipped (and a warning sent
    to Rollbar) when the target PDF already exists and came from custom-pdfs.

    Args:
        message: an SQS message dict; "Body" and "ReceiptHandle" are read.

    Raises:
        Anything raised by the S3/SQS clients or by JSON parsing propagates.
        In that case the message is NOT deleted from the queue. A missing
        converted PDF is reported to Rollbar rather than raised.
    """
    # Imported locally so this function does not depend on the module's
    # import block being changed.
    from urllib.parse import unquote_plus

    print(message)
    json_body = json.loads(message["Body"])
    for record in json_body.get("Records", []):
        bucket_name = record["s3"]["bucket"]["name"]  # or ['arn']
        # S3 event notifications URL-encode the object key (a space arrives
        # as "+", for example), so decode it before using it as a real key.
        download_key = unquote_plus(record["s3"]["object"]["key"])
        etag = record["s3"]["object"]["eTag"].replace('"', "")
        docx_filename = f"/tmp/{etag}.docx"
        pdf_filename = f"/tmp/{etag}.pdf"

        # Replace the extension (if there is one) with ".pdf". splitext only
        # considers the final path component, so dots in "directory" parts of
        # the key are left alone and a key without an extension is kept whole.
        key_no_extension, _extension = os.path.splitext(download_key)
        upload_key = key_no_extension + ".pdf"

        if would_replace_custom_pdf(bucket_name, upload_key):
            rollbar_message = f"existing '{upload_key}' is from custom-pdfs, pdf-conversion is not overwriting it"
            rollbar.report_message(rollbar_message, "warning")
            print(rollbar_message)
            continue

        try:
            print(f"Downloading {download_key}")
            s3_client.download_file(
                Bucket=bucket_name, Key=download_key, Filename=docx_filename
            )

            # Pass the arguments as a list so the filename is always a single
            # argument, whatever characters it contains.
            print(
                subprocess.run(
                    [
                        "soffice",
                        "--convert-to",
                        "pdf",
                        docx_filename,
                        "--outdir",
                        "/tmp",
                    ]
                )
            )

            # NOTE: there's a risk that the local pdf file doesn't exist, we need to handle that case.
            try:
                s3_client.upload_file(
                    Bucket=bucket_name,
                    Key=upload_key,
                    Filename=pdf_filename,
                    ExtraArgs={
                        "ContentType": "application/pdf",
                        "Metadata": {"pdfsource": "pdf-conversion-libreoffice"},
                    },
                )
                print(f"Uploaded {upload_key}")
            except FileNotFoundError as exception:
                print("LibreOffice probably didn't create a PDF for the input document.")
                rollbar.report_exc_info()
                print(exception)
        finally:
            # Always clear up the temporary files, even if the download,
            # conversion or upload raised, so /tmp doesn't fill up.
            for file_to_delete in (pdf_filename, docx_filename):
                try:
                    os.remove(file_to_delete)
                except FileNotFoundError:
                    pass

    # afterwards:
    sqs_client.delete_message(
        QueueUrl=QUEUE_URL, ReceiptHandle=message["ReceiptHandle"]
    )


def poll_once():
    """
    Receive one batch of messages from the queue and handle each of them.

    Calls SQS `receive_message` on QUEUE_URL, passing POLL_SECONDS as
    `WaitTimeSeconds`, and hands every message in the response's "Messages"
    list to `handle_message`. If the response has no "Messages" key, nothing
    is handled.
    """
    print("Polling...")
    messages_dict = sqs_client.receive_message(
        QueueUrl=QUEUE_URL, WaitTimeSeconds=POLL_SECONDS
    )
    for message in messages_dict.get("Messages", []):
        handle_message(message)


if __name__ == "__main__":
    # Only poll forever when run as a script, so that importing this module
    # (as queue_listener/tests.py does) doesn't start the loop.
    while True:
        poll_once()
65 changes: 65 additions & 0 deletions queue_listener/tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from unittest.mock import patch

import pytest
from botocore.exceptions import ClientError

import queue_listener


# TRUTHY
@patch(
    "queue_listener.s3_client.head_object",
    return_value={
        "ResponseMetadata": {
            "HTTPHeaders": {"x-amz-meta-pdfsource": "custom-pdfs"},
        },
    },
)
def test_would_replace_is_custom(head_object):
    """The existing object's pdfsource header is custom-pdfs, so it is protected."""
    would_replace = queue_listener.would_replace_custom_pdf("", "")

    assert would_replace
    head_object.assert_called_once()


# FALSEY
@patch(
    "queue_listener.s3_client.head_object",
    return_value={
        "ResponseMetadata": {
            "HTTPHeaders": {"x-amz-meta-pdfsource": "kitten"},
        },
    },
)
def test_would_replace_not_custom(head_object):
    """The existing object has a pdfsource header, but not the custom-pdfs one."""
    would_replace = queue_listener.would_replace_custom_pdf("", "")

    assert not would_replace
    head_object.assert_called_once()


@patch(
    "queue_listener.s3_client.head_object",
    return_value={},
)
def test_would_replace_is_empty(head_object):
    """The object exists, but the response carries no pdfsource header at all."""
    would_replace = queue_listener.would_replace_custom_pdf("", "")

    assert not would_replace
    head_object.assert_called_once()


@patch(
    "queue_listener.s3_client.head_object",
    side_effect=ClientError(
        error_response={"Error": {"Message": "Not Found"}},
        operation_name="",
    ),
)
def test_would_replace_is_not_found(head_object):
    """No object exists at the key, so there is nothing that could be overwritten."""
    would_replace = queue_listener.would_replace_custom_pdf("", "")

    assert not would_replace
    head_object.assert_called_once()


# ERRORY
@patch(
    "queue_listener.s3_client.head_object",
    side_effect=ClientError(
        error_response={"Error": {"Message": "Out Of Cheese"}},
        operation_name="",
    ),
)
def test_would_replace_is_bad_response(head_object):
    """Any S3 error other than 'Not Found' is unexpected and must be re-raised."""
    with pytest.raises(ClientError):
        queue_listener.would_replace_custom_pdf("", "")

    # Checked after the `with` block: nothing following the raising call
    # inside it would ever execute.
    head_object.assert_called_once()
3 changes: 3 additions & 0 deletions scripts/upload_custom_file.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Local-testing helper: uploads a PDF marked as coming from custom-pdfs, so
# the queue listener's "don't overwrite a custom PDF" check has something to
# find.
# NOTE(review): there is no shebang and the paths are relative — presumably
# meant to be run with bash from the repository root; confirm.

# Load settings from the .env file.
source .env

# Copy data/judgment.pdf into private-asset-bucket with user metadata
# pdfsource=custom-pdfs (the value would_replace_custom_pdf looks for).
awslocal s3 cp data/judgment.pdf s3://private-asset-bucket --metadata '{"pdfsource": "custom-pdfs"}'

0 comments on commit 35dee56

Please sign in to comment.