Skip to content

Commit

Permalink
Merge pull request #314 from open-contracting/232-export-files-improv…
Browse files Browse the repository at this point in the history
…ements

232 export files improvements
  • Loading branch information
yolile authored Nov 10, 2023
2 parents bd49556 + 005d568 commit 9dc804b
Show file tree
Hide file tree
Showing 14 changed files with 78 additions and 72 deletions.
3 changes: 1 addition & 2 deletions core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,12 @@
"django.contrib.humanize",
"data_registry",
"markdownx",
"exporter",
]

MIDDLEWARE = [
"django.middleware.security.SecurityMiddleware",
# Add before GZipMiddleware to modify its response.
"exporter.middleware.ContentEncodingMiddleware",
"data_registry.middleware.ContentEncodingMiddleware",
# This site is not affected by BREACH.
# https://docs.djangoproject.com/en/4.2/ref/middleware/#django.middleware.gzip.GZipMiddleware
"django.middleware.gzip.GZipMiddleware",
Expand Down
1 change: 0 additions & 1 deletion core/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

urlpatterns = [
path("", include("data_registry.urls"), name="data-registry"),
path("", include("exporter.urls"), name="exporter"),
path("admin/", admin.site.urls),
path("markdownx/", include("markdownx.urls")),
]
Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions data_registry/templates/includes/files.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
<ul>
{% if files.full %}
<li>
<a href="{% url 'download-export' %}?spider={{ collection.source_id }}&job_id={{ job.id }}&full=true&suffix={{ suffix }}" rel="nofollow" download>{% translate "All time" %}</a>
<a href="{% url 'download' collection.id %}?name=full.{{ suffix }}" rel="nofollow" download>{% translate "All time" %}</a>
<span class="text-muted small">{{ files.full|humanfilesize }}</span>
</li>
{% endif %}
{% for file in files.by_year|dictsortreversed:"year" %}
<li>
<a href="{% url 'download-export' %}?spider={{ collection.source_id }}&job_id={{ job.id }}&year={{ file.year }}&suffix={{ suffix }}" rel="nofollow" download>{{ file.year }}</a>
<a href="{% url 'download' collection.id %}?name={{ file.year }}.{{ suffix }}" rel="nofollow" download>{{ file.year }}</a>
<span class="text-muted small">{{ file.size|humanfilesize }}</span>
</li>
{% empty %}
Expand Down
1 change: 1 addition & 0 deletions data_registry/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
path("", views.index, name="index"),
path("search/", views.search, name="search"),
path("publication/<int:id>", views.detail, name="detail"),
path("publication/<int:id>/download", views.download_export, name="download"),
# https://code.djangoproject.com/ticket/26556
path("i18n/setlang/", i18n.set_language, name="set-language"),
# Uncomment after re-integrating Spoonbill.
Expand Down
30 changes: 28 additions & 2 deletions data_registry/views.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import re
import string
from collections import defaultdict
from datetime import date, datetime, timedelta
Expand All @@ -11,20 +12,21 @@
from django.contrib.postgres.aggregates import ArrayAgg
from django.db.models import Count, OuterRef, Q, Subquery
from django.db.models.functions import Substr
from django.http.response import HttpResponse, JsonResponse
from django.http.response import FileResponse, HttpResponse, HttpResponseBadRequest, HttpResponseNotFound, JsonResponse
from django.shortcuts import get_object_or_404, redirect, render
from django.utils.translation import get_language, get_language_from_request
from django.utils.translation import gettext as _

from data_registry.models import Collection, Job
from data_registry.util import collection_queryset
from exporter.util import Export
from exporter.util import Export, TaskStatus

logger = logging.getLogger(__name__)

alphabets = defaultdict(lambda: string.ascii_uppercase)
# https://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode#Basic_Cyrillic_alphabet
alphabets["ru"] = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
EXPORT_PATTERN = re.compile(r"\A(full|\d{4})\.(jsonl\.gz|csv\.tar\.gz|xlsx)\Z")


def index(request):
Expand Down Expand Up @@ -174,6 +176,30 @@ def spiders(request):
return JsonResponse(json.get("spiders"), safe=False)


def download_export(request, id):
    """
    Serve an export file of a collection's active job as a download.

    The file is selected with the ``name`` query string parameter, which must match ``EXPORT_PATTERN``
    (``full`` or a four-digit year, followed by a supported suffix).

    :param request: the HTTP request
    :param id: the primary key of the collection, from the URL
    :returns: a 400 response if ``name`` is missing or invalid, a 404 response if the collection or its
        active job isn't found or if the export isn't completed, otherwise the file as an attachment
    """
    # Default to an empty string: if the parameter is omitted, .get() would return None, and
    # EXPORT_PATTERN.match(None) would raise TypeError (a 500 error) instead of returning a 400 response.
    name = request.GET.get("name", "")

    # Guard against path traversal.
    if not EXPORT_PATTERN.match(name):
        return HttpResponseBadRequest("The name query string parameter is invalid")

    collection = get_object_or_404(collection_queryset(request), id=id)
    active_job = get_object_or_404(collection.job, active=True)

    export = Export(active_job.id, basename=name)
    if export.status != TaskStatus.COMPLETED:
        return HttpResponseNotFound("File not found")

    # The opened file is handed to FileResponse, which streams it to the client.
    return FileResponse(
        export.path.open("rb"),
        as_attachment=True,
        filename=f"{collection.source_id}_{name}",
        # Set Content-Encoding to skip GZipMiddleware. (ContentEncodingMiddleware removes the empty header.)
        # https://docs.djangoproject.com/en/4.2/ref/middleware/#module-django.middleware.gzip
        headers={"Content-Encoding": ""},
    )


def excel_data(request, job_id, job_range=None):
job = Job.objects.get(id=job_id)
export = Export(job_id)
Expand Down
5 changes: 0 additions & 5 deletions exporter/apps.py

This file was deleted.

7 changes: 0 additions & 7 deletions exporter/urls.py

This file was deleted.

34 changes: 0 additions & 34 deletions exporter/views.py

This file was deleted.

2 changes: 1 addition & 1 deletion tests/data_registry/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def setUpTestData(cls):
@patch("exporter.util.Export.get_files")
def test_detail(self, get_files):
get_files.return_value = {"jsonl": {"by_year": [{"year": 2022, "size": 1}]}}
url = f"/api/download_export?spider=paraguay_dncp_records&job_id={self.job.id}&year=2022&suffix=jsonl.gz"
url = f"/en/publication/{self.collection.id}/download?name=2022.jsonl.gz"

with self.assertNumQueries(2):
response = Client().get(f"/en/publication/{self.collection.id}")
Expand Down
63 changes: 45 additions & 18 deletions tests/exporter/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,64 @@

from django.test import Client, TestCase, override_settings

from data_registry.models import Collection


@override_settings(EXPORTER_DIR=os.path.join("tests", "fixtures"))
class ViewsTests(TestCase):
# Create the collections and jobs that the download tests request.
# NOTE(review): setUp is declared as a classmethod, so the objects below are stored as class attributes;
# confirm whether setUpTestData was intended.
@classmethod
def setUp(cls):
# Public collection whose active job has id=2. The completed-download test reads its expected content from
# the "2" directory under tests/fixtures.
cls.collection = Collection.objects.create(
id=2,
title="Dirección Nacional de Contrataciones Públicas (DNCP)",
source_id="abc",
public=True,
)
cls.job = cls.collection.job.create(
active=True,
id=2,
)
# Despite its name, this collection is given an active job (id=4) below. The "waiting" test expects a 404
# response for it; presumably no export files exist for job 4 in the fixtures - TODO confirm.
cls.collection_no_job = Collection.objects.create(
id=3,
title="Test",
source_id="abc",
public=True,
)
cls.collection_no_job.job.create(
active=True,
id=4,
)

def test_collection_not_found(self):
with self.assertNumQueries(1):
response = Client().get("/en/publication/10/download?name=2000.jsonl.gz")

self.assertEqual(response.status_code, 404)

def test_download_export_invalid_suffix(self):
with self.assertNumQueries(0):
response = Client().get("/api/download_export?suffix=invalid")
response = Client().get(f"/en/publication/{self.collection.id}/download?name=invalid")

self.assertEqual(response.status_code, 400)
self.assertEqual(response.content, b"Suffix not recognized")
self.assertEqual(response.content, b"The name query string parameter is invalid")

def test_download_export_empty_parameter(self):
for parameter in ("job_id", "year"):
with self.subTest(parameter=parameter):
with self.assertNumQueries(0):
response = Client().get(f"/api/download_export?suffix=jsonl.gz&{parameter}=")
with self.assertNumQueries(0):
response = Client().get(f"/en/publication/{self.collection.id}/download?name=")

self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b"File not found")
self.assertEqual(response.status_code, 400)
self.assertEqual(response.content, b"The name query string parameter is invalid")

def test_download_export_waiting(self):
with self.assertNumQueries(0):
response = Client().get("/api/download_export?suffix=jsonl.gz&year=2000&job_id=0")

with self.assertNumQueries(2):
response = Client().get(f"/en/publication/{self.collection_no_job.id}/download?name=2000.jsonl.gz")
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b"File not found")

@patch("exporter.util.Export.lockfile", new_callable=PropertyMock)
def test_download_export_running(self, exists):
with self.assertNumQueries(0):
response = Client().get("/api/download_export?suffix=jsonl.gz&year=2000&job_id=1")
with self.assertNumQueries(2):
response = Client().get(f"/en/publication/{self.collection.id}/download?name=2000.jsonl.gz")

self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b"File not found")
Expand All @@ -44,12 +72,11 @@ def test_download_export_completed(self):
("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
):
with self.subTest(suffix=suffix):
with self.assertNumQueries(0):
with self.assertNumQueries(2):
response = Client().get(
f"/api/download_export?suffix={suffix}&year=2000&job_id=1&spider=abc",
f"/en/publication/{self.collection.id}/download?name=2000.{suffix}",
HTTP_ACCEPT_ENCODING="gzip",
)

self.assertEqual(response.status_code, 200)
response.headers.pop("Content-Length")
self.assertDictEqual(
Expand All @@ -60,10 +87,10 @@ def test_download_export_completed(self):
"Content-Type": content_type,
"Cross-Origin-Opener-Policy": "same-origin",
"Referrer-Policy": "same-origin",
"Vary": "Accept-Language",
"Vary": "Cookie",
"X-Content-Type-Options": "nosniff",
"X-Frame-Options": "DENY",
},
)
with open(os.path.join("tests", "fixtures", "1", f"2000.{suffix}"), "rb") as f:
with open(os.path.join("tests", "fixtures", "2", f"2000.{suffix}"), "rb") as f:
self.assertEqual(b"".join(response.streaming_content), f.read())
File renamed without changes.
File renamed without changes.
File renamed without changes.

0 comments on commit 9dc804b

Please sign in to comment.