diff --git a/core/settings.py b/core/settings.py
index d639b0eb..6a6f7f17 100644
--- a/core/settings.py
+++ b/core/settings.py
@@ -54,13 +54,12 @@
"django.contrib.humanize",
"data_registry",
"markdownx",
- "exporter",
]
MIDDLEWARE = [
"django.middleware.security.SecurityMiddleware",
# Add before GZipMiddleware to modify its response.
- "exporter.middleware.ContentEncodingMiddleware",
+ "data_registry.middleware.ContentEncodingMiddleware",
# This site is not affected by BREACH.
# https://docs.djangoproject.com/en/4.2/ref/middleware/#django.middleware.gzip.GZipMiddleware
"django.middleware.gzip.GZipMiddleware",
diff --git a/core/urls.py b/core/urls.py
index 73a61136..8ba19bde 100644
--- a/core/urls.py
+++ b/core/urls.py
@@ -4,7 +4,6 @@
urlpatterns = [
path("", include("data_registry.urls"), name="data-registry"),
- path("", include("exporter.urls"), name="exporter"),
path("admin/", admin.site.urls),
path("markdownx/", include("markdownx.urls")),
]
diff --git a/exporter/middleware.py b/data_registry/middleware.py
similarity index 100%
rename from exporter/middleware.py
rename to data_registry/middleware.py
diff --git a/data_registry/templates/includes/files.html b/data_registry/templates/includes/files.html
index 8859771b..dbd26438 100644
--- a/data_registry/templates/includes/files.html
+++ b/data_registry/templates/includes/files.html
@@ -4,13 +4,13 @@
{% if files.full %}
-
- {% translate "All time" %}
+ {% translate "All time" %}
{{ files.full|humanfilesize }}
{% endif %}
{% for file in files.by_year|dictsortreversed:"year" %}
-
- {{ file.year }}
+ {{ file.year }}
{{ file.size|humanfilesize }}
{% empty %}
diff --git a/data_registry/urls.py b/data_registry/urls.py
index 7e9b21c1..2d6ac3de 100644
--- a/data_registry/urls.py
+++ b/data_registry/urls.py
@@ -6,6 +6,7 @@
path("", views.index, name="index"),
path("search/", views.search, name="search"),
path("publication/", views.detail, name="detail"),
+ path("publication//download", views.download_export, name="download"),
# https://code.djangoproject.com/ticket/26556
path("i18n/setlang/", i18n.set_language, name="set-language"),
# Uncomment after re-integrating Spoonbill.
diff --git a/data_registry/views.py b/data_registry/views.py
index b5bbe736..b991ca7d 100644
--- a/data_registry/views.py
+++ b/data_registry/views.py
@@ -1,4 +1,5 @@
import logging
+import re
import string
from collections import defaultdict
from datetime import date, datetime, timedelta
@@ -11,20 +12,21 @@
from django.contrib.postgres.aggregates import ArrayAgg
from django.db.models import Count, OuterRef, Q, Subquery
from django.db.models.functions import Substr
-from django.http.response import HttpResponse, JsonResponse
+from django.http.response import FileResponse, HttpResponse, HttpResponseBadRequest, HttpResponseNotFound, JsonResponse
from django.shortcuts import get_object_or_404, redirect, render
from django.utils.translation import get_language, get_language_from_request
from django.utils.translation import gettext as _
from data_registry.models import Collection, Job
from data_registry.util import collection_queryset
-from exporter.util import Export
+from exporter.util import Export, TaskStatus
logger = logging.getLogger(__name__)
alphabets = defaultdict(lambda: string.ascii_uppercase)
# https://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode#Basic_Cyrillic_alphabet
alphabets["ru"] = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
+EXPORT_PATTERN = re.compile(r"\A(full|\d{4})\.(jsonl\.gz|csv\.tar\.gz|xlsx)\Z")
def index(request):
@@ -174,6 +176,30 @@ def spiders(request):
return JsonResponse(json.get("spiders"), safe=False)
+def download_export(request, id):
+ name = request.GET.get("name")
+
+ # Guard against path traversal.
+ if not EXPORT_PATTERN.match(name):
+ return HttpResponseBadRequest("The name query string parameter is invalid")
+
+ collection = get_object_or_404(collection_queryset(request), id=id)
+ active_job = get_object_or_404(collection.job, active=True)
+
+ export = Export(active_job.id, basename=name)
+ if export.status != TaskStatus.COMPLETED:
+ return HttpResponseNotFound("File not found")
+
+ return FileResponse(
+ export.path.open("rb"),
+ as_attachment=True,
+ filename=f"{collection.source_id}_{name}",
+ # Set Content-Encoding to skip GZipMiddleware. (ContentEncodingMiddleware removes the empty header.)
+ # https://docs.djangoproject.com/en/4.2/ref/middleware/#module-django.middleware.gzip
+ headers={"Content-Encoding": ""},
+ )
+
+
def excel_data(request, job_id, job_range=None):
job = Job.objects.get(id=job_id)
export = Export(job_id)
diff --git a/exporter/apps.py b/exporter/apps.py
deleted file mode 100644
index e8fed610..00000000
--- a/exporter/apps.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from django.apps import AppConfig
-
-
-class ExporterConfig(AppConfig):
- name = "exporter"
diff --git a/exporter/urls.py b/exporter/urls.py
deleted file mode 100644
index 96e067fd..00000000
--- a/exporter/urls.py
+++ /dev/null
@@ -1,7 +0,0 @@
-from django.urls import path
-
-from exporter import views
-
-urlpatterns = [
- path("api/download_export", views.download_export, name="download-export"),
-]
diff --git a/exporter/views.py b/exporter/views.py
deleted file mode 100644
index 4595a9b1..00000000
--- a/exporter/views.py
+++ /dev/null
@@ -1,34 +0,0 @@
-from django.http import FileResponse
-from django.http.response import HttpResponseBadRequest, HttpResponseNotFound
-
-from exporter.util import Export, TaskStatus
-
-
-def download_export(request):
- """
- Returns an exported file as a FileResponse object.
- """
- job_id = int(request.GET.get("job_id") or 0) # guard against path traversal
- full = request.GET.get("full")
- year = int(request.GET.get("year") or 0) # guard against path traversal
- suffix = request.GET.get("suffix")
- spider = request.GET.get("spider")
-
- # Guard against path traversal.
- if suffix not in ("jsonl.gz", "csv.tar.gz", "xlsx"):
- return HttpResponseBadRequest("Suffix not recognized")
-
- stem = "full" if full else year
- export = Export(job_id, basename=f"{stem}.{suffix}")
-
- if export.status != TaskStatus.COMPLETED:
- return HttpResponseNotFound("File not found")
-
- return FileResponse(
- export.path.open("rb"),
- as_attachment=True,
- filename=f"{spider}_{stem}.{suffix}",
- # Set Content-Encoding to skip GZipMiddleware. (ContentEncodingMiddleware removes the empty header.)
- # https://docs.djangoproject.com/en/4.2/ref/middleware/#module-django.middleware.gzip
- headers={"Content-Encoding": ""},
- )
diff --git a/tests/data_registry/test_views.py b/tests/data_registry/test_views.py
index 2709d41a..6112406a 100644
--- a/tests/data_registry/test_views.py
+++ b/tests/data_registry/test_views.py
@@ -21,7 +21,7 @@ def setUpTestData(cls):
@patch("exporter.util.Export.get_files")
def test_detail(self, get_files):
get_files.return_value = {"jsonl": {"by_year": [{"year": 2022, "size": 1}]}}
- url = f"/api/download_export?spider=paraguay_dncp_records&job_id={self.job.id}&year=2022&suffix=jsonl.gz"
+ url = f"/en/publication/{self.collection.id}/download?name=2022.jsonl.gz"
with self.assertNumQueries(2):
response = Client().get(f"/en/publication/{self.collection.id}")
diff --git a/tests/exporter/test_views.py b/tests/exporter/test_views.py
index d37e26e0..4b3bd086 100644
--- a/tests/exporter/test_views.py
+++ b/tests/exporter/test_views.py
@@ -3,36 +3,64 @@
from django.test import Client, TestCase, override_settings
+from data_registry.models import Collection
+
@override_settings(EXPORTER_DIR=os.path.join("tests", "fixtures"))
class ViewsTests(TestCase):
+ @classmethod
+ def setUp(cls):
+ cls.collection = Collection.objects.create(
+ id=2,
+ title="Dirección Nacional de Contrataciones Públicas (DNCP)",
+ source_id="abc",
+ public=True,
+ )
+ cls.job = cls.collection.job.create(
+ active=True,
+ id=2,
+ )
+ cls.collection_no_job = Collection.objects.create(
+ id=3,
+ title="Test",
+ source_id="abc",
+ public=True,
+ )
+ cls.collection_no_job.job.create(
+ active=True,
+ id=4,
+ )
+
+ def test_collection_not_found(self):
+ with self.assertNumQueries(1):
+ response = Client().get("/en/publication/10/download?name=2000.jsonl.gz")
+
+ self.assertEqual(response.status_code, 404)
+
def test_download_export_invalid_suffix(self):
with self.assertNumQueries(0):
- response = Client().get("/api/download_export?suffix=invalid")
+ response = Client().get(f"/en/publication/{self.collection.id}/download?name=invalid")
self.assertEqual(response.status_code, 400)
- self.assertEqual(response.content, b"Suffix not recognized")
+ self.assertEqual(response.content, b"The name query string parameter is invalid")
def test_download_export_empty_parameter(self):
- for parameter in ("job_id", "year"):
- with self.subTest(parameter=parameter):
- with self.assertNumQueries(0):
- response = Client().get(f"/api/download_export?suffix=jsonl.gz&{parameter}=")
+ with self.assertNumQueries(0):
+ response = Client().get(f"/en/publication/{self.collection.id}/download?name=")
- self.assertEqual(response.status_code, 404)
- self.assertEqual(response.content, b"File not found")
+ self.assertEqual(response.status_code, 400)
+ self.assertEqual(response.content, b"The name query string parameter is invalid")
def test_download_export_waiting(self):
- with self.assertNumQueries(0):
- response = Client().get("/api/download_export?suffix=jsonl.gz&year=2000&job_id=0")
-
+ with self.assertNumQueries(2):
+ response = Client().get(f"/en/publication/{self.collection_no_job.id}/download?name=2000.jsonl.gz")
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b"File not found")
@patch("exporter.util.Export.lockfile", new_callable=PropertyMock)
def test_download_export_running(self, exists):
- with self.assertNumQueries(0):
- response = Client().get("/api/download_export?suffix=jsonl.gz&year=2000&job_id=1")
+ with self.assertNumQueries(2):
+ response = Client().get(f"/en/publication/{self.collection.id}/download?name=2000.jsonl.gz")
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b"File not found")
@@ -44,12 +72,11 @@ def test_download_export_completed(self):
("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
):
with self.subTest(suffix=suffix):
- with self.assertNumQueries(0):
+ with self.assertNumQueries(2):
response = Client().get(
- f"/api/download_export?suffix={suffix}&year=2000&job_id=1&spider=abc",
+ f"/en/publication/{self.collection.id}/download?name=2000.{suffix}",
HTTP_ACCEPT_ENCODING="gzip",
)
-
self.assertEqual(response.status_code, 200)
response.headers.pop("Content-Length")
self.assertDictEqual(
@@ -60,10 +87,10 @@ def test_download_export_completed(self):
"Content-Type": content_type,
"Cross-Origin-Opener-Policy": "same-origin",
"Referrer-Policy": "same-origin",
- "Vary": "Accept-Language",
+ "Vary": "Cookie",
"X-Content-Type-Options": "nosniff",
"X-Frame-Options": "DENY",
},
)
- with open(os.path.join("tests", "fixtures", "1", f"2000.{suffix}"), "rb") as f:
+ with open(os.path.join("tests", "fixtures", "2", f"2000.{suffix}"), "rb") as f:
self.assertEqual(b"".join(response.streaming_content), f.read())
diff --git a/tests/fixtures/1/2000.csv.tar.gz b/tests/fixtures/2/2000.csv.tar.gz
similarity index 100%
rename from tests/fixtures/1/2000.csv.tar.gz
rename to tests/fixtures/2/2000.csv.tar.gz
diff --git a/tests/fixtures/1/2000.jsonl.gz b/tests/fixtures/2/2000.jsonl.gz
similarity index 100%
rename from tests/fixtures/1/2000.jsonl.gz
rename to tests/fixtures/2/2000.jsonl.gz
diff --git a/tests/fixtures/1/2000.xlsx b/tests/fixtures/2/2000.xlsx
similarity index 100%
rename from tests/fixtures/1/2000.xlsx
rename to tests/fixtures/2/2000.xlsx