Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

232 export files improvements #314

Merged
merged 6 commits into from
Nov 10, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,12 @@
"django.contrib.humanize",
"data_registry",
"markdownx",
"exporter",
]

MIDDLEWARE = [
"django.middleware.security.SecurityMiddleware",
# Add before GZipMiddleware to modify its response.
"exporter.middleware.ContentEncodingMiddleware",
"data_registry.middleware.ContentEncodingMiddleware",
# This site is not affected by BREACH.
# https://docs.djangoproject.com/en/4.2/ref/middleware/#django.middleware.gzip.GZipMiddleware
"django.middleware.gzip.GZipMiddleware",
Expand Down
1 change: 0 additions & 1 deletion core/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

urlpatterns = [
path("", include("data_registry.urls"), name="data-registry"),
path("", include("exporter.urls"), name="exporter"),
path("admin/", admin.site.urls),
path("markdownx/", include("markdownx.urls")),
]
Expand Down
File renamed without changes.
4 changes: 2 additions & 2 deletions data_registry/templates/includes/files.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
<ul>
{% if files.full %}
<li>
<a href="{% url 'download-export' %}?spider={{ collection.source_id }}&job_id={{ job.id }}&full=true&suffix={{ suffix }}" rel="nofollow" download>{% translate "All time" %}</a>
<a href="{% url 'download' collection.id %}?name=full.{{ suffix }}" rel="nofollow" download>{% translate "All time" %}</a>
<span class="text-muted small">{{ files.full|humanfilesize }}</span>
</li>
{% endif %}
{% for file in files.by_year|dictsortreversed:"year" %}
<li>
<a href="{% url 'download-export' %}?spider={{ collection.source_id }}&job_id={{ job.id }}&year={{ file.year }}&suffix={{ suffix }}" rel="nofollow" download>{{ file.year }}</a>
<a href="{% url 'download' collection.id %}?name={{ file.year }}.{{ suffix }}" rel="nofollow" download>{{ file.year }}</a>
<span class="text-muted small">{{ file.size|humanfilesize }}</span>
</li>
{% empty %}
Expand Down
1 change: 1 addition & 0 deletions data_registry/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
path("", views.index, name="index"),
path("search/", views.search, name="search"),
path("publication/<int:id>", views.detail, name="detail"),
path("publication/<int:id>/download", views.download_export, name="download"),
# https://code.djangoproject.com/ticket/26556
path("i18n/setlang/", i18n.set_language, name="set-language"),
# Uncomment after re-integrating Spoonbill.
Expand Down
30 changes: 28 additions & 2 deletions data_registry/views.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import re
import string
from collections import defaultdict
from datetime import date, datetime, timedelta
Expand All @@ -11,20 +12,21 @@
from django.contrib.postgres.aggregates import ArrayAgg
from django.db.models import Count, OuterRef, Q, Subquery
from django.db.models.functions import Substr
from django.http.response import HttpResponse, JsonResponse
from django.http.response import FileResponse, HttpResponse, HttpResponseBadRequest, HttpResponseNotFound, JsonResponse
from django.shortcuts import get_object_or_404, redirect, render
from django.utils.translation import get_language, get_language_from_request
from django.utils.translation import gettext as _

from data_registry.models import Collection, Job
from data_registry.util import collection_queryset
from exporter.util import Export
from exporter.util import Export, TaskStatus

logger = logging.getLogger(__name__)

alphabets = defaultdict(lambda: string.ascii_uppercase)
# https://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode#Basic_Cyrillic_alphabet
alphabets["ru"] = "АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ"
EXPORT_PATTERN = re.compile(r"\A(full|\d{4})\.(jsonl\.gz|csv\.tar\.gz|xlsx)\Z")


def index(request):
Expand Down Expand Up @@ -174,6 +176,30 @@ def spiders(request):
return JsonResponse(json.get("spiders"), safe=False)


def download_export(request, id):
    """
    Serve an export file of the active job of the collection with the given ID.

    The file is selected by the ``name`` query string parameter, which must match ``EXPORT_PATTERN``.

    :param request: the incoming HTTP request
    :param id: the primary key of the collection (publication)
    :returns: a 400 response if ``name`` is missing or invalid, a 404 response if the collection or its active
        job doesn't exist or the export isn't completed, otherwise the file as an attachment
    """
    # Default to an empty string: a missing parameter would otherwise be None, and Pattern.match(None) raises
    # TypeError, which would surface as a 500 error instead of a 400 error.
    name = request.GET.get("name", "")

    # Guard against path traversal.
    if not EXPORT_PATTERN.match(name):
        return HttpResponseBadRequest("The name query string parameter is invalid")

    collection = get_object_or_404(collection_queryset(request), id=id)
    active_job = get_object_or_404(collection.job, active=True)

    export = Export(active_job.id, basename=name)
    if export.status != TaskStatus.COMPLETED:
        return HttpResponseNotFound("File not found")

    # FileResponse takes ownership of the open file and closes it when the response is closed.
    return FileResponse(
        export.path.open("rb"),
        as_attachment=True,
        filename=f"{collection.source_id}_{name}",
        # Set Content-Encoding to skip GZipMiddleware. (ContentEncodingMiddleware removes the empty header.)
        # https://docs.djangoproject.com/en/4.2/ref/middleware/#module-django.middleware.gzip
        headers={"Content-Encoding": ""},
    )


def excel_data(request, job_id, job_range=None):
job = Job.objects.get(id=job_id)
export = Export(job_id)
Expand Down
5 changes: 0 additions & 5 deletions exporter/apps.py

This file was deleted.

7 changes: 0 additions & 7 deletions exporter/urls.py

This file was deleted.

34 changes: 0 additions & 34 deletions exporter/views.py

This file was deleted.

2 changes: 1 addition & 1 deletion tests/data_registry/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def setUpTestData(cls):
@patch("exporter.util.Export.get_files")
def test_detail(self, get_files):
get_files.return_value = {"jsonl": {"by_year": [{"year": 2022, "size": 1}]}}
url = f"/api/download_export?spider=paraguay_dncp_records&job_id={self.job.id}&year=2022&suffix=jsonl.gz"
url = f"/en/publication/{self.collection.id}/download?name=2022.jsonl.gz"

with self.assertNumQueries(2):
response = Client().get(f"/en/publication/{self.collection.id}")
Expand Down
63 changes: 45 additions & 18 deletions tests/exporter/test_views.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,36 +3,64 @@

from django.test import Client, TestCase, override_settings

from data_registry.models import Collection


@override_settings(EXPORTER_DIR=os.path.join("tests", "fixtures"))
class ViewsTests(TestCase):
def setUp(self):
    """
    Create the collections and active jobs that the download tests request.

    unittest calls ``setUp`` on the test instance before each test method, so this is a plain instance method
    rather than a classmethod; the objects are stored on the instance instead of being rewritten on the class
    before every test.
    """
    self.collection = Collection.objects.create(
        id=2,
        title="Dirección Nacional de Contrataciones Públicas (DNCP)",
        source_id="abc",
        public=True,
    )
    self.job = self.collection.job.create(
        active=True,
        id=2,
    )
    # NOTE(review): despite its name, this collection does get an active job (id=4); presumably there are no
    # export fixtures for that job ID, so its downloads are reported as not found - confirm against the fixtures.
    self.collection_no_job = Collection.objects.create(
        id=3,
        title="Test",
        source_id="abc",
        public=True,
    )
    self.collection_no_job.job.create(
        active=True,
        id=4,
    )

def test_collection_not_found(self):
    """Requesting a download for an unknown publication ID responds with 404 after a single query."""
    client = Client()
    url = "/en/publication/10/download?name=2000.jsonl.gz"

    with self.assertNumQueries(1):
        response = client.get(url)

    self.assertEqual(response.status_code, 404)

def test_download_export_invalid_suffix(self):
with self.assertNumQueries(0):
response = Client().get("/api/download_export?suffix=invalid")
response = Client().get(f"/en/publication/{self.collection.id}/download?name=invalid")

self.assertEqual(response.status_code, 400)
self.assertEqual(response.content, b"Suffix not recognized")
self.assertEqual(response.content, b"The name query string parameter is invalid")

def test_download_export_empty_parameter(self):
for parameter in ("job_id", "year"):
with self.subTest(parameter=parameter):
with self.assertNumQueries(0):
response = Client().get(f"/api/download_export?suffix=jsonl.gz&{parameter}=")
with self.assertNumQueries(0):
response = Client().get(f"/en/publication/{self.collection.id}/download?name=")

self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b"File not found")
self.assertEqual(response.status_code, 400)
self.assertEqual(response.content, b"The name query string parameter is invalid")

def test_download_export_waiting(self):
with self.assertNumQueries(0):
response = Client().get("/api/download_export?suffix=jsonl.gz&year=2000&job_id=0")

with self.assertNumQueries(2):
response = Client().get(f"/en/publication/{self.collection_no_job.id}/download?name=2000.jsonl.gz")
self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b"File not found")
jpmckinney marked this conversation as resolved.
Show resolved Hide resolved

@patch("exporter.util.Export.lockfile", new_callable=PropertyMock)
def test_download_export_running(self, exists):
with self.assertNumQueries(0):
response = Client().get("/api/download_export?suffix=jsonl.gz&year=2000&job_id=1")
with self.assertNumQueries(2):
response = Client().get(f"/en/publication/{self.collection.id}/download?name=2000.jsonl.gz")

self.assertEqual(response.status_code, 404)
self.assertEqual(response.content, b"File not found")
Expand All @@ -44,12 +72,11 @@ def test_download_export_completed(self):
("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
):
with self.subTest(suffix=suffix):
with self.assertNumQueries(0):
with self.assertNumQueries(2):
response = Client().get(
f"/api/download_export?suffix={suffix}&year=2000&job_id=1&spider=abc",
f"/en/publication/{self.collection.id}/download?name=2000.{suffix}",
HTTP_ACCEPT_ENCODING="gzip",
)

self.assertEqual(response.status_code, 200)
response.headers.pop("Content-Length")
self.assertDictEqual(
Expand All @@ -60,10 +87,10 @@ def test_download_export_completed(self):
"Content-Type": content_type,
"Cross-Origin-Opener-Policy": "same-origin",
"Referrer-Policy": "same-origin",
"Vary": "Accept-Language",
"Vary": "Cookie",
"X-Content-Type-Options": "nosniff",
"X-Frame-Options": "DENY",
},
)
with open(os.path.join("tests", "fixtures", "1", f"2000.{suffix}"), "rb") as f:
with open(os.path.join("tests", "fixtures", "2", f"2000.{suffix}"), "rb") as f:
self.assertEqual(b"".join(response.streaming_content), f.read())
File renamed without changes.
File renamed without changes.
File renamed without changes.