From 40e2790fcd2b10dde7cf402f83f108db716376ac Mon Sep 17 00:00:00 2001
From: Dan LaManna
Date: Wed, 5 Feb 2025 12:43:52 -0500
Subject: [PATCH] Add feature for frozen DOI bundles

---
NOTE(review): this patch was reformatted from a whitespace-collapsed copy.
The indentation of unchanged context lines is reconstructed and should be
checked against the tree. The template hunk appears to have lost its HTML
markup (both branches read the same and only two of its six context lines
survive), so restore that hunk from the original commit before applying.
The blob ids on the doi.py, tasks.py and test_doi.py index lines predate
the changes made during review.

 isic/core/admin.py                            |  2 +-
 isic/core/migrations/0005_doi_bundle.py       | 17 ++++
 isic/core/models/doi.py                       |  2 +
 isic/core/services/collection/doi.py          | 78 ++++++++++++++++++-
 isic/core/tasks.py                            | 12 +++
 .../partials/collection_detail_actions.html   |  5 +
 isic/core/tests/test_doi.py                   | 45 ++++++++++-
 7 files changed, 157 insertions(+), 4 deletions(-)
 create mode 100644 isic/core/migrations/0005_doi_bundle.py

diff --git a/isic/core/admin.py b/isic/core/admin.py
index 9f2e948b..d20a1b58 100644
--- a/isic/core/admin.py
+++ b/isic/core/admin.py
@@ -201,4 +201,4 @@ class CollectionAdmin(StaffReadonlyAdmin):
 @admin.register(Doi)
 class DoiAdmin(StaffReadonlyAdmin):
     list_select_related = ["collection"]
-    list_display = ["id", "url", "collection"]
+    list_display = ["id", "url", "collection", "bundle"]
diff --git a/isic/core/migrations/0005_doi_bundle.py b/isic/core/migrations/0005_doi_bundle.py
new file mode 100644
index 00000000..65baa584
--- /dev/null
+++ b/isic/core/migrations/0005_doi_bundle.py
@@ -0,0 +1,17 @@
+# Generated by Django 5.1.5 on 2025-02-05 17:43
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("core", "0004_alter_collection_description"),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name="doi",
+            name="bundle",
+            field=models.FileField(blank=True, null=True, upload_to="doi-bundles/"),
+        ),
+    ]
diff --git a/isic/core/models/doi.py b/isic/core/models/doi.py
index 2f7adcd3..8cb24721 100644
--- a/isic/core/models/doi.py
+++ b/isic/core/models/doi.py
@@ -13,5 +13,7 @@ class Doi(TimeStampedModel):
 
     url = models.CharField(max_length=200)
 
+    bundle = models.FileField(upload_to="doi-bundles/", null=True, blank=True)
+
     def __str__(self):
         return self.id
diff --git a/isic/core/services/collection/doi.py b/isic/core/services/collection/doi.py
index d59f0fe2..7a113342 100644
--- a/isic/core/services/collection/doi.py
+++ b/isic/core/services/collection/doi.py
@@ -1,22 +1,31 @@
+import csv
 import logging
+from pathlib import Path
 import random
+import tempfile
 from typing import Any
 from urllib import parse
+import zipfile
 
 from django.conf import settings
 from django.contrib.auth.models import User
 from django.core.exceptions import ValidationError
-from django.db import transaction
+from django.core.files import File
+from django.db import connection, transaction
+from django.template.loader import render_to_string
 import requests
 from requests.exceptions import HTTPError
 
 from isic.core.models.collection import Collection
 from isic.core.models.doi import Doi
+from isic.core.services import image_metadata_csv
 from isic.core.services.collection import (
     collection_get_creators_in_attribution_order,
     collection_lock,
     collection_update,
 )
+from isic.core.tasks import create_doi_bundle_task
+from isic.zip_download.api import get_attributions
 
 logger = logging.getLogger(__name__)
 
@@ -145,6 +154,73 @@ def collection_create_doi(*, user: User, collection: Collection) -> Doi:
     # retry to publish it. (May want a django-admin action for this if it ever happens.)
     _datacite_update_doi(doi_dict, doi_id)
 
+    create_doi_bundle_task.delay_on_commit(doi_id)
+
     logger.info("User %d created DOI %s for collection %d", user.id, doi.id, collection.id)
 
     return doi
+
+
+def collection_create_doi_bundle(*, doi: Doi) -> None:
+    """
+    Create a frozen bundle of the collection associated with the DOI.
+
+    The bundle is a zip archive holding every image in the collection, a metadata CSV,
+    the text of each license in use, and an attribution file. It is stored on
+    ``doi.bundle``.
+
+    This contains a lot of overlapping logic with the collection_download_metadata view
+    and the zip_file_listing view.
+    """
+    collection = Collection.objects.select_related("doi").get(doi=doi)
+    bundle_filename = f"ISIC-Collection-{doi.id.split('/')[1]}.zip"
+
+    # Build the archive in a private temporary directory so that a failure part way
+    # through never leaves a stray zip behind in the worker's current directory.
+    with tempfile.TemporaryDirectory() as bundle_dir, transaction.atomic():
+        bundle_path = Path(bundle_dir) / bundle_filename
+
+        # Read everything below from a single snapshot so the images, metadata, licenses
+        # and attributions in the bundle all describe the same state of the collection.
+        with connection.cursor() as cursor:
+            cursor.execute("SET TRANSACTION ISOLATION LEVEL REPEATABLE READ")
+
+        images = collection.images.select_related("accession").all()
+
+        with zipfile.ZipFile(bundle_path, "w") as bundle:
+            for image in images.iterator():
+                with image.accession.blob.open("rb") as blob:
+                    bundle.writestr(f"images/{image.isic_id}.jpg", blob.read())
+
+            with tempfile.NamedTemporaryFile("w") as metadata_file:
+                collection_metadata = image_metadata_csv(qs=images)
+                # The first item yielded is used as the CSV header, the rest are rows.
+                writer = csv.DictWriter(metadata_file, fieldnames=next(collection_metadata))
+                writer.writeheader()
+                for row in collection_metadata:
+                    assert isinstance(row, dict)  # noqa: S101
+                    writer.writerow(row)
+                # Flush so the zip writer, which re-reads the file by name, sees every row.
+                metadata_file.flush()
+
+                bundle.write(metadata_file.name, "metadata.csv")
+
+            for license_ in (
+                images.values_list("accession__copyright_license", flat=True)
+                .order_by()
+                .distinct()
+            ):
+                bundle.writestr(
+                    f"licenses/{license_}.txt",
+                    render_to_string(f"zip_download/{license_}.txt"),
+                )
+
+            attributions = get_attributions(
+                images.values_list("accession__cohort__attribution", flat=True)
+            )
+            bundle.writestr("attribution.txt", "\n\n".join(attributions))
+
+        with bundle_path.open("rb") as bundle_file:
+            # Name the file explicitly; otherwise it would inherit the temporary path.
+            doi.bundle = File(bundle_file, name=bundle_filename)
+            doi.save()
diff --git a/isic/core/tasks.py b/isic/core/tasks.py
index 91a60ff3..83a3113f 100644
--- a/isic/core/tasks.py
+++ b/isic/core/tasks.py
@@ -19,6 +19,7 @@
 from urllib3.exceptions import ConnectionError, TimeoutError
 
 from isic.core.models.collection import Collection
+from isic.core.models.doi import Doi
 from isic.core.models.image import Image
 from isic.core.search import bulk_add_to_search_index
 from isic.core.serializers import SearchQueryIn
@@ -127,6 +128,17 @@ def generate_staff_image_list_metadata_csv(user_id: int) -> None:
     Path(f.name).unlink()
 
 
+@shared_task(soft_time_limit=60 * 60 * 12, time_limit=(60 * 60 * 12) + 60)
+def create_doi_bundle_task(doi_id: str) -> None:
+    """Build and store the frozen bundle for the DOI identified by ``doi_id``."""
+    # Imported lazily: the DOI service module imports this task at module level.
+    from isic.core.services.collection.doi import collection_create_doi_bundle
+
+    doi = Doi.objects.get(id=doi_id)
+
+    collection_create_doi_bundle(doi=doi)
+
+
 @shared_task(soft_time_limit=10, time_limit=15)
 def prune_expired_oauth_tokens():
     clear_expired_oauth_tokens()
diff --git a/isic/core/templates/core/partials/collection_detail_actions.html b/isic/core/templates/core/partials/collection_detail_actions.html
index 03f05764..fac3bc90 100644
--- a/isic/core/templates/core/partials/collection_detail_actions.html
+++ b/isic/core/templates/core/partials/collection_detail_actions.html
@@ -45,6 +45,11 @@
 {% endif %}
 
 
+{% if collection.doi.bundle %}
+Download Original Collection
+{% else %}
+Download Original Collection
+{% endif %}
 
 Download Collection
 
diff --git a/isic/core/tests/test_doi.py b/isic/core/tests/test_doi.py
index 3ddf2c1e..7191f151 100644
--- a/isic/core/tests/test_doi.py
+++ b/isic/core/tests/test_doi.py
@@ -1,9 +1,17 @@
+from pathlib import Path
+import tempfile
+import zipfile
+
 import pytest
 
 from isic.core.forms.doi import CreateDoiForm
 from isic.core.models.doi import Doi
 from isic.core.models.image import Image
-from isic.core.services.collection.doi import collection_build_doi, collection_create_doi
+from isic.core.services.collection.doi import (
+    collection_build_doi,
+    collection_create_doi,
+    collection_create_doi_bundle,
+)
 
 
 @pytest.fixture()
@@ -28,7 +36,7 @@ def staff_user_request(staff_user, mocker):
     return mocker.MagicMock(user=staff_user)
 
 
-@pytest.mark.django_db()
+@pytest.mark.django_db(transaction=True)
 def test_collection_create_doi(
     public_collection_with_public_images,
     staff_user,
@@ -40,6 +48,7 @@ def test_collection_create_doi(
     public_collection_with_public_images.refresh_from_db()
     assert public_collection_with_public_images.locked
     assert public_collection_with_public_images.doi
+    assert public_collection_with_public_images.doi.bundle
     assert public_collection_with_public_images.doi.creator == staff_user
     mock_datacite_create_doi.assert_called_once()
     mock_datacite_update_doi.assert_called_once()
@@ -179,3 +188,35 @@ def test_doi_creators_collapse_repeated_creators(collection_with_repeated_creato
     assert creators[1]["name"] == cohort_b.attribution
 
     assert len(creators) == 2
+
+
+@pytest.mark.django_db(transaction=True)
+def test_doi_bundle_contains_expected_files(
+    image_factory,
+    collection_factory,
+    staff_user,
+    mock_datacite_create_doi,
+    mock_datacite_update_doi,
+):
+    collection = collection_factory(public=True)
+    images = [image_factory(public=True) for _ in range(3)]
+    collection.images.set(images)
+
+    doi = collection_create_doi(user=staff_user, collection=collection)
+
+    collection_create_doi_bundle(doi=doi)
+
+    with tempfile.TemporaryDirectory() as temp_dir, zipfile.ZipFile(doi.bundle) as zf:
+        zf.extractall(temp_dir)
+
+        for image in images:
+            image_path = f"images/{image.isic_id}.jpg"
+            assert (Path(temp_dir) / image_path).exists()
+
+        assert (Path(temp_dir) / "metadata.csv").exists()
+
+        licenses = {image.accession.copyright_license for image in images}
+        for license_ in licenses:
+            assert (Path(temp_dir) / f"licenses/{license_}.txt").exists()
+
+        assert (Path(temp_dir) / "attribution.txt").exists()