From 0c25626e38b02cd73a589ae8ef88c01adcec275b Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 15 Aug 2023 15:36:43 +0200 Subject: [PATCH 1/2] Use the stored checksum algorithm --- src/scitacean/client.py | 17 ++++++++++------- src/scitacean/datablock.py | 6 ++---- tests/download_test.py | 30 +++++++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/scitacean/client.py b/src/scitacean/client.py index 0af52cd6..50506095 100644 --- a/src/scitacean/client.py +++ b/src/scitacean/client.py @@ -317,12 +317,13 @@ def download_files( - A **Callable[File]**: if this callable returns ``True`` for ``f`` checksum_algorithm: Select an algorithm for computing file checksums. - This argument will be removed when the next SciCat version - has been released. + The data block normally determines the algorithm, but this argument can + override the stored value if the data block is broken + or no algorithm has been set. force: If ``True``, download files regardless of whether they already exist locally. - This bypasses the chekcsum computation of pre-existing local files. + This bypasses the checksum computation of pre-existing local files. Returns ------- @@ -861,13 +862,15 @@ def _select_files(select: FileSelector, dataset: Dataset) -> List[File]: def _remove_up_to_date_local_files( files: List[File], checksum_algorithm: Optional[str] ) -> List[File]: + def is_up_to_date(file: File) -> bool: + if checksum_algorithm is not None: + file = dataclasses.replace(file, checksum_algorithm=checksum_algorithm) + return file.local_is_up_to_date() + return [ file for file in files if not ( - file.local_path.exists() # type: ignore[union-attr] - and dataclasses.replace( - file, checksum_algorithm=checksum_algorithm - ).local_is_up_to_date() + file.local_path.exists() and is_up_to_date(file) # type: ignore[union-attr] ) ] diff --git a/src/scitacean/datablock.py b/src/scitacean/datablock.py index 5d7cf2c8..b4aa9a5b 100644 --- a/src/scitacean/datablock.py +++ b/src/scitacean/datablock.py @@ -64,7 +64,7 @@ def from_download_model( """ dblock = orig_datablock_model return OrigDatablock( - checksum_algorithm=None, + checksum_algorithm=dblock.chkAlg, owner_group=dblock.ownerGroup, access_groups=dblock.accessGroups, instrument_group=dblock.instrumentGroup, @@ -75,9 +75,7 @@ def from_download_model( _updated_at=dblock.updatedAt, _updated_by=dblock.updatedBy, init_files=[ - File.from_download_model( - file, checksum_algorithm=orig_datablock_model.chkAlg - ) + File.from_download_model(file, checksum_algorithm=dblock.chkAlg) for file in dblock.dataFileList or () ], ) diff --git a/tests/download_test.py b/tests/download_test.py index 453b20e8..34b85bd2 100644 --- a/tests/download_test.py +++ b/tests/download_test.py @@ -273,7 +273,6 @@ def test_download_files_detects_bad_size(fs, dataset_and_files, caplog): assert "89412" in caplog.text -@pytest.mark.skip("Checksum algorithm not yet supported by datablocks") def test_download_does_not_download_up_to_date_file(fs, dataset_and_files): # Ensure the file exists locally dataset, contents = dataset_and_files @@ -328,6 +327,35 @@ def connect_for_download(self): assert all(file.local_path is not None for file in downloaded.files) +def test_override_datablock_checksum(fs, dataset_and_files): + # Ensure the file exists locally + dataset, contents = dataset_and_files + client = Client.without_login( + url="/", file_transfer=FakeFileTransfer(fs=fs, files=contents) + ) + client.download_files(dataset, target="./download", select=True) + + # Downloads the same file again when the checksum algorithm is wrong. + class RaisingDownloader(FakeFileTransfer): + source_dir = "/" + + @contextmanager + def connect_for_download(self): + raise RuntimeError("Download disabled") + + client = Client.without_login( + url="/", + file_transfer=RaisingDownloader(fs=fs), + ) + with pytest.raises(RuntimeError, match="Download disabled"): + client.download_files( + dataset, + target="./download", + select=True, + checksum_algorithm="sha256", # The datablock uses md5 + ) + + def test_force_file_download(fs, dataset_and_files): # Ensure the file exists locally dataset, contents = dataset_and_files From 5358a2fd33c3471114ec9e1ea8e34f92c505a13f Mon Sep 17 00:00:00 2001 From: Jan-Lukas Wynen Date: Tue, 15 Aug 2023 15:37:41 +0200 Subject: [PATCH 2/2] Add release note --- docs/release-notes.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/docs/release-notes.rst b/docs/release-notes.rst index 7eba3b05..2179e946 100644 --- a/docs/release-notes.rst +++ b/docs/release-notes.rst @@ -32,6 +32,33 @@ Release notes Stability, Maintainability, and Testing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +vYY.0M.MICRO (Unreleased) +------------------------- + +Security +~~~~~~~~ + +Features +~~~~~~~~ + +* File downloads now use the checksum algorithm stored in SciCat when possible. + So it is no longer necessary to specify it by hand in many cases. + +Breaking changes +~~~~~~~~~~~~~~~~ + +Bugfixes +~~~~~~~~ + +Documentation +~~~~~~~~~~~~~ + +Deprecations +~~~~~~~~~~~~ + +Stability, Maintainability, and Testing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + v23.07.0 (2023-07-07) ---------------------