diff --git a/dandischema/digests/tests/test_zarr.py b/dandischema/digests/tests/test_zarr.py
deleted file mode 100644
index a560c1f..0000000
--- a/dandischema/digests/tests/test_zarr.py
+++ /dev/null
@@ -1,326 +0,0 @@
-from __future__ import annotations
-
-import pytest
-
-from dandischema.digests.zarr import (
-    ZarrChecksum,
-    ZarrChecksumListing,
-    ZarrChecksums,
-    ZarrJSONChecksumSerializer,
-    get_checksum,
-)
-
-
-def test_zarr_checksum_sort_order() -> None:
-    # The a < b in the path should take precedence over z > y in the checksum
-    a = ZarrChecksum(name="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", digest="z", size=1)
-    b = ZarrChecksum(name="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", digest="y", size=1)
-    assert sorted([b, a]) == [a, b]
-
-
-# ZarrChecksums tests
-
-
-def test_zarr_checkums_is_empty() -> None:
-    assert ZarrChecksums(directories=[], files=[]).is_empty
-    assert not ZarrChecksums(
-        directories=[ZarrChecksum(digest="checksum", name="name", size=1)], files=[]
-    ).is_empty
-    assert not ZarrChecksums(
-        directories=[], files=[ZarrChecksum(digest="checksum", name="name", size=1)]
-    ).is_empty
-
-
-a = ZarrChecksum(
-    name="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
-    digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
-    size=1,
-)
-b = ZarrChecksum(
-    name="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
-    digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb",
-    size=1,
-)
-c = ZarrChecksum(name="c", digest="c", size=1)
-
-
-@pytest.mark.parametrize(
-    ("initial", "new_checksums", "expected"),
-    [
-        ([], [], []),
-        ([a], [], [a]),
-        ([], [a], [a]),
-        ([a], [a], [a]),
-        ([b], [a], [a, b]),
-        ([a, c], [b], [a, b, c]),
-        ([b], [c, a], [a, b, c]),
-    ],
-)
-def test_zarr_checkums_add_file_checksums(
-    initial: list[ZarrChecksum],
-    new_checksums: list[ZarrChecksum],
-    expected: list[ZarrChecksum],
-) -> None:
-    checksums = ZarrChecksums(directories=[], files=initial)
-    checksums.add_file_checksums(new_checksums)
-    assert checksums.files == expected
-    assert checksums.directories == []
-
-
-@pytest.mark.parametrize(
-    ("initial", "new_checksums", "expected"),
-    [
-        ([], [], []),
-        ([a], [], [a]),
-        ([], [a], [a]),
-        ([a], [a], [a]),
-        ([b], [a], [a, b]),
-        ([a, c], [b], [a, b, c]),
-        ([b], [c, a], [a, b, c]),
-    ],
-)
-def test_zarr_checkums_add_directory_checksums(
-    initial: list[ZarrChecksum],
-    new_checksums: list[ZarrChecksum],
-    expected: list[ZarrChecksum],
-) -> None:
-    checksums = ZarrChecksums(directories=initial, files=[])
-    checksums.add_directory_checksums(new_checksums)
-    assert checksums.directories == expected
-    assert checksums.files == []
-
-
-@pytest.mark.parametrize(
-    (
-        "initial_files",
-        "initial_directories",
-        "removed_checksums",
-        "expected_files",
-        "expected_directories",
-    ),
-    [
-        ([], [], [], [], []),
-        ([a], [], ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], [], []),
-        ([], [a], ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], [], []),
-        ([a], [b], ["aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"], [], [b]),
-        ([a], [b], ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"], [a], []),
-        ([a, b, c], [], ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"], [a, c], []),
-        ([], [a, b, c], ["bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"], [], [a, c]),
-    ],
-)
-def test_zarr_checkums_remove_checksums(
-    initial_files: list[ZarrChecksum],
-    initial_directories: list[ZarrChecksum],
-    removed_checksums: list[str],
-    expected_files: list[ZarrChecksum],
-    expected_directories: list[ZarrChecksum],
-) -> None:
-    checksums = ZarrChecksums(files=initial_files, directories=initial_directories)
-    checksums.remove_checksums(removed_checksums)
-    assert checksums.files == expected_files
-    assert checksums.directories == expected_directories
-
-
-# ZarrJSONChecksumSerializer tests
-
-
-@pytest.mark.parametrize(
-    "file_checksums,directory_checksums,digest",
-    [
-        ([], [], "481a2f77ab786a0f45aafd5db0971caa-0--0"),
-        (
-            [
-                ZarrChecksum(
-                    name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1
-                )
-            ],
-            [],
-            "f21b9b4bf53d7ce1167bcfae76371e59-1--1",
-        ),
-        (
-            [],
-            [
-                ZarrChecksum(
-                    name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", size=1
-                )
-            ],
-            "ea8b8290b69b96422a3ed1cca0390f21-1--1",
-        ),
-        (
-            [
-                ZarrChecksum(
-                    name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1
-                ),
-                ZarrChecksum(
-                    name="baz", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", size=1
-                ),
-            ],
-            [],
-            "8e50add2b46d3a6389e2d9d0924227fb-2--2",
-        ),
-        (
-            [],
-            [
-                ZarrChecksum(
-                    name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", size=1
-                ),
-                ZarrChecksum(
-                    name="baz", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", size=1
-                ),
-            ],
-            "4c21a113688f925240549b14136d61ff-2--2",
-        ),
-        (
-            [
-                ZarrChecksum(
-                    name="baz", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1
-                )
-            ],
-            [
-                ZarrChecksum(
-                    name="bar", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", size=1
-                )
-            ],
-            "d5e4eb5dc8efdb54ff089db1eef34119-2--2",
-        ),
-    ],
-)
-def test_zarr_checksum_serializer_aggregate_digest(
-    file_checksums: list[ZarrChecksum],
-    directory_checksums: list[ZarrChecksum],
-    digest: str,
-) -> None:
-    serializer = ZarrJSONChecksumSerializer()
-    assert (
-        serializer.aggregate_digest(
-            ZarrChecksums(files=file_checksums, directories=directory_checksums)
-        )
-        == digest
-    )
-
-
-def test_zarr_checksum_serializer_generate_listing() -> None:
-    serializer = ZarrJSONChecksumSerializer()
-    checksums = ZarrChecksums(
-        files=[
-            ZarrChecksum(name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1)
-        ],
-        directories=[
-            ZarrChecksum(
-                name="baz", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", size=2
-            )
-        ],
-    )
-    assert serializer.generate_listing(checksums) == ZarrChecksumListing(
-        checksums=checksums,
-        digest="baf791d7bac84947c14739b1684ec5ab-2--3",
-        size=3,
-    )
-
-
-def test_zarr_serialize() -> None:
-    serializer = ZarrJSONChecksumSerializer()
-    assert (
-        serializer.serialize(
-            ZarrChecksumListing(
-                checksums=ZarrChecksums(
-                    files=[
-                        ZarrChecksum(
-                            name="bar",
-                            digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
-                            size=1,
-                        )
-                    ],
-                    directories=[
-                        ZarrChecksum(
-                            name="foo",
-                            digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2",
-                            size=2,
-                        )
-                    ],
-                ),
-                digest="cccccccccccccccccccccccccccccccc-2--3",
-                size=3,
-            )
-        )
-        == '{"checksums":{"directories":[{"digest":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2","name":"foo","size":2}],"files":[{"digest":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","name":"bar","size":1}]},"digest":"cccccccccccccccccccccccccccccccc-2--3","size":3}'  # noqa: E501
-    )
-
-
-def test_zarr_deserialize() -> None:
-    serializer = ZarrJSONChecksumSerializer()
-    assert serializer.deserialize(
-        '{"checksums":{"directories":[{"digest":"bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2","name":"foo","size":2}],"files":[{"digest":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa","name":"bar","size":1}]},"digest":"cccccccccccccccccccccccccccccccc-2--3","size":3}'  # noqa: E501
-    ) == ZarrChecksumListing(
-        checksums=ZarrChecksums(
-            files=[
-                ZarrChecksum(
-                    name="bar", digest="aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", size=1
-                )
-            ],
-            directories=[
-                ZarrChecksum(
-                    name="foo", digest="bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", size=2
-                )
-            ],
-        ),
-        digest="cccccccccccccccccccccccccccccccc-2--3",
-        size=3,
-    )
-
-
-@pytest.mark.parametrize(
-    "files,directories,checksum",
-    [
-        (
-            {"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)},
-            {},
-            "f21b9b4bf53d7ce1167bcfae76371e59-1--1",
-        ),
-        (
-            {},
-            {"bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1)},
-            "ea8b8290b69b96422a3ed1cca0390f21-1--1",
-        ),
-        (
-            {
-                "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1),
-                "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", 2),
-            },
-            {},
-            "4e67de4393d14c1e9c472438f0f1f8b1-2--3",
-        ),
-        (
-            {},
-            {
-                "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--1", 1),
-                "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2),
-            },
-            "859ca1926affe9c7d0424030f26fbd89-2--3",
-        ),
-        (
-            {},
-            {
-                "baz": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--1", 1),
-                "bar": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-1--2", 2),
-            },
-            "8f8361a286c9a7c3fbfd464e33989037-2--3",
-        ),
-        (
-            {"baz": ("aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", 1)},
-            {"bar": ("bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-1--2", 2)},
-            "3cb139f47d3a3580388f41956c15f55e-2--3",
-        ),
-    ],
-)
-def test_zarr_get_checksum(
-    files: dict[str, tuple[str, int]],
-    directories: dict[str, tuple[str, int]],
-    checksum: str,
-) -> None:
-    assert get_checksum(files=files, directories=directories) == checksum
-
-
-def test_zarr_get_checksum_empty() -> None:
-    with pytest.raises(ValueError):
-        get_checksum(files={}, directories={})
diff --git a/dandischema/digests/zarr.py b/dandischema/digests/zarr.py
deleted file mode 100644
index 3f0257f..0000000
--- a/dandischema/digests/zarr.py
+++ /dev/null
@@ -1,190 +0,0 @@
-from __future__ import annotations
-
-from functools import total_ordering
-import hashlib
-import json
-import re
-from typing import Dict, List, Optional, Tuple
-
-import pydantic
-
-"""Passed to the json() method of pydantic models for serialization."""
-ENCODING_KWARGS = {"separators": (",", ":")}
-ZARR_CHECKSUM_PATTERN = "([0-9a-f]{32})-([0-9]+)--([0-9]+)"
-
-
-def generate_directory_digest(md5: str, file_count: int, size: int) -> str:
-    """Generate a directory digest from its constituent parts"""
-    return f"{md5}-{file_count}--{size}"
-
-
-def parse_directory_digest(digest: str) -> tuple[str, int, int]:
-    """Parse a directory digest into its constituent parts"""
-    match = re.match(ZARR_CHECKSUM_PATTERN, digest)
-    if match is None:
-        raise ValueError(f"Cannot parse directory digest {digest}")
-    return match.group(1), int(match.group(2)), int(match.group(3))
-
-
-@total_ordering
-class ZarrChecksum(pydantic.BaseModel):
-    """
-    A checksum for a single file/directory in a zarr file.
-
-    Every file and directory in a zarr archive has a name, digest, and size.
-    """
-
-    digest: str
-    name: str
-    size: int
-
-    # To make ZarrChecksums sortable
-    def __lt__(self, other: ZarrChecksum) -> bool:
-        return self.name < other.name
-
-
-class ZarrChecksums(pydantic.BaseModel):
-    """
-    A set of file and directory checksums.
-
-    This is the data hashed to calculate the checksum of a directory.
-    """
-
-    directories: List[ZarrChecksum] = pydantic.Field(default_factory=list)
-    files: List[ZarrChecksum] = pydantic.Field(default_factory=list)
-
-    @property
-    def is_empty(self) -> bool:
-        return self.files == [] and self.directories == []
-
-    def _index(self, checksums: List[ZarrChecksum], checksum: ZarrChecksum) -> int:
-        # O(n) performance, consider using the bisect module or an ordered dict for optimization
-        for i in range(0, len(checksums)):
-            if checksums[i].name == checksum.name:
-                return i
-        raise ValueError("Not found")
-
-    def add_file_checksums(self, checksums: List[ZarrChecksum]) -> None:
-        for new_checksum in checksums:
-            try:
-                self.files[self._index(self.files, new_checksum)] = new_checksum
-            except ValueError:
-                self.files.append(new_checksum)
-        self.files = sorted(self.files)
-
-    def add_directory_checksums(self, checksums: List[ZarrChecksum]) -> None:
-        """Add a list of directory checksums to the listing."""
-        for new_checksum in checksums:
-            try:
-                self.directories[self._index(self.directories, new_checksum)] = (
-                    new_checksum
-                )
-            except ValueError:
-                self.directories.append(new_checksum)
-        self.directories = sorted(self.directories)
-
-    def remove_checksums(self, names: List[str]) -> None:
-        """Remove a list of names from the listing."""
-        self.files = sorted(
-            filter(lambda checksum: checksum.name not in names, self.files)
-        )
-        self.directories = sorted(
-            filter(lambda checksum: checksum.name not in names, self.directories)
-        )
-
-
-class ZarrChecksumListing(pydantic.BaseModel):
-    """
-    A listing of checksums for all sub-files/directories in a zarr directory.
-
-    This is the data serialized in the checksum file.
-    """
-
-    checksums: ZarrChecksums
-    digest: str
-    size: int
-
-
-class ZarrJSONChecksumSerializer:
-    def aggregate_digest(self, checksums: ZarrChecksums) -> str:
-        """Generate an aggregated digest for a list of ZarrChecksums."""
-        # Use the most compact separators possible
-        # content = json.dumps([asdict(zarr_md5) for zarr_md5 in checksums], separators=(',', ':'))0
-        content = json.dumps(
-            checksums.model_dump(mode="json"),
-            **ENCODING_KWARGS,  # type: ignore[arg-type]
-        )
-        h = hashlib.md5()
-        h.update(content.encode("utf-8"))
-        md5 = h.hexdigest()
-        file_count = sum(
-            parse_directory_digest(checksum.digest)[1]
-            for checksum in checksums.directories
-        ) + len(checksums.files)
-        size = sum(file.size for file in checksums.files) + sum(
-            directory.size for directory in checksums.directories
-        )
-        return generate_directory_digest(md5, file_count, size)
-
-    def serialize(self, zarr_checksum_listing: ZarrChecksumListing) -> str:
-        """Serialize a ZarrChecksumListing into a string."""
-        # return json.dumps(asdict(zarr_checksum_listing))
-        return json.dumps(
-            zarr_checksum_listing.model_dump(mode="json"),
-            **ENCODING_KWARGS,  # type: ignore[arg-type]
-        )
-
-    def deserialize(self, json_str: str) -> ZarrChecksumListing:
-        """Deserialize a string into a ZarrChecksumListing."""
-        # listing = ZarrChecksumListing(**json.loads(json_str))
-        # listing.checksums = [ZarrChecksum(**checksum) for checksum in listing.checksums]
-        # return listing
-        return ZarrChecksumListing.model_validate_json(json_str)
-
-    def generate_listing(
-        self,
-        checksums: Optional[ZarrChecksums] = None,
-        files: Optional[List[ZarrChecksum]] = None,
-        directories: Optional[List[ZarrChecksum]] = None,
-    ) -> ZarrChecksumListing:
-        """
-        Generate a new ZarrChecksumListing from the given checksums.
-
-        This method wraps aggregate_checksum and should not be overridden.
-        """
-        if checksums is None:
-            checksums = ZarrChecksums(
-                files=sorted(files) if files is not None else [],
-                directories=sorted(directories) if directories is not None else [],
-            )
-        digest = self.aggregate_digest(checksums)
-        return ZarrChecksumListing(
-            checksums=checksums,
-            digest=digest,
-            size=parse_directory_digest(digest)[2],
-        )
-
-
-# We do not store a checksum file for empty directories since an empty directory doesn't exist in
-# S3. However, an empty zarr file still needs to have a checksum, even if it has no checksum file.
-# For convenience, we define this constant as the "null" checksum.
-EMPTY_CHECKSUM = ZarrJSONChecksumSerializer().generate_listing(ZarrChecksums()).digest
-
-
-def get_checksum(
-    files: Dict[str, Tuple[str, int]], directories: Dict[str, Tuple[str, int]]
-) -> str:
-    """Calculate the checksum of a directory."""
-    if not files and not directories:
-        raise ValueError("Cannot compute a Zarr checksum for an empty directory")
-    checksum_listing = ZarrJSONChecksumSerializer().generate_listing(
-        files=[
-            ZarrChecksum(digest=digest, name=name, size=size)
-            for name, (digest, size) in files.items()
-        ],
-        directories=[
-            ZarrChecksum(digest=digest, name=name, size=size)
-            for name, (digest, size) in directories.items()
-        ],
-    )
-    return checksum_listing.digest
diff --git a/dandischema/models.py b/dandischema/models.py
index 04d5118..20af8bd 100644
--- a/dandischema/models.py
+++ b/dandischema/models.py
@@ -20,10 +20,10 @@
 )
 from pydantic.json_schema import JsonSchemaValue
 from pydantic_core import CoreSchema
+from zarr_checksum.checksum import InvalidZarrChecksum, ZarrDirectoryDigest
 
 from .consts import DANDI_SCHEMA_VERSION
 from .digests.dandietag import DandiETag
-from .digests.zarr import ZARR_CHECKSUM_PATTERN, parse_directory_digest
 from .types import ByteSizeJsonSchema
 from .utils import name2title
 
@@ -1592,12 +1592,14 @@ def digest_check(
             if v.get(DigestType.dandi_etag):
                 raise ValueError("Digest cannot have both etag and zarr checksums.")
             digest = v[DigestType.dandi_zarr_checksum]
-            if not re.fullmatch(ZARR_CHECKSUM_PATTERN, digest):
+            try:
+                chksum = ZarrDirectoryDigest.parse(digest)
+            except InvalidZarrChecksum:
                 raise ValueError(
-                    f"Digest must have an appropriate dandi-zarr-checksum value. "
-                    f"Got {digest}"
+                    "Digest must have an appropriate dandi-zarr-checksum value."
+                    f" Got {digest}"
                 )
-            _checksum, _file_count, zarr_size = parse_directory_digest(digest)
+            zarr_size = chksum.size
             content_size = values.get("contentSize")
             if content_size != zarr_size:
                 raise ValueError(
diff --git a/setup.cfg b/setup.cfg
index 54b2388..587a86c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -32,6 +32,7 @@ install_requires =
     jsonschema[format]
    pydantic[email] ~= 2.4
    requests
+    zarr_checksum
 zip_safe = False
 packages = find_namespace:
 include_package_data = True