Skip to content

Commit

Permalink
Merge pull request #13 from janelia-cellmap/chunks_bugfix
Browse files Browse the repository at this point in the history
chunks bugfix
  • Loading branch information
d-v-b authored Jun 14, 2024
2 parents 352feac + 641d298 commit c0677a3
Show file tree
Hide file tree
Showing 10 changed files with 203 additions and 58 deletions.
25 changes: 17 additions & 8 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,27 @@ on:
branches: [ main ]

jobs:
build:
test:
name: py=${{ matrix.python-version }}
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.9', '3.10', '3.11', '3.12']
python-version: ['3.9', '3.10', '3.11']
steps:
- uses: actions/checkout@v4
- name: Install dependencies
shell: "bash -l {0}"
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
cache: 'pip'
- name: Install Hatch
run: |
pip install poetry
poetry install
- name: Test
python -m pip install --upgrade pip
pip install hatch
- name: Set Up Hatch Env
run: |
poetry run pytest
hatch env create test.py${{ matrix.python-version }}
hatch env run -e test.py${{ matrix.python-version }} pip list
- name: Run Tests
run: |
hatch env run --env test.py${{ matrix.python-version }} run
2 changes: 1 addition & 1 deletion docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ print(group_model.model_dump(exclude='attributes'))
'pixelResolution': {'dimensions': (10.0, 8.0), 'unit': 'nm'}
},
'shape': (2, 2),
'chunks': (2, 2),
'chunks': (4, 4),
'dtype': '<i8',
'fill_value': 0,
'order': 'C',
Expand Down
100 changes: 75 additions & 25 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,38 +1,88 @@
[tool.poetry]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project]
name = "cellmap-schemas"
version = "0.7.0"
description = "Schemas for data used by the Cellmap project team at Janelia Research Campus."
authors = ["Davis Vann Bennett <[email protected]>"]
dynamic = ["version"]
description = 'Schemas for data used by the Cellmap project team at Janelia Research Campus.'
readme = "README.md"
packages = [{include = "cellmap_schemas", from = "src"}]
requires-python = ">=3.9"
license = "MIT"
keywords = ["cellmap", "ngff", "n5", "zarr"]
authors = [
{ name = "Davis Vann Bennett", email = "[email protected]" },
]
classifiers = [
"Development Status :: 4 - Beta",
"Programming Language :: Python",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: Implementation :: CPython",
"Programming Language :: Python :: Implementation :: PyPy",
]
dependencies = [
"pydantic-zarr >= 0.7.0",
"s3fs >= 2023.10.0",
"rich >= 13.7.0"
]

[tool.poetry.dependencies]
python = "^3.9"
pydantic-zarr = "^0.7.0"
s3fs = "^2023.10.0"
rich = "^13.7.0"
[project.urls]
Documentation = "https://janelia-cellmap.github.io/cellmap-schemas"
Issues = "https://github.com/janelia-cellmap/cellmap-schemas/issues"
Source = "https://github.com/janelia-cellmap/cellmap-schemas"

[tool.hatch.version]
path = "src/cellmap_schemas/__about__.py"

[tool.hatch.envs.test]
dependencies = [
"pytest",
"pytest-cov==5.0.0",
"pytest-examples == 0.0.10"
]

[tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.4.2"
mkdocstrings = {extras = ["python"], version = "^0.23.0"}
[[tool.hatch.envs.test.matrix]]
python = ["3.9", "3.10", "3.11"]

[tool.poetry.group.test.dependencies]
pytest = "^7.4.2"
coverage = "^7.4.0"
[tool.hatch.envs.test.scripts]
run-coverage = "pytest --cov-config=pyproject.toml --cov=pkg --cov=tests"
run = "run-coverage --no-cov"

[tool.poetry.group.dev.dependencies]
mypy = "^1.7.1"
pytest-examples = "^0.0.10"

[tool.hatch.envs.docs]
dependencies = [
"mkdocs-material == 9.4.2",
"mkdocstrings[python] == 0.23.0"
]

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.hatch.envs.types]
extra-dependencies = [
"mypy>=1.0.0",
]
[tool.hatch.envs.types.scripts]
check = "mypy --install-types --non-interactive {args:src/cellmap_schemas tests}"

[tool.coverage.run]
source_pkgs = ["cellmap_schemas", "tests"]
branch = true
parallel = true
omit = [
"src/cellmap_schemas/__about__.py",
]

[tool.coverage.paths]
cellmap_schemas = ["src/cellmap_schemas", "*/cellmap-schemas/src/cellmap_schemas"]
tests = ["tests", "*/cellmap-schemas/tests"]

[tool.poetry.scripts]
cellmap-schemas = 'cellmap_schemas.cli:cli'
[tool.coverage.report]
exclude_lines = [
"no cov",
"if __name__ == .__main__.:",
"if TYPE_CHECKING:",
]

[tool.ruff]
line-length = 100
Expand All @@ -42,4 +92,4 @@ convention = "numpy"

[tool.ruff.format]
quote-style = "double"
indent-style = "space"
indent-style = "space"
4 changes: 4 additions & 0 deletions src/cellmap_schemas/__about__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# SPDX-FileCopyrightText: 2022-present Howard Hughes Medical Institute
#
# SPDX-License-Identifier: MIT
__version__ = "0.7.0"
41 changes: 41 additions & 0 deletions src/cellmap_schemas/base.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from __future__ import annotations
import numpy.typing as npt
from typing import Any, Iterable, Literal
from pydantic_zarr.v2 import ArraySpec, GroupSpec
from pydantic_zarr.v2 import auto_chunks


def structure_group_equal(spec_a: GroupSpec, spec_b: GroupSpec) -> bool:
Expand Down Expand Up @@ -33,3 +36,41 @@ def structure_equal(spec_a: ArraySpec | GroupSpec, spec_b: ArraySpec | GroupSpec
return structure_group_equal(spec_a, spec_b)
else:
return False


def normalize_chunks(
	chunks: Literal["auto"] | tuple[int, ...] | tuple[tuple[int, ...]],
	arrays: Iterable[npt.NDArray[Any]],
) -> tuple[tuple[int, ...], ...]:
	"""
	Normalize a specification of chunks against a collection of arrays.

	Parameters
	----------
	chunks: Literal["auto"] | tuple[int, ...] | tuple[tuple[int, ...]]
		Either the string "auto", a single chunk shape (tuple of ints), or one
		chunk shape per array (tuple of tuples of ints).
	arrays: Iterable[npt.NDArray[Any]]
		The arrays the chunks are normalized against. May be any iterable,
		including a one-shot generator.

	Returns
	-------
	tuple[tuple[int, ...], ...]
		One chunk shape per array.

	Raises
	------
	ValueError
		If `chunks` is none of the accepted forms, or if a per-array chunks
		specification has a different length than `arrays`.

	Notes
	-----
	This handles 3 cases:
	- If `chunks` is the string "auto", then the `auto_chunks` routine from
	  `pydantic-zarr` estimates a chunk size for the first array, and that
	  chunk size is used for all arrays.
	- If `chunks` is a tuple of integers, that is used as the chunk size for
	  all arrays.
	- If `chunks` is a tuple of tuples of integers, it is returned (coerced to
	  tuples) after checking its length matches the number of arrays.
	"""
	arrays_tuple = tuple(arrays)

	if chunks == "auto":
		# Bug fix: use len(arrays_tuple), not len(arrays) — `arrays` may be a
		# generator with no len(), and it was already consumed above.
		return (auto_chunks(arrays_tuple[0]),) * len(arrays_tuple)
	elif all(isinstance(x, int) for x in chunks):
		return (chunks,) * len(arrays_tuple)
	elif all(all(isinstance(x, int) for x in t) for t in chunks):
		result = tuple(map(tuple, chunks))
		if len(result) != len(arrays_tuple):
			msg = (
				f"The number of chunks ({len(chunks)}) does not match the number of "
				f"arrays ({len(arrays_tuple)})"
			)
			raise ValueError(msg)
		return result
	else:
		# Bug fix: the original literals concatenated to '…"auto"a tuple…';
		# add the missing ", " separator.
		msg = (
			f'Invalid chunks: {chunks}. Expected the string "auto", '
			"a tuple of ints, or a tuple of tuples of ints."
		)
		raise ValueError(msg)
9 changes: 7 additions & 2 deletions src/cellmap_schemas/multiscale/cosem.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from __future__ import annotations
from typing import Annotated, Any, Literal, Optional, Sequence, TYPE_CHECKING

from cellmap_schemas.base import normalize_chunks

if TYPE_CHECKING:
from typing import Type
from typing_extensions import Self
Expand Down Expand Up @@ -394,15 +396,18 @@ def from_arrays(
compressor: Codec | None | Literal["auto"] = "auto"
The compressor to use for the Zarr arrays.
"""

chunks = normalize_chunks(chunks, arrays)

members = {
path: Array.from_array(
array=array,
chunks=chunks,
chunks=chunk,
compressor=compressor,
dimension_separator="/",
attributes=ArrayMetadata.from_transform(transform),
)
for path, array, transform in zip(paths, arrays, transforms)
for path, array, transform, chunk in zip(paths, arrays, transforms, chunks)
}

metadata = GroupMetadata.from_transforms(
Expand Down
8 changes: 6 additions & 2 deletions src/cellmap_schemas/multiscale/neuroglancer_n5.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Sequence

from cellmap_schemas.base import normalize_chunks

if TYPE_CHECKING:
from typing_extensions import Self, Type, Literal
from numcodecs.abc import Codec
Expand Down Expand Up @@ -212,6 +214,8 @@ def from_arrays(
entirely by Zarr.
"""

chunks = normalize_chunks(chunks, arrays)

if dimension_order == "C":
# this will reverse the order of axes, units, and scales before writing metadata
indexer = slice(-1, None, -1)
Expand Down Expand Up @@ -239,10 +243,10 @@ def from_arrays(
pixelResolution=PixelResolution(dimensions=scale, unit=units_parsed[0])
),
array=array,
chunks=chunks,
chunks=chunk,
compressor=compressor,
)
for path, array, scale in zip(paths, arrays, scales_parsed)
for path, array, chunk, scale in zip(paths, arrays, chunks, scales_parsed)
}
return cls(members=members, attributes=attributes)

Expand Down
32 changes: 30 additions & 2 deletions tests/test_base.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from __future__ import annotations
from cellmap_schemas.base import structure_equal

import re
from typing import Literal

import numpy as np
import pytest
from cellmap_schemas.base import normalize_chunks, structure_equal
from pydantic_zarr.v2 import auto_chunks
from pydantic_zarr.v2 import ArraySpec, GroupSpec


Expand Down Expand Up @@ -33,3 +37,27 @@ def test_structure_equal():

group_e = group_a.model_copy(deep=True, update={"members": {"array": array_b}})
assert not structure_equal(group_a, group_e)


@pytest.mark.parametrize("chunks", ("auto", (1, 2, 3), ((1, 2, 3), (1, 2, 3), (2, 3, 4))))
def test_normalize_chunks(chunks: Literal["auto"] | tuple[int, ...] | tuple[tuple[int, ...], ...]):
    """Check that each accepted chunks form normalizes to one chunk shape per array."""
    arrays = tuple(np.zeros((10, 10, 10)) for _ in range(3))
    # Derive the expected normalization for the given input form.
    if chunks == "auto":
        expected = (auto_chunks(arrays[0]),) * len(arrays)
    elif isinstance(chunks[0], tuple):
        expected = chunks
    else:
        expected = (chunks,) * len(arrays)
    assert normalize_chunks(chunks, arrays) == expected


def test_normalize_chunks_wrong_length():
    """A per-array chunks tuple whose length differs from the number of arrays
    must raise a ValueError with an informative message."""
    num_arrays = 2
    arrays = tuple(np.zeros((1, 1, 1)) for _ in range(num_arrays))
    chunks = ((1, 1, 1),)
    expected_msg = re.escape(
        f"The number of chunks ({len(chunks)}) does not match the number of "
        f"arrays ({num_arrays})"
    )
    with pytest.raises(ValueError, match=expected_msg):
        normalize_chunks(chunks, arrays)
19 changes: 9 additions & 10 deletions tests/test_multiscale/test_cosem.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from __future__ import annotations
from typing import TYPE_CHECKING

from cellmap_schemas.base import normalize_chunks

if TYPE_CHECKING:
from typing import Literal

Expand All @@ -21,7 +23,6 @@
change_coordinates,
)
from cellmap_schemas.multiscale.neuroglancer_n5 import PixelResolution
from zarr.util import guess_chunks


@pytest.mark.parametrize("ndim", (2, 3, 4))
Expand Down Expand Up @@ -281,7 +282,7 @@ def test_multiscale_group_from_arrays(order: Literal["C", "F"], name: str | None
}

arrays = {"s0": np.zeros((10, 10, 10)), "s1": np.zeros((5, 5, 5))}

chunks_expected = normalize_chunks("auto", (arrays["s0"],))[0]
groupMeta = GroupMetadata(
axes=axes[reorder],
scales=[[1, 1, 1], [2, 2, 2]],
Expand All @@ -294,7 +295,9 @@ def test_multiscale_group_from_arrays(order: Literal["C", "F"], name: str | None
attributes=groupMeta,
members={
key: Array.from_array(
array=array, attributes=ArrayMetadata.from_transform(transform=transforms[key])
array=array,
chunks=chunks_expected,
attributes=ArrayMetadata.from_transform(transform=transforms[key]),
)
for key, array in arrays.items()
},
Expand Down Expand Up @@ -409,7 +412,7 @@ def test_change_coordinates_3d(


@pytest.mark.parametrize("dimension_order", ("C", "F"))
@pytest.mark.parametrize("chunks", ("auto", ((2, 2, 2))))
@pytest.mark.parametrize("chunks", ("auto", (2, 2, 2), ((1, 1, 1), (2, 2, 2), (3, 3, 3))))
@pytest.mark.parametrize("compressor", (Zstd(3), GZip(-1)))
def test_from_arrays(
dimension_order: Literal["C", "F"],
Expand Down Expand Up @@ -441,14 +444,10 @@ def test_from_arrays(
assert group.attributes.pixelResolution == PixelResolution(
dimensions=scales[0][indexer], unit=units[indexer][0]
)
chunks_expected = normalize_chunks(chunks, arrays)
for idx in range(len(arrays)):
obs = group.members[paths[idx]]
exp = arrays[idx]
if chunks == "auto":
chunks_expected = guess_chunks(exp.shape, exp.dtype.itemsize)
else:
chunks_expected = chunks
assert obs.chunks == chunks_expected
assert obs.chunks == chunks_expected[idx]

assert obs.attributes.pixelResolution == PixelResolution(
dimensions=scales[idx][indexer], unit=units[indexer][0]
Expand Down
Loading

0 comments on commit c0677a3

Please sign in to comment.