Backport PR scverse#1743: (fix): lazy chunking respects -1
ilan-gold authored and meeseeksmachine committed Nov 8, 2024
1 parent 3aed0b6 commit 56fa994
Showing 4 changed files with 41 additions and 16 deletions.
1 change: 1 addition & 0 deletions docs/release-notes/1743.bugfix.md
@@ -0,0 +1 @@
+Fix chunking with -1 in `chunks` argument of {func}`~anndata.experimental.read_elem_as_dask` {user}`ilan-gold`
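
For context, a minimal usage sketch of what this note describes (illustrative only: `g` is assumed to be an already-open `h5py.File` or `zarr.Group` containing an anndata-written `X` element, on a version that includes this fix):

import anndata as ad

# With this fix, -1 (or None) in `chunks` is expanded to the full length of
# that axis, so the call below chunks only along the first axis.
X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1))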
14 changes: 11 additions & 3 deletions src/anndata/_io/specs/lazy_methods.py
@@ -105,12 +105,16 @@ def read_sparse_as_dask(
     if chunks is not None:
         if len(chunks) != 2:
             raise ValueError("`chunks` must be a tuple of two integers")
-        if chunks[minor_dim] != shape[minor_dim]:
+        if chunks[minor_dim] not in {shape[minor_dim], -1, None}:
             raise ValueError(
                 "Only the major axis can be chunked. "
                 f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}"
             )
-        stride = chunks[major_dim]
+        stride = (
+            chunks[major_dim]
+            if chunks[major_dim] not in {None, -1}
+            else shape[major_dim]
+        )

     shape_minor, shape_major = shape if is_csc else shape[::-1]
     chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major)
@@ -142,7 +146,11 @@ def read_h5_array(
     shape = tuple(elem.shape)
     dtype = elem.dtype
     chunks: tuple[int, ...] = (
-        chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape)
+        tuple(
+            c if c not in {None, -1} else s for c, s in zip(chunks, shape, strict=True)
+        )
+        if chunks is not None
+        else (_DEFAULT_STRIDE,) * len(shape)
     )

     chunk_layout = tuple(
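Taken together, the two hunks above amount to a simple rule: any `-1` or `None` entry in `chunks` is replaced by the full length of the corresponding axis (and for sparse input, only the major-axis stride is configurable). A small standalone sketch of that expansion, using a helper name of our own choosing rather than anndata's:

def normalize_chunks(chunks, shape, default_stride=1000):
    """Replace -1/None chunk entries with the full axis size (illustrative only)."""
    if chunks is None:
        # Mirrors the dense default of a fixed stride per axis.
        return (default_stride,) * len(shape)
    return tuple(s if c in (None, -1) else c for c, s in zip(chunks, shape))

assert normalize_chunks((500, -1), (10_000, 2_000)) == (500, 2_000)
assert normalize_chunks((500, None), (10_000, 2_000)) == (500, 2_000)
assert normalize_chunks(None, (10_000, 2_000)) == (1_000, 1_000)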
6 changes: 6 additions & 0 deletions src/anndata/_io/specs/registry.py
@@ -398,6 +398,7 @@ def read_elem_as_dask(
         Defaults to `(1000, adata.shape[1])` for CSR sparse,
         `(adata.shape[0], 1000)` for CSC sparse,
         and the on-disk chunking otherwise for dense.
+        Can use `-1` or `None` to indicate use of the size of the corresponding dimension.

     Returns
     -------
@@ -451,6 +452,11 @@ def read_elem_as_dask(
     ...     g["X"], chunks=(500, adata.shape[1])
     ... )
     >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"])
+
+    We also support using -1 and None as a chunk size to signify reading the whole axis:
+
+    >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1))
+    >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, None))
     """
     return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks)

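As a quick check of the documented behaviour (same illustrative setup as above: `g` is an open group holding a 2-D `X` with at least 500 rows), the resulting dask array's chunk size uses the full axis length wherever `-1` or `None` was passed:

import anndata as ad

X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, None))
assert X.chunksize == (500, X.shape[1])  # None expanded to the whole second axis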
36 changes: 23 additions & 13 deletions tests/test_io_elementwise.py
@@ -284,6 +284,8 @@ def test_read_lazy_2d_dask(sparse_format, store):
         (2, (200, 400)),
         (1, None),
         (2, None),
+        (2, (400, -1)),
+        (2, (400, None)),
     ],
 )
 def test_read_lazy_subsets_nd_dask(store, n_dims, chunks):
@@ -316,28 +318,36 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path):


 @pytest.mark.parametrize(
-    ("arr_type", "chunks"),
+    ("arr_type", "chunks", "expected_chunksize"),
     [
-        ("dense", (100, 100)),
-        ("csc", (SIZE, 10)),
-        ("csr", (10, SIZE * 2)),
-        ("csc", None),
-        ("csr", None),
+        ("dense", (100, 100), (100, 100)),
+        ("csc", (SIZE, 10), (SIZE, 10)),
+        ("csr", (10, SIZE * 2), (10, SIZE * 2)),
+        ("csc", None, (SIZE, 1000)),
+        ("csr", None, (1000, SIZE * 2)),
+        ("csr", (10, -1), (10, SIZE * 2)),
+        ("csc", (-1, 10), (SIZE, 10)),
+        ("csr", (10, None), (10, SIZE * 2)),
+        ("csc", (None, 10), (SIZE, 10)),
+        ("csc", (None, None), (SIZE, SIZE * 2)),
+        ("csr", (None, None), (SIZE, SIZE * 2)),
+        ("csr", (-1, -1), (SIZE, SIZE * 2)),
+        ("csc", (-1, -1), (SIZE, SIZE * 2)),
     ],
 )
-def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks):
+def test_read_lazy_2d_chunk_kwargs(
+    store: H5Group | ZarrGroup,
+    arr_type: Literal["csr", "csc", "dense"],
+    chunks: None | tuple[int | None, int | None],
+    expected_chunksize: tuple[int, int],
+):
     if arr_type == "dense":
         arr_store = create_dense_store(store)
         X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     else:
         arr_store = create_sparse_store(arr_type, store)
         X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
-    if chunks is not None:
-        assert X_dask_from_disk.chunksize == chunks
-    else:
-        minor_index = int(arr_type == "csr")
-        # assert that sparse chunks are set correctly by default
-        assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index)
+    assert X_dask_from_disk.chunksize == expected_chunksize
     X_from_disk = read_elem(arr_store["X"])
     assert_equal(X_from_disk, X_dask_from_disk)

