diff --git a/docs/release-notes/1743.bugfix.md b/docs/release-notes/1743.bugfix.md new file mode 100644 index 000000000..f8f489aff --- /dev/null +++ b/docs/release-notes/1743.bugfix.md @@ -0,0 +1 @@ +Fix chunking with -1 in `chunks` argument of {func}`~anndata.experimental.read_elem_as_dask` {user}`ilan-gold` diff --git a/src/anndata/_io/specs/lazy_methods.py b/src/anndata/_io/specs/lazy_methods.py index 48770be9c..a34f627e7 100644 --- a/src/anndata/_io/specs/lazy_methods.py +++ b/src/anndata/_io/specs/lazy_methods.py @@ -105,12 +105,16 @@ def read_sparse_as_dask( if chunks is not None: if len(chunks) != 2: raise ValueError("`chunks` must be a tuple of two integers") - if chunks[minor_dim] != shape[minor_dim]: + if chunks[minor_dim] not in {shape[minor_dim], -1, None}: raise ValueError( "Only the major axis can be chunked. " f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}" ) - stride = chunks[major_dim] + stride = ( + chunks[major_dim] + if chunks[major_dim] not in {None, -1} + else shape[major_dim] + ) shape_minor, shape_major = shape if is_csc else shape[::-1] chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major) @@ -142,7 +146,11 @@ def read_h5_array( shape = tuple(elem.shape) dtype = elem.dtype chunks: tuple[int, ...] = ( - chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape) + tuple( + c if c not in {None, -1} else s for c, s in zip(chunks, shape, strict=True) + ) + if chunks is not None + else (_DEFAULT_STRIDE,) * len(shape) ) chunk_layout = tuple( diff --git a/src/anndata/_io/specs/registry.py b/src/anndata/_io/specs/registry.py index 3b43def7c..ca13f8e59 100644 --- a/src/anndata/_io/specs/registry.py +++ b/src/anndata/_io/specs/registry.py @@ -398,6 +398,7 @@ def read_elem_as_dask( Defaults to `(1000, adata.shape[1])` for CSR sparse, `(adata.shape[0], 1000)` for CSC sparse, and the on-disk chunking otherwise for dense. 
+ Can use `-1` or `None` to indicate use of the size of the corresponding dimension. Returns ------- @@ -451,6 +452,11 @@ def read_elem_as_dask( ... g["X"], chunks=(500, adata.shape[1]) ... ) >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"]) + + We also support using -1 and None as a chunk size to signify reading the whole axis: + + >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1)) + >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, None)) """ return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks) diff --git a/tests/test_io_elementwise.py b/tests/test_io_elementwise.py index e46cd7d81..3ca5324b8 100644 --- a/tests/test_io_elementwise.py +++ b/tests/test_io_elementwise.py @@ -284,6 +284,8 @@ def test_read_lazy_2d_dask(sparse_format, store): (2, (200, 400)), (1, None), (2, None), + (2, (400, -1)), + (2, (400, None)), ], ) def test_read_lazy_subsets_nd_dask(store, n_dims, chunks): @@ -316,28 +318,36 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path): @pytest.mark.parametrize( - ("arr_type", "chunks"), + ("arr_type", "chunks", "expected_chunksize"), [ - ("dense", (100, 100)), - ("csc", (SIZE, 10)), - ("csr", (10, SIZE * 2)), - ("csc", None), - ("csr", None), + ("dense", (100, 100), (100, 100)), + ("csc", (SIZE, 10), (SIZE, 10)), + ("csr", (10, SIZE * 2), (10, SIZE * 2)), + ("csc", None, (SIZE, 1000)), + ("csr", None, (1000, SIZE * 2)), + ("csr", (10, -1), (10, SIZE * 2)), + ("csc", (-1, 10), (SIZE, 10)), + ("csr", (10, None), (10, SIZE * 2)), + ("csc", (None, 10), (SIZE, 10)), + ("csc", (None, None), (SIZE, SIZE * 2)), + ("csr", (None, None), (SIZE, SIZE * 2)), + ("csr", (-1, -1), (SIZE, SIZE * 2)), + ("csc", (-1, -1), (SIZE, SIZE * 2)), ], ) -def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks): +def test_read_lazy_2d_chunk_kwargs( + store: H5Group | ZarrGroup, + arr_type: Literal["csr", "csc", "dense"], + chunks: None | tuple[int | None, int | None], + 
expected_chunksize: tuple[int, int], +): if arr_type == "dense": arr_store = create_dense_store(store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) else: arr_store = create_sparse_store(arr_type, store) X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks) - if chunks is not None: - assert X_dask_from_disk.chunksize == chunks - else: - minor_index = int(arr_type == "csr") - # assert that sparse chunks are set correctly by default - assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index) + assert X_dask_from_disk.chunksize == expected_chunksize X_from_disk = read_elem(arr_store["X"]) assert_equal(X_from_disk, X_dask_from_disk)