Backport PR #1743 on branch 0.11.x ((fix): lazy chunking respects -1) #1750

Merged
docs/release-notes/1743.bugfix.md (1 addition, 0 deletions)
@@ -0,0 +1 @@
+Fix chunking with -1 in `chunks` argument of {func}`~anndata.experimental.read_elem_as_dask` {user}`ilan-gold`
src/anndata/_io/specs/lazy_methods.py (11 additions, 3 deletions)
@@ -105,12 +105,16 @@ def read_sparse_as_dask(
     if chunks is not None:
         if len(chunks) != 2:
             raise ValueError("`chunks` must be a tuple of two integers")
-        if chunks[minor_dim] != shape[minor_dim]:
+        if chunks[minor_dim] not in {shape[minor_dim], -1, None}:
             raise ValueError(
                 "Only the major axis can be chunked. "
                 f"Try setting chunks to {((-1, _DEFAULT_STRIDE) if is_csc else (_DEFAULT_STRIDE, -1))}"
             )
-        stride = chunks[major_dim]
+        stride = (
+            chunks[major_dim]
+            if chunks[major_dim] not in {None, -1}
+            else shape[major_dim]
+        )
 
     shape_minor, shape_major = shape if is_csc else shape[::-1]
     chunks_major = compute_chunk_layout_for_axis_shape(stride, shape_major)
@@ -142,7 +146,11 @@ def read_h5_array(
     shape = tuple(elem.shape)
     dtype = elem.dtype
     chunks: tuple[int, ...] = (
-        chunks if chunks is not None else (_DEFAULT_STRIDE,) * len(shape)
+        tuple(
+            c if c not in {None, -1} else s for c, s in zip(chunks, shape, strict=True)
+        )
+        if chunks is not None
+        else (_DEFAULT_STRIDE,) * len(shape)
     )
 
     chunk_layout = tuple(
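Both hunks implement the same rule: a chunk entry of `-1` or `None` stands for the full length of that axis. For sparse elements the minor axis must still be read whole (hence the widened `{shape[minor_dim], -1, None}` check) and the major-axis stride falls back to the axis length; for dense arrays each `-1`/`None` entry is simply replaced by the corresponding shape entry. A minimal standalone sketch of that substitution, using a hypothetical helper name rather than anything from anndata's API:

```python
# Hypothetical helper illustrating the -1/None substitution above;
# it is not part of anndata.
def normalize_chunks(
    chunks: tuple[int | None, ...] | None,
    shape: tuple[int, ...],
    default_stride: int = 1000,
) -> tuple[int, ...]:
    if chunks is None:
        # Mirrors the dense default of (_DEFAULT_STRIDE,) * len(shape).
        return (default_stride,) * len(shape)
    # -1 / None mean "use the whole axis", as in the tuple comprehension above.
    return tuple(s if c in (None, -1) else c for c, s in zip(chunks, shape))


assert normalize_chunks((500, -1), (10_000, 2_000)) == (500, 2_000)
assert normalize_chunks((None, None), (10_000, 2_000)) == (10_000, 2_000)
assert normalize_chunks(None, (10_000, 2_000)) == (1_000, 1_000)
```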
src/anndata/_io/specs/registry.py (6 additions, 0 deletions)
@@ -398,6 +398,7 @@ def read_elem_as_dask(
         Defaults to `(1000, adata.shape[1])` for CSR sparse,
         `(adata.shape[0], 1000)` for CSC sparse,
         and the on-disk chunking otherwise for dense.
+        `-1` or `None` can be used to indicate the full size of the corresponding dimension.
 
     Returns
     -------
@@ -451,6 +452,11 @@ def read_elem_as_dask(
     ...     g["X"], chunks=(500, adata.shape[1])
     ... )
     >>> adata.layers["dense"] = ad.experimental.read_elem_as_dask(g["layers/dense"])
+
+    We also support using -1 and None as a chunk size to signify reading the whole axis:
+
+    >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, -1))
+    >>> adata.X = ad.experimental.read_elem_as_dask(g["X"], chunks=(500, None))
     """
     return DaskReader(_LAZY_REGISTRY).read_elem(elem, chunks=chunks)

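For a concrete picture of what the new docstring lines promise, here is a hedged end-to-end sketch; the file name and array sizes are invented for the example, and the only anndata calls used are `write_h5ad` and `experimental.read_elem_as_dask`:

```python
import anndata as ad
import h5py
import numpy as np

# Write a small dense AnnData to an .h5ad file, then open it with h5py,
# so elements can be read straight from the store as in the docstring.
adata = ad.AnnData(X=np.random.default_rng(0).random((2_000, 500)))
adata.write_h5ad("example.h5ad")
g = h5py.File("example.h5ad", "r")

# -1 (or None) expands to the full axis length, so each chunk spans every column.
X = ad.experimental.read_elem_as_dask(g["X"], chunks=(100, -1))
assert X.chunksize == (100, 500)
```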
tests/test_io_elementwise.py (23 additions, 13 deletions)
@@ -284,6 +284,8 @@ def test_read_lazy_2d_dask(sparse_format, store):
         (2, (200, 400)),
         (1, None),
         (2, None),
+        (2, (400, -1)),
+        (2, (400, None)),
     ],
 )
 def test_read_lazy_subsets_nd_dask(store, n_dims, chunks):
@@ -316,28 +318,36 @@ def test_read_lazy_h5_cluster(sparse_format, tmp_path):
 
 
 @pytest.mark.parametrize(
-    ("arr_type", "chunks"),
+    ("arr_type", "chunks", "expected_chunksize"),
     [
-        ("dense", (100, 100)),
-        ("csc", (SIZE, 10)),
-        ("csr", (10, SIZE * 2)),
-        ("csc", None),
-        ("csr", None),
+        ("dense", (100, 100), (100, 100)),
+        ("csc", (SIZE, 10), (SIZE, 10)),
+        ("csr", (10, SIZE * 2), (10, SIZE * 2)),
+        ("csc", None, (SIZE, 1000)),
+        ("csr", None, (1000, SIZE * 2)),
+        ("csr", (10, -1), (10, SIZE * 2)),
+        ("csc", (-1, 10), (SIZE, 10)),
+        ("csr", (10, None), (10, SIZE * 2)),
+        ("csc", (None, 10), (SIZE, 10)),
+        ("csc", (None, None), (SIZE, SIZE * 2)),
+        ("csr", (None, None), (SIZE, SIZE * 2)),
+        ("csr", (-1, -1), (SIZE, SIZE * 2)),
+        ("csc", (-1, -1), (SIZE, SIZE * 2)),
     ],
 )
-def test_read_lazy_2d_chunk_kwargs(store, arr_type, chunks):
+def test_read_lazy_2d_chunk_kwargs(
+    store: H5Group | ZarrGroup,
+    arr_type: Literal["csr", "csc", "dense"],
+    chunks: None | tuple[int | None, int | None],
+    expected_chunksize: tuple[int, int],
+):
     if arr_type == "dense":
         arr_store = create_dense_store(store)
         X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
     else:
         arr_store = create_sparse_store(arr_type, store)
         X_dask_from_disk = read_elem_as_dask(arr_store["X"], chunks=chunks)
-    if chunks is not None:
-        assert X_dask_from_disk.chunksize == chunks
-    else:
-        minor_index = int(arr_type == "csr")
-        # assert that sparse chunks are set correctly by default
-        assert X_dask_from_disk.chunksize[minor_index] == SIZE * (1 + minor_index)
+    assert X_dask_from_disk.chunksize == expected_chunksize
     X_from_disk = read_elem(arr_store["X"])
     assert_equal(X_from_disk, X_dask_from_disk)

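The expectations in the table above also lean on the constraint enforced in `read_sparse_as_dask`: only the major axis of a sparse element may be chunked, while the minor axis must be given its full size, `-1`, or `None`. A hypothetical extra test, not part of this diff, sketching that behaviour for a CSR element (the `csr_store` fixture is assumed to hold a CSR-encoded "X" written by anndata with more than 10 columns):

```python
import pytest
from anndata.experimental import read_elem_as_dask


def test_minor_axis_must_be_whole(csr_store):
    # A partial minor (column) chunk is rejected outright.
    with pytest.raises(ValueError, match="Only the major axis can be chunked"):
        read_elem_as_dask(csr_store["X"], chunks=(100, 10))
    # -1 (or None) on the minor axis is accepted and expands to the full width.
    X = read_elem_as_dask(csr_store["X"], chunks=(100, -1))
    assert X.chunksize[1] == X.shape[1]
```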