Skip to content

Commit

Permalink
(fix): calculate mean slice correctly for sparse boolean->slice optim…
Browse files Browse the repository at this point in the history
…ization (#1366)

* (fix): calculate mean slice correctly

* (chore): add mocker tests

* (chore): release note

* Update anndata/tests/test_backed_sparse.py

Co-authored-by: Isaac Virshup <[email protected]>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* (chore): add docs

* (lint): fixx import location

---------

Co-authored-by: Isaac Virshup <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Feb 13, 2024
1 parent 3014607 commit b9f346c
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 10 deletions.
2 changes: 1 addition & 1 deletion anndata/_core/sparse_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def subset_by_major_axis_mask(
slices = np.ma.extras._ezclump(mask)

def mean_slice_length(slices):
return floor((slices[-1].stop - slices[0].start) / len(slices))
return floor(sum(s.stop - s.start for s in slices) / len(slices))

# heuristic for whether slicing should be optimized
if len(slices) > 0:
Expand Down
51 changes: 42 additions & 9 deletions anndata/tests/test_backed_sparse.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

from functools import partial
from typing import TYPE_CHECKING, Callable, Literal

import h5py
Expand All @@ -18,6 +19,7 @@
from pathlib import Path

from numpy.typing import ArrayLike
from pytest_mock import MockerFixture

subset_func2 = subset_func

Expand Down Expand Up @@ -127,14 +129,18 @@ def make_randomized_mask(size: int) -> np.ndarray:
return randomized_mask


# non-random indices, with alternating one false and n true
def make_alternating_mask(size: int) -> np.ndarray:
def make_alternating_mask(size: int, step: int) -> np.ndarray:
mask_alternating = np.ones(size, dtype=bool)
for i in range(0, size, 10): # 10 is enough to trigger new behavior
for i in range(0, size, step): # 5 is too low to trigger new behavior
mask_alternating[i] = False
return mask_alternating


# non-random indices, with alternating one false and n true
make_alternating_mask_5 = partial(make_alternating_mask, step=5)
make_alternating_mask_10 = partial(make_alternating_mask, step=10)


def make_one_group_mask(size: int) -> np.ndarray:
one_group_mask = np.zeros(size, dtype=bool)
one_group_mask[size // 4 : size // 2] = True
Expand All @@ -149,26 +155,53 @@ def make_one_elem_mask(size: int) -> np.ndarray:

# test behavior from https://github.com/scverse/anndata/pull/1233
@pytest.mark.parametrize(
"make_bool_mask",
"make_bool_mask,should_trigger_optimization",
[
make_randomized_mask,
make_alternating_mask,
make_one_group_mask,
make_one_elem_mask,
(make_randomized_mask, None),
(make_alternating_mask_10, True),
(make_alternating_mask_5, False),
(make_one_group_mask, True),
(make_one_elem_mask, False),
],
ids=["randomized", "alternating", "one_group", "one_elem"],
ids=["randomized", "alternating_10", "alternating_5", "one_group", "one_elem"],
)
def test_consecutive_bool(
mocker: MockerFixture,
ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
make_bool_mask: Callable[[int], np.ndarray],
should_trigger_optimization: bool | None,
):
"""Tests for optimization from https://github.com/scverse/anndata/pull/1233
Parameters
----------
mocker
Mocker object
ondisk_equivalent_adata
AnnData objects with sparse X for testing
make_bool_mask
Function for creating a boolean mask.
should_trigger_optimization
Whether or not a given mask should trigger the optimized behavior.
"""
_, csr_disk, csc_disk, _ = ondisk_equivalent_adata
mask = make_bool_mask(csr_disk.shape[0])

# indexing needs to be on `X` directly to trigger the optimization.
# `_normalize_indices`, which is used by `AnnData`, converts bools to ints with `np.where`
from anndata._core import sparse_dataset

spy = mocker.spy(sparse_dataset, "get_compressed_vectors_for_slices")
assert_equal(csr_disk.X[mask, :], csr_disk.X[np.where(mask)])
if should_trigger_optimization is not None:
assert (
spy.call_count == 1 if should_trigger_optimization else not spy.call_count
)
assert_equal(csc_disk.X[:, mask], csc_disk.X[:, np.where(mask)[0]])
if should_trigger_optimization is not None:
assert (
spy.call_count == 2 if should_trigger_optimization else not spy.call_count
)


@pytest.mark.parametrize(
Expand Down
1 change: 1 addition & 0 deletions docs/release-notes/0.10.6.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

* Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold`
* Writing a dataframe with non-unique column names now throws an error, instead of silently overwriting {pr}`1335` {user}`ivirshup`
* Fix mean slice length checking to use improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1366` {user}`ilan-gold`

```{rubric} Documentation
```
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,7 @@ test = [
"awkward>=2.3",
"pyarrow",
"pytest_memray",
"pytest-mock"
]
gpu = ["cupy"]

Expand Down

0 comments on commit b9f346c

Please sign in to comment.