(fix): calculate mean slice correctly for sparse boolean->slice optim…

…ization (#1366) * (fix): calculate mean slice correctly * (chore): add mocker tests * (chore): release note * Update anndata/tests/test_backed_sparse.py Co-authored-by: Isaac Virshup <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * (chore): add docs * (lint): fixx import location --------- Co-authored-by: Isaac Virshup <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
scverse · Feb 13, 2024 · b9f346c · b9f346c
1 parent 3014607
commit b9f346c
Show file tree

Hide file tree

Showing 4 changed files with 45 additions and 10 deletions.
diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py
@@ -291,7 +291,7 @@ def subset_by_major_axis_mask(
     slices = np.ma.extras._ezclump(mask)
 
     def mean_slice_length(slices):
-        return floor((slices[-1].stop - slices[0].start) / len(slices))
+        return floor(sum(s.stop - s.start for s in slices) / len(slices))
 
     # heuristic for whether slicing should be optimized
     if len(slices) > 0:

diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from functools import partial
 from typing import TYPE_CHECKING, Callable, Literal
 
 import h5py
@@ -18,6 +19,7 @@
     from pathlib import Path
 
     from numpy.typing import ArrayLike
+    from pytest_mock import MockerFixture
 
 subset_func2 = subset_func
 
@@ -127,14 +129,18 @@ def make_randomized_mask(size: int) -> np.ndarray:
     return randomized_mask
 
 
-# non-random indices, with alternating one false and n true
-def make_alternating_mask(size: int) -> np.ndarray:
+def make_alternating_mask(size: int, step: int) -> np.ndarray:
     mask_alternating = np.ones(size, dtype=bool)
-    for i in range(0, size, 10):  # 10 is enough to trigger new behavior
+    for i in range(0, size, step):  # 5 is too low to trigger new behavior
         mask_alternating[i] = False
     return mask_alternating
 
 
+# non-random indices, with alternating one false and n true
+make_alternating_mask_5 = partial(make_alternating_mask, step=5)
+make_alternating_mask_10 = partial(make_alternating_mask, step=10)
+
+
 def make_one_group_mask(size: int) -> np.ndarray:
     one_group_mask = np.zeros(size, dtype=bool)
     one_group_mask[size // 4 : size // 2] = True
@@ -149,26 +155,53 @@ def make_one_elem_mask(size: int) -> np.ndarray:
 
 # test behavior from https://github.com/scverse/anndata/pull/1233
 @pytest.mark.parametrize(
-    "make_bool_mask",
+    "make_bool_mask,should_trigger_optimization",
     [
-        make_randomized_mask,
-        make_alternating_mask,
-        make_one_group_mask,
-        make_one_elem_mask,
+        (make_randomized_mask, None),
+        (make_alternating_mask_10, True),
+        (make_alternating_mask_5, False),
+        (make_one_group_mask, True),
+        (make_one_elem_mask, False),
     ],
-    ids=["randomized", "alternating", "one_group", "one_elem"],
+    ids=["randomized", "alternating_10", "alternating_5", "one_group", "one_elem"],
 )
 def test_consecutive_bool(
+    mocker: MockerFixture,
     ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
     make_bool_mask: Callable[[int], np.ndarray],
+    should_trigger_optimization: bool | None,
 ):
+    """Tests for optimization from https://github.com/scverse/anndata/pull/1233
+
+    Parameters
+    ----------
+    mocker
+        Mocker object
+    ondisk_equivalent_adata
+        AnnData objects with sparse X for testing
+    make_bool_mask
+        Function for creating a boolean mask.
+    should_trigger_optimization
+        Whether or not a given mask should trigger the optimized behavior.
+    """
     _, csr_disk, csc_disk, _ = ondisk_equivalent_adata
     mask = make_bool_mask(csr_disk.shape[0])
 
     # indexing needs to be on `X` directly to trigger the optimization.
     # `_normalize_indices`, which is used by `AnnData`, converts bools to ints with `np.where`
+    from anndata._core import sparse_dataset
+
+    spy = mocker.spy(sparse_dataset, "get_compressed_vectors_for_slices")
     assert_equal(csr_disk.X[mask, :], csr_disk.X[np.where(mask)])
+    if should_trigger_optimization is not None:
+        assert (
+            spy.call_count == 1 if should_trigger_optimization else not spy.call_count
+        )
     assert_equal(csc_disk.X[:, mask], csc_disk.X[:, np.where(mask)[0]])
+    if should_trigger_optimization is not None:
+        assert (
+            spy.call_count == 2 if should_trigger_optimization else not spy.call_count
+        )
 
 
 @pytest.mark.parametrize(

diff --git a/docs/release-notes/0.10.6.md b/docs/release-notes/0.10.6.md
@@ -5,6 +5,7 @@
 
 * Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold`
 * Writing a dataframe with non-unique column names now throws an error, instead of silently overwriting {pr}`1335` {user}`ivirshup`
+* Fix mean slice length checking to use improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1366` {user}`ilan-gold`
 
 ```{rubric} Documentation
 ```

diff --git a/pyproject.toml b/pyproject.toml
@@ -95,6 +95,7 @@ test = [
     "awkward>=2.3",
     "pyarrow",
     "pytest_memray",
+    "pytest-mock"
 ]
 gpu = ["cupy"]