From b9f346c38d0063f369a074f930c6bfe88ae1cb16 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Tue, 13 Feb 2024 14:23:59 +0100
Subject: [PATCH] (fix): calculate mean slice correctly for sparse
 boolean->slice optimization (#1366)

* (fix): calculate mean slice correctly

* (chore): add mocker tests

* (chore): release note

* Update anndata/tests/test_backed_sparse.py

Co-authored-by: Isaac Virshup

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* (chore): add docs

* (lint): fixx import location

---------

Co-authored-by: Isaac Virshup
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 anndata/_core/sparse_dataset.py     |  2 +-
 anndata/tests/test_backed_sparse.py | 51 ++++++++++++++++++++++++-----
 docs/release-notes/0.10.6.md        |  1 +
 pyproject.toml                      |  1 +
 4 files changed, 45 insertions(+), 10 deletions(-)

diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py
index 03514809b..6dfb89745 100644
--- a/anndata/_core/sparse_dataset.py
+++ b/anndata/_core/sparse_dataset.py
@@ -291,7 +291,7 @@ def subset_by_major_axis_mask(
     slices = np.ma.extras._ezclump(mask)
 
     def mean_slice_length(slices):
-        return floor((slices[-1].stop - slices[0].start) / len(slices))
+        return floor(sum(s.stop - s.start for s in slices) / len(slices))
 
     # heuristic for whether slicing should be optimized
     if len(slices) > 0:
diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py
index 18656bea2..05efa747c 100644
--- a/anndata/tests/test_backed_sparse.py
+++ b/anndata/tests/test_backed_sparse.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+from functools import partial
 from typing import TYPE_CHECKING, Callable, Literal
 
 import h5py
@@ -18,6 +19,7 @@
     from pathlib import Path
 
     from numpy.typing import ArrayLike
+    from pytest_mock import MockerFixture
 
 
 subset_func2 = subset_func
@@ -127,14 +129,18 @@ def make_randomized_mask(size: int) -> np.ndarray:
     return randomized_mask
 
 
-# non-random indices, with alternating one false and n true
-def make_alternating_mask(size: int) -> np.ndarray:
+def make_alternating_mask(size: int, step: int) -> np.ndarray:
     mask_alternating = np.ones(size, dtype=bool)
-    for i in range(0, size, 10):  # 10 is enough to trigger new behavior
+    for i in range(0, size, step):  # 5 is too low to trigger new behavior
         mask_alternating[i] = False
     return mask_alternating
 
 
+# non-random indices, with alternating one false and n true
+make_alternating_mask_5 = partial(make_alternating_mask, step=5)
+make_alternating_mask_10 = partial(make_alternating_mask, step=10)
+
+
 def make_one_group_mask(size: int) -> np.ndarray:
     one_group_mask = np.zeros(size, dtype=bool)
     one_group_mask[size // 4 : size // 2] = True
@@ -149,26 +155,53 @@ def make_one_elem_mask(size: int) -> np.ndarray:
 
 # test behavior from https://github.com/scverse/anndata/pull/1233
 @pytest.mark.parametrize(
-    "make_bool_mask",
+    "make_bool_mask,should_trigger_optimization",
     [
-        make_randomized_mask,
-        make_alternating_mask,
-        make_one_group_mask,
-        make_one_elem_mask,
+        (make_randomized_mask, None),
+        (make_alternating_mask_10, True),
+        (make_alternating_mask_5, False),
+        (make_one_group_mask, True),
+        (make_one_elem_mask, False),
     ],
-    ids=["randomized", "alternating", "one_group", "one_elem"],
+    ids=["randomized", "alternating_10", "alternating_5", "one_group", "one_elem"],
 )
 def test_consecutive_bool(
+    mocker: MockerFixture,
     ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
     make_bool_mask: Callable[[int], np.ndarray],
+    should_trigger_optimization: bool | None,
 ):
+    """Tests for optimization from https://github.com/scverse/anndata/pull/1233
+
+    Parameters
+    ----------
+    mocker
+        Mocker object
+    ondisk_equivalent_adata
+        AnnData objects with sparse X for testing
+    make_bool_mask
+        Function for creating a boolean mask.
+    should_trigger_optimization
+        Whether or not a given mask should trigger the optimized behavior.
+    """
     _, csr_disk, csc_disk, _ = ondisk_equivalent_adata
     mask = make_bool_mask(csr_disk.shape[0])
 
     # indexing needs to be on `X` directly to trigger the optimization.
     # `_normalize_indices`, which is used by `AnnData`, converts bools to ints with `np.where`
+    from anndata._core import sparse_dataset
+
+    spy = mocker.spy(sparse_dataset, "get_compressed_vectors_for_slices")
     assert_equal(csr_disk.X[mask, :], csr_disk.X[np.where(mask)])
+    if should_trigger_optimization is not None:
+        assert (
+            spy.call_count == 1 if should_trigger_optimization else not spy.call_count
+        )
     assert_equal(csc_disk.X[:, mask], csc_disk.X[:, np.where(mask)[0]])
+    if should_trigger_optimization is not None:
+        assert (
+            spy.call_count == 2 if should_trigger_optimization else not spy.call_count
+        )
 
 
 @pytest.mark.parametrize(
diff --git a/docs/release-notes/0.10.6.md b/docs/release-notes/0.10.6.md
index 052523979..76495d20c 100644
--- a/docs/release-notes/0.10.6.md
+++ b/docs/release-notes/0.10.6.md
@@ -5,6 +5,7 @@
 
 * Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold`
 * Writing a dataframe with non-unique column names now throws an error, instead of silently overwriting {pr}`1335` {user}`ivirshup`
+* Fix mean slice length checking to use improved performance when indexing backed sparse matrices with boolean masks along their major axis {pr}`1366` {user}`ilan-gold`
 
 ```{rubric} Documentation
 ```
diff --git a/pyproject.toml b/pyproject.toml
index 25d57bb85..663fb21b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,6 +95,7 @@ test = [
     "awkward>=2.3",
     "pyarrow",
     "pytest_memray",
+    "pytest-mock"
 ]
 gpu = ["cupy"]
 
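
For context, a standalone sketch of why the old mean_slice_length formula misbehaves; it is not part of the patch, and the mask, sizes, and variable names below are made up for illustration. subset_by_major_axis_mask turns a boolean mask into slices via np.ma.extras._ezclump and uses the mean slice length as a heuristic for whether slice-based reads pay off. Dividing the overall span (first start to last stop) by the number of slices counts the False gaps between runs, so a few short runs spread far apart look like long ones; summing the actual run lengths does not.

from math import floor

import numpy as np

# Made-up mask: three True runs of length 2, spread across 100 rows.
mask = np.zeros(100, dtype=bool)
mask[0:2] = True
mask[50:52] = True
mask[98:100] = True

# _ezclump collects the True runs as slices:
# [slice(0, 2), slice(50, 52), slice(98, 100)]
slices = np.ma.extras._ezclump(mask)

# Old heuristic: overall span / number of slices. The span includes the
# False gaps, so this gives floor(100 / 3) = 33.
old_mean = floor((slices[-1].stop - slices[0].start) / len(slices))

# Fixed heuristic: mean of the actual run lengths, floor(6 / 3) = 2.
new_mean = floor(sum(s.stop - s.start for s in slices) / len(slices))

print(old_mean, new_mean)  # 33 2

With the inflated value, a mask like this could make the slice-optimized read path look worthwhile even though every contiguous read is tiny; the corrected mean reflects the real read sizes, which is what the alternating_5 / alternating_10 cases in the test above exercise.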