From 1b7c1ed7c75e8f15b84f6214ecd71277ff3fae74 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 25 Jul 2023 21:24:10 +0000 Subject: [PATCH] Basic concatenation support + bedtime --- anndata/_core/merge.py | 159 +++++++++++++++++++++++++++--- anndata/tests/helpers.py | 41 +++++++- anndata/tests/test_concatenate.py | 30 +++++- anndata/tests/test_views.py | 29 +----- pyproject.toml | 1 + 5 files changed, 217 insertions(+), 43 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 95926da29..9f53ea3a6 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -29,7 +29,7 @@ from scipy.sparse import spmatrix from .anndata import AnnData -from ..compat import AwkArray, DaskArray +from ..compat import AwkArray, DaskArray, CupySparseMatrix, CupyArray from ..utils import asarray, dim_len from .index import _subset, make_slice from anndata._warnings import ExperimentalFeatureWarning @@ -132,24 +132,39 @@ def equal_array(a, b) -> bool: return equal(pd.DataFrame(a), pd.DataFrame(asarray(b))) +@equal.register(CupyArray) +def equal_cupyarray(a, b) -> bool: + import cupy as cp + + return bool(cp.array_equal(a, b, equal_nan=True)) + + @equal.register(pd.Series) def equal_series(a, b) -> bool: return a.equals(b) @equal.register(sparse.spmatrix) +@equal.register(CupySparseMatrix) def equal_sparse(a, b) -> bool: # It's a weird api, don't blame me - if isinstance(b, sparse.spmatrix): + import array_api_compat + + xp = array_api_compat.array_namespace(a.data) + + if isinstance(b, (CupySparseMatrix, sparse.spmatrix)): comp = a != b if isinstance(comp, bool): return not comp + if isinstance(comp, CupySparseMatrix): + # https://github.com/cupy/cupy/issues/7751 + comp = comp.get() # fmt: off return ( (len(comp.data) == 0) or ( - np.isnan(a[comp]).all() - and np.isnan(b[comp]).all() + xp.isnan(a[comp]).all() + and xp.isnan(b[comp]).all() ) ) # fmt: on @@ -171,6 +186,17 @@ def as_sparse(x): return x +def as_cp_sparse(x) -> CupySparseMatrix: + import cupyx.scipy.sparse as cpsparse + + if isinstance(x, cpsparse.spmatrix): + return x + elif isinstance(x, np.ndarray): + return cpsparse.csr_matrix(as_sparse(x)) + else: + return cpsparse.csr_matrix(x) + + def unify_dtypes(dfs: Iterable[pd.DataFrame]) -> list[pd.DataFrame]: """ Attempts to unify datatypes from multiple dataframes. @@ -268,6 +294,43 @@ def check_combinable_cols(cols: list[pd.Index], join: Literal["inner", "outer"]) ) +# TODO: open PR or feature request to cupy +def _cpblock_diag(mats, format=None, dtype=None): + """ + Modified version of scipy.sparse.block_diag for cupy sparse. + """ + import cupy as cp + from cupyx.scipy import sparse as cpsparse + + row = [] + col = [] + data = [] + r_idx = 0 + c_idx = 0 + for a in mats: + # if isinstance(a, (list, numbers.Number)): + # a = cpsparse.coo_matrix(a) + nrows, ncols = a.shape + if cpsparse.issparse(a): + a = a.tocoo() + row.append(a.row + r_idx) + col.append(a.col + c_idx) + data.append(a.data) + else: + a_row, a_col = cp.divmod(cp.arange(nrows * ncols), ncols) + row.append(a_row + r_idx) + col.append(a_col + c_idx) + data.append(a.reshape(-1)) + r_idx += nrows + c_idx += ncols + row = cp.concatenate(row) + col = cp.concatenate(col) + data = cp.concatenate(data) + return cpsparse.coo_matrix( + (data, (row, col)), shape=(r_idx, c_idx), dtype=dtype + ).asformat(format) + + ################### # Per element logic ################### @@ -430,6 +493,10 @@ def apply(self, el, *, axis, fill_value=None): return self._apply_to_awkward(el, axis=axis, fill_value=fill_value) elif isinstance(el, DaskArray): return self._apply_to_dask_array(el, axis=axis, fill_value=fill_value) + elif isinstance(el, CupyArray): + return self._apply_to_cupy_array(el, axis=axis, fill_value=fill_value) + elif isinstance(el, CupySparseMatrix): + return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) else: return self._apply_to_array(el, axis=axis, fill_value=fill_value) @@ -457,6 +524,32 @@ def _apply_to_dask_array(self, el: DaskArray, *, axis, fill_value=None): return sub_el + def _apply_to_cupy_array(self, el, *, axis, fill_value=None): + import cupy as cp + + if fill_value is None: + fill_value = default_fill_value([el]) + if el.shape[axis] == 0: + # Presumably faster since it won't allocate the full array + shape = list(el.shape) + shape[axis] = len(self.new_idx) + return cp.broadcast_to(cp.asarray(fill_value), tuple(shape)) + + old_idx_tuple = [slice(None)] * len(el.shape) + old_idx_tuple[axis] = self.old_pos + old_idx_tuple = tuple(old_idx_tuple) + new_idx_tuple = [slice(None)] * len(el.shape) + new_idx_tuple[axis] = self.new_pos + new_idx_tuple = tuple(new_idx_tuple) + + out_shape = list(el.shape) + out_shape[axis] = len(self.new_idx) + + out = cp.full(tuple(out_shape), fill_value) + out[new_idx_tuple] = el[old_idx_tuple] + + return out + def _apply_to_array(self, el, *, axis, fill_value=None): if fill_value is None: fill_value = default_fill_value([el]) @@ -474,12 +567,20 @@ def _apply_to_array(self, el, *, axis, fill_value=None): ) def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: + if isinstance(el, CupySparseMatrix): + from cupyx.scipy import sparse + else: + from scipy import sparse + import array_api_compat + + xp = array_api_compat.array_namespace(el.data) + if fill_value is None: fill_value = default_fill_value([el]) if fill_value != 0: to_fill = self.new_idx.get_indexer(self.new_idx.difference(self.old_idx)) else: - to_fill = np.array([]) + to_fill = xp.array([]) # Fixing outer indexing for missing values if el.shape[axis] == 0: @@ -489,18 +590,21 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: if fill_value == 0: return sparse.csr_matrix(shape) else: - return np.broadcast_to(fill_value, shape) + return type(el)(xp.broadcast_to(xp.asarray(fill_value), shape)) fill_idxer = None - if len(to_fill) > 0: - idxmtx_dtype = np.promote_types(el.dtype, np.array(fill_value).dtype) + if len(to_fill) > 0 or isinstance(el, CupySparseMatrix): + idxmtx_dtype = xp.promote_types(el.dtype, xp.array(fill_value).dtype) else: idxmtx_dtype = bool if axis == 1: idxmtx = sparse.coo_matrix( - (np.ones(len(self.new_pos), dtype=bool), (self.old_pos, self.new_pos)), + ( + xp.ones(len(self.new_pos), dtype=idxmtx_dtype), + (xp.asarray(self.old_pos), xp.asarray(self.new_pos)), + ), shape=(len(self.old_idx), len(self.new_idx)), dtype=idxmtx_dtype, ) @@ -511,7 +615,10 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: fill_idxer = (slice(None), to_fill) elif axis == 0: idxmtx = sparse.coo_matrix( - (np.ones(len(self.new_pos), dtype=bool), (self.new_pos, self.old_pos)), + ( + xp.ones(len(self.new_pos), dtype=idxmtx_dtype), + (xp.asarray(self.new_pos), xp.asarray(self.old_pos)), + ), shape=(len(self.new_idx), len(self.old_idx)), dtype=idxmtx_dtype, ) @@ -626,6 +733,31 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ) return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)], axis=axis) + elif any(isinstance(a, CupySparseMatrix) for a in arrays): + import cupyx.scipy.sparse as cpsparse + + sparse_stack = (cpsparse.vstack, cpsparse.hstack)[axis] + return sparse_stack( + [ + f(as_cp_sparse(a), axis=1 - axis, fill_value=fill_value) + for f, a in zip(reindexers, arrays) + ], + format="csr", + ) + elif any(isinstance(a, CupyArray) for a in arrays): + import cupy as cp + + if not all(isinstance(a, CupyArray) or 0 in a.shape for a in arrays): + raise NotImplementedError( + "Cannot concatenate a cupy array with other array types." + ) + return cp.concatenate( + [ + f(cp.asarray(x), fill_value=fill_value, axis=1 - axis) + for f, x in zip(reindexers, arrays) + ], + axis=axis, + ) elif any(isinstance(a, sparse.spmatrix) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] return sparse_stack( @@ -774,7 +906,12 @@ def concat_pairwise_mapping( m.get(k, sparse.csr_matrix((s, s), dtype=bool)) for m, s in zip(mappings, shapes) ] - result[k] = sparse.block_diag(els, format="csr") + if all(isinstance(el, (CupySparseMatrix, CupyArray)) for el in els): + from cupyx.scipy import sparse as cpsparse + + result[k] = _cpblock_diag(els, format="csr") + else: + result[k] = sparse.block_diag(els, format="csr") return result diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 47a0d8899..c08309114 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -17,7 +17,14 @@ from anndata._core.sparse_dataset import SparseDataset from anndata._core.aligned_mapping import AlignedMapping from anndata.utils import asarray -from anndata.compat import AwkArray, DaskArray, CupySparseMatrix, CupyArray +from anndata.compat import ( + AwkArray, + DaskArray, + CupySparseMatrix, + CupyArray, + CupyCSCMatrix, + CupyCSRMatrix, +) # Give this to gen_adata when dask array support is expected. GEN_ADATA_DASK_ARGS = dict( @@ -592,3 +599,35 @@ def as_dense_dask_array(a): @as_dense_dask_array.register(sparse.spmatrix) def _(a): return as_dense_dask_array(a.toarray()) + + +def as_cupy_type(val, typ): + """ + Rough conversion function + """ + if issubclass(typ, CupyArray): + import cupy as cp + + if isinstance(val, sparse.spmatrix): + val = val.toarray() + return cp.array(val) + elif issubclass(typ, CupyCSRMatrix): + import cupyx.scipy.sparse as cpsparse + import cupy as cp + + if isinstance(val, np.ndarray): + return cpsparse.csr_matrix(cp.array(val)) + else: + return cpsparse.csr_matrix(val) + elif issubclass(typ, CupyCSCMatrix): + import cupyx.scipy.sparse as cpsparse + import cupy as cp + + if isinstance(val, np.ndarray): + return cpsparse.csr_matrix(cp.array(val)) + else: + return cpsparse.csr_matrix(val) + else: + raise NotImplementedError( + f"Conversion from {type(val)} to {typ} not implemented" + ) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 275c6fb11..f53ae0117 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -21,11 +21,12 @@ from anndata.tests.helpers import ( assert_equal, as_dense_dask_array, + as_cupy_type, gen_adata, GEN_ADATA_DASK_ARGS, ) from anndata.utils import asarray -from anndata.compat import DaskArray, AwkArray +from anndata.compat import DaskArray, AwkArray, CupyArray, CupyCSRMatrix, CupyCSCMatrix @singledispatch @@ -72,11 +73,34 @@ def make_idx_tuple(idx, axis): return tuple(tup) +# TODO: copied from test_views, consider deduplication +_matrix_casting_params = [ + pytest.param(asarray, id="np_array"), + pytest.param(sparse.csr_matrix, id="scipy_csr"), + pytest.param(sparse.csc_matrix, id="scipy_csc"), + pytest.param(as_dense_dask_array, id="dask_array"), +] +_cupy_casting_params = [ + pytest.param( + partial(as_cupy_type, typ=CupyArray), id="cupy_array", marks=pytest.mark.gpu + ), + pytest.param( + partial(as_cupy_type, typ=CupyCSRMatrix), + id="cupy_csr", + marks=pytest.mark.gpu, + ), + pytest.param( + partial(as_cupy_type, typ=CupyCSCMatrix), + id="cupy_csc", + marks=pytest.mark.gpu, + ), +] + + # Will call func(sparse_matrix) so these types should be sparse compatible # See array_type if only dense arrays are expected as input. @pytest.fixture( - params=[asarray, sparse.csr_matrix, sparse.csc_matrix, as_dense_dask_array], - ids=["np_array", "scipy_csr", "scipy_csc", "dask_array"], + params=_matrix_casting_params + _cupy_casting_params, ) def array_type(request): return request.param diff --git a/anndata/tests/test_views.py b/anndata/tests/test_views.py index d90845a7f..1e8668453 100644 --- a/anndata/tests/test_views.py +++ b/anndata/tests/test_views.py @@ -21,6 +21,7 @@ single_subset, assert_equal, as_dense_dask_array, + as_cupy_type, GEN_ADATA_DASK_ARGS, ) @@ -72,34 +73,6 @@ def adata_parameterized(request): return gen_adata(shape=(200, 300), X_type=request.param) -def as_cupy_type(val, typ): - """ - Rough conversion function - """ - if issubclass(typ, CupyArray): - import cupy as cp - - return cp.array(val) - elif issubclass(typ, CupyCSRMatrix): - import cupyx.scipy.sparse as cpsparse - import cupy as cp - - if isinstance(val, np.ndarray): - return cpsparse.csr_matrix(cp.array(val)) - else: - return cpsparse.csr_matrix(val) - elif issubclass(typ, CupyCSCMatrix): - import cupyx.scipy.sparse as cpsparse - import cupy as cp - - if isinstance(val, np.ndarray): - return cpsparse.csr_matrix(cp.array(val)) - else: - return cpsparse.csr_matrix(val) - else: - raise NotImplementedError(f"Conversion to {typ} not implemented") - - _matrix_casting_params = [ pytest.param(np.array, id="np_array"), pytest.param(sparse.csr_matrix, id="scipy_csr"), diff --git a/pyproject.toml b/pyproject.toml index 5d8967aca..9690b19fb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,6 +43,7 @@ dependencies = [ "h5py>=3", "natsort", "packaging>=20", + "array_api_compat", ] dynamic = ["version"]