From 2f7357695ae91a00dd91d0c5f9451574678d7289 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 29 Apr 2022 17:38:40 +0200 Subject: [PATCH 001/125] Start backed sparse support for zarr --- anndata/_core/anndata.py | 6 +- anndata/_core/file_backing.py | 14 +- anndata/_core/raw.py | 6 +- anndata/_core/sparse_dataset.py | 248 +++++++++++++++--- anndata/_io/h5ad.py | 10 +- anndata/_io/specs/methods.py | 18 +- anndata/_io/utils.py | 4 +- .../multi_files/_anncollection.py | 4 +- anndata/tests/helpers.py | 4 +- anndata/tests/test_backed_sparse.py | 44 ++-- anndata/utils.py | 4 +- 11 files changed, 278 insertions(+), 84 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 1ccb1a03f..928187b7a 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -43,7 +43,7 @@ as_view, _resolve_idxs, ) -from .sparse_dataset import SparseDataset +from .sparse_dataset import sparse_dataset from .. import utils from ..utils import convert_to_dict, ensure_df_homogeneous from ..logging import anndata_logger as logger @@ -610,7 +610,7 @@ def X(self) -> Optional[Union[np.ndarray, sparse.spmatrix, ArrayView]]: self.file.open() X = self.file["X"] if isinstance(X, h5py.Group): - X = SparseDataset(X) + X = sparse_dataset(X) # This is so that we can index into a backed dense dataset with # indices that aren’t strictly increasing if self.is_view: @@ -675,7 +675,7 @@ def X(self, value: Optional[Union[np.ndarray, sparse.spmatrix]]): if self.is_view: X = self.file["X"] if isinstance(X, h5py.Group): - X = SparseDataset(X) + X = sparse_dataset(X) X[oidx, vidx] = value else: self._set_backed("X", value) diff --git a/anndata/_core/file_backing.py b/anndata/_core/file_backing.py index 3e48beccb..86bd6a0a5 100644 --- a/anndata/_core/file_backing.py +++ b/anndata/_core/file_backing.py @@ -6,7 +6,7 @@ import h5py from . import anndata -from .sparse_dataset import SparseDataset +from .sparse_dataset import BaseCompressedSparseDataset from ..compat import Literal, ZarrArray @@ -38,11 +38,15 @@ def __contains__(self, x) -> bool: def __iter__(self) -> Iterator[str]: return iter(self._file) - def __getitem__(self, key: str) -> Union[h5py.Group, h5py.Dataset, SparseDataset]: + def __getitem__( + self, key: str + ) -> Union[h5py.Group, h5py.Dataset, BaseCompressedSparseDataset]: return self._file[key] def __setitem__( - self, key: str, value: Union[h5py.Group, h5py.Dataset, SparseDataset] + self, + key: str, + value: Union[h5py.Group, h5py.Dataset, BaseCompressedSparseDataset], ): self._file[key] = value @@ -106,6 +110,6 @@ def _(x): return x[...] -@to_memory.register(SparseDataset) -def _(x: SparseDataset): +@to_memory.register(BaseCompressedSparseDataset) +def _(x: BaseCompressedSparseDataset): return x.to_memory() diff --git a/anndata/_core/raw.py b/anndata/_core/raw.py index 39cee1c4e..a8b2ecc18 100644 --- a/anndata/_core/raw.py +++ b/anndata/_core/raw.py @@ -9,7 +9,7 @@ from . 
import anndata from .index import _normalize_index, _subset, unpack_index, get_vector from .aligned_mapping import AxisArrays, AxisArraysView -from .sparse_dataset import SparseDataset +from .sparse_dataset import BaseCompressedSparseDataset, sparse_dataset # TODO: Implement views for Raw @@ -43,7 +43,7 @@ def _get_X(self, layer=None): return self.X @property - def X(self) -> Union[SparseDataset, np.ndarray, sparse.spmatrix]: + def X(self) -> Union[BaseCompressedSparseDataset, np.ndarray, sparse.spmatrix]: # TODO: Handle unsorted array of integer indices for h5py.Datasets if not self._adata.isbacked: return self._X @@ -60,7 +60,7 @@ def X(self) -> Union[SparseDataset, np.ndarray, sparse.spmatrix]: f"{self._adata.file.filename}." ) if isinstance(X, h5py.Group): - X = SparseDataset(X) + X = sparse_dataset(X) # Check if we need to subset if self._adata.is_view: # TODO: As noted above, implement views of raw diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 8654e8dd7..7e49484f1 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -10,6 +10,7 @@ # TODO: # - think about supporting the COO format +from abc import ABC import collections.abc as cabc from itertools import accumulate, chain from typing import Union, NamedTuple, Tuple, Sequence, Iterable, Type @@ -47,7 +48,7 @@ class BackedSparseMatrix(_cs_matrix): def copy(self) -> ss.spmatrix: if isinstance(self.data, h5py.Dataset): - return SparseDataset(self.data.parent).to_memory() + return sparse_dataset(self.data.parent).to_memory() else: return super().copy() @@ -227,45 +228,45 @@ def get_backed_class(format_str: str) -> Type[BackedSparseMatrix]: raise ValueError(f"Format string {format_str} is not supported.") -class SparseDataset: - """Analogous to :class:`h5py.Dataset `, but for sparse matrices.""" +def _get_group_format(group) -> str: + if "h5sparse_format" in group.attrs: + # TODO: Warn about an old format + # If this is only just going to be public, I could insist it's not like this + return _read_attr(group.attrs, "h5sparse_format") + else: + # Should this be an extra field? + return _read_attr(group.attrs, "encoding-type").replace("_matrix", "") + + +class BaseCompressedSparseDataset(ABC): + """ + Analogous to :class:`h5py.Dataset ` or `zarr.Array`, but for sparse matrices. + """ def __init__(self, group: h5py.Group): + type(self)._check_group_format(group) self.group = group @property def dtype(self) -> np.dtype: return self.group["data"].dtype - @property - def format_str(self) -> str: - if "h5sparse_format" in self.group.attrs: - return _read_attr(self.group.attrs, "h5sparse_format") - else: - # Should this be an extra field? - return _read_attr(self.group.attrs, "encoding-type").replace("_matrix", "") - - @property - def h5py_group(self) -> h5py.Group: - warn( - "Attribute `h5py_group` of SparseDatasets is deprecated. 
" - "Use `group` instead.", - DeprecationWarning, - ) - return self.group + @classmethod + def _check_group_format(cls, group): + group_format = _get_group_format(group) + assert group_format == cls.format_str @property def name(self) -> str: return self.group.name - @property - def file(self) -> h5py.File: - return self.group.file - @property def shape(self) -> Tuple[int, int]: - shape = self.group.attrs.get("h5sparse_shape") - return tuple(self.group.attrs["shape"] if shape is None else shape) + shape = _read_attr(self.group.attrs, "shape", None) + if shape is None: + # TODO warn + shape = self.group.attrs.get("h5sparse_shape") + return tuple(shape) @property def value(self) -> ss.spmatrix: @@ -273,14 +274,14 @@ def value(self) -> ss.spmatrix: def __repr__(self) -> str: return ( - f"' ) def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]: row, col = self._normalize_index(index) - mtx = self.to_backed() + mtx = self._to_backed() sub = mtx[row, col] # If indexing is array x array it returns a backed_sparse_matrix # Not sure what the performance is on that operation @@ -289,11 +290,6 @@ def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatri else: return sub - def __setitem__(self, index: Union[Index, Tuple[()]], value): - row, col = self._normalize_index(index) - mock_matrix = self.to_backed() - mock_matrix[row, col] = value - def _normalize_index( self, index: Union[Index, Tuple[()]] ) -> Tuple[np.ndarray, np.ndarray]: @@ -304,11 +300,18 @@ def _normalize_index( row, col = np.ix_(row, col) return row, col + # def __setitem__(self, index: Union[Index, Tuple[()]], value): + + # row, col = self._normalize_index(index) + # mock_matrix = self._to_backed() + # mock_matrix[row, col] = value + + # TODO: split to other classes? def append(self, sparse_matrix: ss.spmatrix): # Prep variables shape = self.shape - if isinstance(sparse_matrix, SparseDataset): - sparse_matrix = sparse_matrix.to_backed() + if isinstance(sparse_matrix, BaseCompressedSparseDataset): + sparse_matrix = sparse_matrix._to_backed() # Check input if not ss.isspmatrix(sparse_matrix): @@ -365,7 +368,7 @@ def append(self, sparse_matrix: ss.spmatrix): indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) indices[orig_data_size:] = sparse_matrix.indices - def to_backed(self) -> BackedSparseMatrix: + def _to_backed(self) -> BackedSparseMatrix: format_class = get_backed_class(self.format_str) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"] @@ -382,6 +385,179 @@ def to_memory(self) -> ss.spmatrix: return mtx -@_subset.register(SparseDataset) +class CSRDataset(BaseCompressedSparseDataset): + format_str = "csr" + + +class CSCDataset(BaseCompressedSparseDataset): + format_str = "csc" + + +# class SparseDataset: +# """Analogous to :class:`h5py.Dataset `, but for sparse matrices.""" + +# def __init__(self, group: h5py.Group): +# self.group = group + +# @property +# def dtype(self) -> np.dtype: +# return self.group["data"].dtype + +# @property +# def format_str(self) -> str: +# if "h5sparse_format" in self.group.attrs: +# return _read_attr(self.group.attrs, "h5sparse_format") +# else: +# # Should this be an extra field? +# return _read_attr(self.group.attrs, "encoding-type").replace("_matrix", "") + +# @property +# def h5py_group(self) -> h5py.Group: +# warn( +# "Attribute `h5py_group` of SparseDatasets is deprecated. 
" +# "Use `group` instead.", +# DeprecationWarning, +# ) +# return self.group + +# @property +# def name(self) -> str: +# return self.group.name + +# @property +# def file(self) -> h5py.File: +# return self.group.file + +# @property +# def shape(self) -> Tuple[int, int]: +# shape = self.group.attrs.get("h5sparse_shape") +# return tuple(self.group.attrs["shape"] if shape is None else shape) + +# @property +# def value(self) -> ss.spmatrix: +# return self.to_memory() + +# def __repr__(self) -> str: +# return ( +# f"' +# ) + +# def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]: +# row, col = self._normalize_index(index) +# mtx = self.to_backed() +# sub = mtx[row, col] +# # If indexing is array x array it returns a backed_sparse_matrix +# # Not sure what the performance is on that operation +# if isinstance(sub, BackedSparseMatrix): +# return get_memory_class(self.format_str)(sub) +# else: +# return sub + +# def __setitem__(self, index: Union[Index, Tuple[()]], value): +# row, col = self._normalize_index(index) +# mock_matrix = self.to_backed() +# mock_matrix[row, col] = value + +# def _normalize_index( +# self, index: Union[Index, Tuple[()]] +# ) -> Tuple[np.ndarray, np.ndarray]: +# if index == (): +# index = slice(None) +# row, col = unpack_index(index) +# if all(isinstance(x, cabc.Iterable) for x in (row, col)): +# row, col = np.ix_(row, col) +# return row, col + +# def append(self, sparse_matrix: ss.spmatrix): +# # Prep variables +# shape = self.shape +# if isinstance(sparse_matrix, SparseDataset): +# sparse_matrix = sparse_matrix.to_backed() + +# # Check input +# if not ss.isspmatrix(sparse_matrix): +# raise NotImplementedError( +# "Currently, only sparse matrices of equivalent format can be " +# "appended to a SparseDataset." +# ) +# if self.format_str not in {"csr", "csc"}: +# raise NotImplementedError( +# f"The append method for format {self.format_str} " +# f"is not implemented." +# ) +# if self.format_str != get_format_str(sparse_matrix): +# raise ValueError( +# f"Matrices must have same format. Currently are " +# f"{self.format_str!r} and {get_format_str(sparse_matrix)!r}" +# ) + +# # shape +# if self.format_str == "csr": +# assert ( +# shape[1] == sparse_matrix.shape[1] +# ), "CSR matrices must have same size of dimension 1 to be appended." +# new_shape = (shape[0] + sparse_matrix.shape[0], shape[1]) +# elif self.format_str == "csc": +# assert ( +# shape[0] == sparse_matrix.shape[0] +# ), "CSC matrices must have same size of dimension 0 to be appended." 
+# new_shape = (shape[0], shape[1] + sparse_matrix.shape[1]) +# else: +# assert False, "We forgot to update this branching to a new format" +# if "h5sparse_shape" in self.group.attrs: +# del self.group.attrs["h5sparse_shape"] +# self.group.attrs["shape"] = new_shape + +# # data +# data = self.group["data"] +# orig_data_size = data.shape[0] +# data.resize((orig_data_size + sparse_matrix.data.shape[0],)) +# data[orig_data_size:] = sparse_matrix.data + +# # indptr +# indptr = self.group["indptr"] +# orig_data_size = indptr.shape[0] +# append_offset = indptr[-1] +# indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,)) +# indptr[orig_data_size:] = ( +# sparse_matrix.indptr[1:].astype(np.int64) + append_offset +# ) + +# # indices +# indices = self.group["indices"] +# orig_data_size = indices.shape[0] +# indices.resize((orig_data_size + sparse_matrix.indices.shape[0],)) +# indices[orig_data_size:] = sparse_matrix.indices + +# def _to_backed(self) -> BackedSparseMatrix: +# format_class = get_backed_class(self.format_str) +# mtx = format_class(self.shape, dtype=self.dtype) +# mtx.data = self.group["data"] +# mtx.indices = self.group["indices"] +# mtx.indptr = self.group["indptr"][:] +# return mtx + +# def to_memory(self) -> ss.spmatrix: +# format_class = get_memory_class(self.format_str) +# mtx = format_class(self.shape, dtype=self.dtype) +# mtx.data = self.group["data"][...] +# mtx.indices = self.group["indices"][...] +# mtx.indptr = self.group["indptr"][...] +# return mtx + + +def sparse_dataset(group) -> BaseCompressedSparseDataset: + + # encoding_type = _read_attr(group, "encoding-type") + encoding_type = _get_group_format(group) + if encoding_type == "csr": + return CSRDataset(group) + elif encoding_type == "csc": + return CSCDataset(group) + + +@_subset.register(BaseCompressedSparseDataset) def subset_sparsedataset(d, subset_idx): return d[subset_idx] diff --git a/anndata/_io/h5ad.py b/anndata/_io/h5ad.py index d3a9c4ba6..d5f41aa88 100644 --- a/anndata/_io/h5ad.py +++ b/anndata/_io/h5ad.py @@ -11,7 +11,7 @@ import pandas as pd from scipy import sparse -from .._core.sparse_dataset import SparseDataset +from .._core.sparse_dataset import BaseCompressedSparseDataset from .._core.file_backing import AnnDataFileManager from .._core.anndata import AnnData from ..compat import ( @@ -78,13 +78,15 @@ def write_h5ad( f.attrs.setdefault("encoding-type", "anndata") f.attrs.setdefault("encoding-version", "0.1.0") - if "X" in as_dense and isinstance(adata.X, (sparse.spmatrix, SparseDataset)): + if "X" in as_dense and isinstance( + adata.X, (sparse.spmatrix, BaseCompressedSparseDataset) + ): write_sparse_as_dense(f, "X", adata.X, dataset_kwargs=dataset_kwargs) elif not (adata.isbacked and Path(adata.filename) == Path(filepath)): # If adata.isbacked, X should already be up to date write_elem(f, "X", adata.X, dataset_kwargs=dataset_kwargs) if "raw/X" in as_dense and isinstance( - adata.raw.X, (sparse.spmatrix, SparseDataset) + adata.raw.X, (sparse.spmatrix, BaseCompressedSparseDataset) ): write_sparse_as_dense( f, "raw/X", adata.raw.X, dataset_kwargs=dataset_kwargs @@ -110,7 +112,7 @@ def write_sparse_as_dense(f, key, value, dataset_kwargs=MappingProxyType({})): real_key = None # Flag for if temporary key was used if key in f: if ( - isinstance(value, (h5py.Group, h5py.Dataset, SparseDataset)) + isinstance(value, (h5py.Group, h5py.Dataset, BaseCompressedSparseDataset)) and value.file.filename == f.file.filename ): # Write to temporary key before overwriting real_key = key diff --git 
a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 71c216a2a..37ccf66d8 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -16,7 +16,7 @@ from anndata import AnnData, Raw from anndata._core.index import _normalize_indices from anndata._core.merge import intersect_keys -from anndata._core.sparse_dataset import SparseDataset +from anndata._core.sparse_dataset import CSCDataset, CSRDataset, sparse_dataset from anndata._core import views from anndata.compat import ( Literal, @@ -88,7 +88,7 @@ def read_basic(elem): if isinstance(elem, Mapping): # Backwards compat sparse arrays if "h5sparse_format" in elem.attrs: - return SparseDataset(elem).to_memory() + return sparse_dataset(elem).to_memory() return {k: read_elem(v) for k, v in elem.items()} elif isinstance(elem, h5py.Dataset): return h5ad.read_dataset(elem) # TODO: Handle legacy @@ -108,7 +108,7 @@ def read_basic_zarr(elem): if isinstance(elem, Mapping): # Backwards compat sparse arrays if "h5sparse_format" in elem.attrs: - return SparseDataset(elem).to_memory() + return sparse_dataset(elem).to_memory() return {k: read_elem(v) for k, v in elem.items()} elif isinstance(elem, ZarrArray): return zarr.read_dataset(elem) # TODO: Handle legacy @@ -456,11 +456,13 @@ def write_sparse_compressed( ) -@_REGISTRY.register_write(H5Group, SparseDataset, IOSpec("", "0.1.0")) -@_REGISTRY.register_write(ZarrGroup, SparseDataset, IOSpec("", "0.1.0")) +@_REGISTRY.register_write(H5Group, CSRDataset, IOSpec("", "0.1.0")) +@_REGISTRY.register_write(H5Group, CSCDataset, IOSpec("", "0.1.0")) +@_REGISTRY.register_write(ZarrGroup, CSRDataset, IOSpec("", "0.1.0")) +@_REGISTRY.register_write(ZarrGroup, CSCDataset, IOSpec("", "0.1.0")) def write_sparse_dataset(f, k, elem, dataset_kwargs=MappingProxyType({})): write_sparse_compressed( - f, k, elem.to_backed(), fmt=elem.format_str, dataset_kwargs=dataset_kwargs + f, k, elem._to_backed(), fmt=elem.format_str, dataset_kwargs=dataset_kwargs ) # TODO: Cleaner way to do this f[k].attrs["encoding-type"] = f"{elem.format_str}_matrix" @@ -472,13 +474,13 @@ def write_sparse_dataset(f, k, elem, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse(elem): - return SparseDataset(elem).to_memory() + return sparse_dataset(elem).to_memory() @_REGISTRY.register_read_partial(H5Group, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read_partial(H5Group, IOSpec("csr_matrix", "0.1.0")) def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))): - return SparseDataset(elem)[indices] + return sparse_dataset(elem)[indices] ############## diff --git a/anndata/_io/utils.py b/anndata/_io/utils.py index 24021a8ab..1e3a141ab 100644 --- a/anndata/_io/utils.py +++ b/anndata/_io/utils.py @@ -5,7 +5,7 @@ from packaging import version import h5py -from .._core.sparse_dataset import SparseDataset +from .._core.sparse_dataset import BaseCompressedSparseDataset # For allowing h5py v3 # https://github.com/scverse/anndata/issues/442 @@ -154,7 +154,7 @@ def _get_parent(elem): zarr = None if zarr and isinstance(elem, (zarr.Group, zarr.Array)): parent = elem.store # Not sure how to always get a name out of this - elif isinstance(elem, SparseDataset): + elif isinstance(elem, BaseCompressedSparseDataset): parent = elem.group.file.name else: parent = elem.file.name diff --git a/anndata/experimental/multi_files/_anncollection.py 
b/anndata/experimental/multi_files/_anncollection.py index 3ab7201cb..7b879e2ed 100644 --- a/anndata/experimental/multi_files/_anncollection.py +++ b/anndata/experimental/multi_files/_anncollection.py @@ -12,7 +12,7 @@ from ..._core.index import _normalize_indices, _normalize_index, Index from ..._core.views import _resolve_idx from ..._core.merge import concat_arrays, inner_concat_aligned_mapping -from ..._core.sparse_dataset import SparseDataset +from ..._core.sparse_dataset import BaseCompressedSparseDataset from ..._core.aligned_mapping import AxisArrays ATTRS = ["obs", "obsm", "layers"] @@ -361,7 +361,7 @@ def _gather_X(self): # todo: fix arr = X[oidx][:, vidx] Xs.append(arr if reverse is None else arr[reverse]) - elif isinstance(X, SparseDataset): + elif isinstance(X, BaseCompressedSparseDataset): # very slow indexing with two arrays if isinstance(vidx, slice) or len(vidx) <= 1000: Xs.append(X[oidx, vidx]) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index e3ded19ec..5e1e2b6dd 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -13,7 +13,7 @@ from anndata import AnnData, Raw from anndata._core.views import ArrayView -from anndata._core.sparse_dataset import SparseDataset +from anndata._core.sparse_dataset import BaseCompressedSparseDataset from anndata._core.aligned_mapping import AlignedMapping from anndata.utils import asarray @@ -309,7 +309,7 @@ def assert_equal_arrayview(a, b, exact=False, elem_name=None): assert_equal(asarray(a), asarray(b), exact=exact, elem_name=elem_name) -@assert_equal.register(SparseDataset) +@assert_equal.register(BaseCompressedSparseDataset) @assert_equal.register(sparse.spmatrix) def assert_equal_sparse(a, b, exact=False, elem_name=None): a = asarray(a) diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py index 4d3eb72f6..a4f95080a 100644 --- a/anndata/tests/test_backed_sparse.py +++ b/anndata/tests/test_backed_sparse.py @@ -4,28 +4,38 @@ from scipy import sparse import anndata as ad -from anndata._core.sparse_dataset import SparseDataset +from anndata._core.sparse_dataset import sparse_dataset from anndata.tests.helpers import assert_equal, subset_func subset_func2 = subset_func +@pytest.fixture(params=["h5ad", "zarr"]) +def diskfmt(request): + return request.param + + @pytest.fixture(scope="function") -def ondisk_equivalent_adata(tmp_path): - csr_path = tmp_path / "csr.h5ad" - csc_path = tmp_path / "csc.h5ad" - dense_path = tmp_path / "dense.h5ad" +def ondisk_equivalent_adata(tmp_path, diskfmt): + csr_path = tmp_path / f"csr.{diskfmt}" + csc_path = tmp_path / f"csc.{diskfmt}" + dense_path = tmp_path / f"dense.{diskfmt}" + + read = lambda x, **kwargs: getattr(ad, f"read_{diskfmt}")(x, **kwargs) + write = lambda x, pth, **kwargs: getattr(x, f"write_{diskfmt}")(pth, **kwargs) csr_mem = ad.AnnData(X=sparse.random(50, 50, format="csr", density=0.1)) csc_mem = ad.AnnData(X=csr_mem.X.tocsc()) + dense_mem = ad.AnnData(X=csr_mem.X.toarray()) - csr_mem.write_h5ad(csr_path) - csc_mem.write_h5ad(csc_path) - csr_mem.write_h5ad(dense_path, as_dense="X") + write(csr_mem, csr_path) + write(csc_mem, csc_path) + # write(csr_mem, dense_path, as_dense="X") + write(dense_mem, dense_path) - csr_disk = ad.read_h5ad(csr_path, backed="r") - csc_disk = ad.read_h5ad(csc_path, backed="r") - dense_disk = ad.read_h5ad(dense_path, backed="r") + csr_disk = read(csr_path, backed="r") + csc_disk = read(csc_path, backed="r") + dense_disk = read(dense_path, backed="r") return csr_mem, csr_disk, csc_disk, dense_disk 
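
A note on the API under test here: `sparse_dataset(group)` replaces the
`SparseDataset(group)` constructor and dispatches to `CSRDataset` or
`CSCDataset` based on the group's encoding metadata. A minimal sketch of
the usage pattern these tests rely on, assuming an open writable h5py
file; the file name is illustrative only, and per this patch the same
call is meant to work on a `zarr.Group` as well:

    import h5py
    from scipy import sparse

    import anndata as ad
    from anndata._core.sparse_dataset import sparse_dataset

    with h5py.File("demo.h5", "a") as f:
        # write a sparse matrix with the registered IO spec, as the tests do
        ad._io.specs.write_elem(f, "mtx", sparse.random(50, 50, format="csr"))
        backed = sparse_dataset(f["mtx"])  # dispatches to CSRDataset
        sub = backed[0:10, :]  # selection comes back as an in-memory scipy matrix
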
@@ -56,7 +66,7 @@ def test_dataset_append_memory(tmp_path, sparse_format, append_method): with h5py.File(h5_path, "a") as f: ad._io.specs.write_elem(f, "mtx", a) - diskmtx = SparseDataset(f["mtx"]) + diskmtx = sparse_dataset(f["mtx"]) diskmtx.append(b) fromdisk = diskmtx.to_memory() @@ -81,8 +91,8 @@ def test_dataset_append_disk(tmp_path, sparse_format, append_method): with h5py.File(h5_path, "a") as f: ad._io.specs.write_elem(f, "a", a) ad._io.specs.write_elem(f, "b", b) - a_disk = SparseDataset(f["a"]) - b_disk = SparseDataset(f["b"]) + a_disk = sparse_dataset(f["a"]) + b_disk = sparse_dataset(f["b"]) a_disk.append(b_disk) fromdisk = a_disk.to_memory() @@ -107,8 +117,8 @@ def test_wrong_shape(tmp_path, sparse_format, a_shape, b_shape): with h5py.File(h5_path, "a") as f: ad._io.specs.write_elem(f, "a", a_mem) ad._io.specs.write_elem(f, "b", b_mem) - a_disk = SparseDataset(f["a"]) - b_disk = SparseDataset(f["b"]) + a_disk = sparse_dataset(f["a"]) + b_disk = sparse_dataset(f["b"]) with pytest.raises(AssertionError): a_disk.append(b_disk) @@ -120,7 +130,7 @@ def test_wrong_formats(tmp_path): with h5py.File(h5_path, "a") as f: ad._io.specs.write_elem(f, "base", base) - disk_mtx = SparseDataset(f["base"]) + disk_mtx = sparse_dataset(f["base"]) pre_checks = disk_mtx.to_memory() with pytest.raises(ValueError): diff --git a/anndata/utils.py b/anndata/utils.py index bc00b0218..4ae4e1463 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -8,7 +8,7 @@ from scipy import sparse from .logging import get_logger -from ._core.sparse_dataset import SparseDataset +from ._core.sparse_dataset import BaseCompressedSparseDataset logger = get_logger(__name__) @@ -24,7 +24,7 @@ def asarray_sparse(x): return x.toarray() -@asarray.register(SparseDataset) +@asarray.register(BaseCompressedSparseDataset) def asarray_sparse_dataset(x): return asarray(x.value) From a5e03111cb811281a5f6355ebe3489e3dfda03c2 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 8 Nov 2022 20:55:14 +0100 Subject: [PATCH 002/125] Fix sparse_to_dense --- anndata/_core/file_backing.py | 19 ++++++++++++++++++- anndata/_io/h5ad.py | 7 +++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/anndata/_core/file_backing.py b/anndata/_core/file_backing.py index fc88fb5ad..dabfd90a7 100644 --- a/anndata/_core/file_backing.py +++ b/anndata/_core/file_backing.py @@ -8,7 +8,7 @@ from . 
import anndata from .sparse_dataset import BaseCompressedSparseDataset -from ..compat import ZarrArray, DaskArray +from ..compat import ZarrArray, ZarrGroup, DaskArray class AnnDataFileManager: @@ -127,3 +127,20 @@ def _(x, copy=True): @to_memory.register(Mapping) def _(x: Mapping, copy=True): return {k: to_memory(v, copy=copy) for k, v in x.items()} + + +@singledispatch +def filename(x): + raise NotImplementedError(f"Not implemented for {type(x)}") + + +@filename.register(h5py.Group) +@filename.register(h5py.Dataset) +def _(x): + return x.file.filename + + +@filename.register(ZarrArray) +@filename.register(ZarrGroup) +def _(x): + return x.store.path diff --git a/anndata/_io/h5ad.py b/anndata/_io/h5ad.py index 2817b549d..4f77ed12d 100644 --- a/anndata/_io/h5ad.py +++ b/anndata/_io/h5ad.py @@ -12,7 +12,7 @@ from scipy import sparse from .._core.sparse_dataset import BaseCompressedSparseDataset -from .._core.file_backing import AnnDataFileManager +from .._core.file_backing import AnnDataFileManager, filename from .._core.anndata import AnnData from ..compat import ( _from_fixed_length_strings, @@ -110,9 +110,8 @@ def write_h5ad( def write_sparse_as_dense(f, key, value, dataset_kwargs=MappingProxyType({})): real_key = None # Flag for if temporary key was used if key in f: - if ( - isinstance(value, (h5py.Group, h5py.Dataset, BaseCompressedSparseDataset)) - and value.file.filename == f.file.filename + if isinstance(value, BaseCompressedSparseDataset) and ( + filename(value.group) == filename(f) ): # Write to temporary key before overwriting real_key = key # Transform key to temporary, e.g. raw/X -> raw/_X, or X -> _X From 5e3cb02c53a3228cc8204a77f5b2b97e3ab8cc2c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 Feb 2023 17:40:29 +0000 Subject: [PATCH 003/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/_core/sparse_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 7e49484f1..ddfa30c99 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -549,7 +549,6 @@ class CSCDataset(BaseCompressedSparseDataset): def sparse_dataset(group) -> BaseCompressedSparseDataset: - # encoding_type = _read_attr(group, "encoding-type") encoding_type = _get_group_format(group) if encoding_type == "csr": From 3ee693cb20308c7645e2a87f483ca0ee87b689fc Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 21 Nov 2022 21:31:27 +0100 Subject: [PATCH 004/125] Start write_dispatched --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4053f744f..573beaaaa 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Temp files .DS_Store *~ +venv/ # Compiled files __pycache__/ From 7e0825a3d9d0833e3331419a44085a73cc72bf1a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 2 Feb 2023 17:13:39 +0100 Subject: [PATCH 005/125] (wip): remote reading via new AxisArrays and AnnData object --- anndata/experimental/read_remote/__init__.py | 1 + .../experimental/read_remote/read_remote.py | 650 ++++++++++++++++++ anndata/experimental/read_remote/utils.py | 18 + 3 files changed, 669 insertions(+) create mode 100644 anndata/experimental/read_remote/__init__.py create mode 100644 anndata/experimental/read_remote/read_remote.py create mode 100644 anndata/experimental/read_remote/utils.py diff --git a/anndata/experimental/read_remote/__init__.py 
b/anndata/experimental/read_remote/__init__.py new file mode 100644 index 000000000..3d20b9dc1 --- /dev/null +++ b/anndata/experimental/read_remote/__init__.py @@ -0,0 +1 @@ +from .utils import read_dispatched, write_dispatched \ No newline at end of file diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py new file mode 100644 index 000000000..7bcf78503 --- /dev/null +++ b/anndata/experimental/read_remote/read_remote.py @@ -0,0 +1,650 @@ +from collections import OrderedDict, abc as cabc +from pathlib import Path +from typing import Any, MutableMapping, Union, List +from anndata._core.aligned_mapping import Layers, PairwiseArrays +from anndata._core.anndata import StorageType, _check_2d_shape +from anndata._core.index import Index +from anndata._core.raw import Raw +from anndata.compat import _move_adj_mtx +from anndata.utils import convert_to_dict + +import zarr +import pandas as pd + +from ..._core import AnnData, AxisArrays +from .utils import read_dispatched + +class AxisArraysRemote(AxisArrays): + def __getitem__(self, key: str): + return self._data[key][()] + + def __getattr__(self, __name: str): + # If we a method has been accessed that is not here, try the pandas implementation + if hasattr(pd.DataFrame, __name): + return self.to_df().__getattribute(__name) + return object.__getattribute__(self, __name) + + + +class AnnDataRemote(AnnData): + + def __init__( + self, + X = None, + obs = None, + var = None, + uns = None, + obsm = None, + varm = None, + layers = None, + raw = None, + dtype = None, + shape = None, + filename = None, + filemode = None, + asview = False, + *, + obsp, + varp, + oidx, + vidx, + ): + + # view attributes + self._is_view = False + self._adata_ref = None + self._oidx = None + self._vidx = None + + # ---------------------------------------------------------------------- + # various ways of initializing the data + # ---------------------------------------------------------------------- + + # check data type of X + if X is not None: + for s_type in StorageType: + if isinstance(X, s_type.value): + break + else: + class_names = ", ".join(c.__name__ for c in StorageType.classes()) + raise ValueError( + f"`X` needs to be of one of {class_names}, not {type(X)}." + ) + if shape is not None: + raise ValueError("`shape` needs to be `None` if `X` is not `None`.") + _check_2d_shape(X) + # if type doesn’t match, a copy is made, otherwise, use a view + if dtype is not None: + X = X.astype(dtype) + # data matrix and shape + self._X = X + self._n_obs, self._n_vars = self._X.shape + else: + self._X = None + self._n_obs = len([] if obs is None else obs) + self._n_vars = len([] if var is None else var) + # check consistency with shape + if shape is not None: + if self._n_obs == 0: + self._n_obs = shape[0] + else: + if self._n_obs != shape[0]: + raise ValueError("`shape` is inconsistent with `obs`") + if self._n_vars == 0: + self._n_vars = shape[1] + else: + if self._n_vars != shape[1]: + raise ValueError("`shape` is inconsistent with `var`") + + # annotations + self._obs = AxisArrays(self, 0, vals=convert_to_dict(obs)) + self._var = AxisArrays(self, 0, vals=convert_to_dict(var)) + + # now we can verify if indices match! 
+ # for attr_name, x_name, idx in x_indices: + # attr = getattr(self, attr_name) + # if isinstance(attr.index, pd.RangeIndex): + # attr.index = idx + # elif not idx.equals(attr.index): + # raise ValueError(f"Index of {attr_name} must match {x_name} of X.") + + # unstructured annotations + self.uns = uns or OrderedDict() + + # TODO: Think about consequences of making obsm a group in hdf + self._obsm = AxisArrays(self, 0, vals=convert_to_dict(obsm)) + self._varm = AxisArrays(self, 1, vals=convert_to_dict(varm)) + + self._obsp = PairwiseArrays(self, 0, vals=convert_to_dict(obsp)) + self._varp = PairwiseArrays(self, 1, vals=convert_to_dict(varp)) + + # Backwards compat for connectivities matrices in uns["neighbors"] + _move_adj_mtx({"uns": self._uns, "obsp": self._obsp}) + + # self._check_dimensions() + # self._check_uniqueness() + + if self.filename: + assert not isinstance( + raw, Raw + ), "got raw from other adata but also filename?" + if {"raw", "raw.X"} & set(self.file): + raw = dict(X=None, **raw) + if not raw: + self._raw = None + elif isinstance(raw, cabc.Mapping): + self._raw = Raw(self, **raw) + else: # is a Raw from another AnnData + self._raw = Raw(self, raw._X, raw.var, raw.varm) + + # clean up old formats + self._clean_up_old_format(uns) + + # layers + self._layers = Layers(self, layers) + + + def __eq__(self, other): + """Equality testing""" + raise NotImplementedError( + "Equality comparisons are not supported for AnnData objects, " + "instead compare the desired attributes." + ) + + @property + def obs_names(self) -> pd.Index: + """Names of observations (alias for `.obs.index`).""" + return pd.Index(self.obs['_index']) + + @property + def var_names(self) -> pd.Index: + """Names of variables (alias for `.var.index`).""" + return pd.Index(self.var['_index']) + + def obs_keys(self) -> List[str]: + """List keys of observation annotation :attr:`obs`.""" + return self._obs.keys() + + def var_keys(self) -> List[str]: + """List keys of variable annotation :attr:`var`.""" + return self._var.keys() + + # TODO: this is not quite complete... + def __delitem__(self, index: Index): + obs, var = self._normalize_indices(index) + # TODO: does this really work? + if not self.isbacked: + del self._X[obs, var] + else: + X = self.file["X"] + del X[obs, var] + self._set_backed("X", X) + + # def obs_vector(self, k: str, *, layer: Optional[str] = None) -> np.ndarray: + # """\ + # Convenience function for returning a 1 dimensional ndarray of values + # from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. + + # Made for convenience, not performance. + # Intentionally permissive about arguments, for easy iterative use. + + # Params + # ------ + # k + # Key to use. Should be in :attr:`var_names` or :attr:`obs`\\ `.columns`. + # layer + # What layer values should be returned from. If `None`, :attr:`X` is used. + + # Returns + # ------- + # A one dimensional nd array, with values for each obs in the same order + # as :attr:`obs_names`. + # """ + # if layer == "X": + # if "X" in self.layers: + # pass + # else: + # warnings.warn( + # "In a future version of AnnData, access to `.X` by passing" + # " `layer='X'` will be removed. Instead pass `layer=None`.", + # FutureWarning, + # ) + # layer = None + # return get_vector(self, k, "obs", "var", layer=layer) + + # def var_vector(self, k, *, layer: Optional[str] = None) -> np.ndarray: + # """\ + # Convenience function for returning a 1 dimensional ndarray of values + # from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. 
+ + # Made for convenience, not performance. Intentionally permissive about + # arguments, for easy iterative use. + + # Params + # ------ + # k + # Key to use. Should be in :attr:`obs_names` or :attr:`var`\\ `.columns`. + # layer + # What layer values should be returned from. If `None`, :attr:`X` is used. + + # Returns + # ------- + # A one dimensional nd array, with values for each var in the same order + # as :attr:`var_names`. + # """ + # if layer == "X": + # if "X" in self.layers: + # pass + # else: + # warnings.warn( + # "In a future version of AnnData, access to `.X` by passing " + # "`layer='X'` will be removed. Instead pass `layer=None`.", + # FutureWarning, + # ) + # layer = None + # # return get_vector(self, k, "var", "obs", layer=layer) + + # def to_memory(self, copy=True) -> "AnnData": + # """Return a new AnnData object with all backed arrays loaded into memory. + + # Params + # ------ + # copy: + # Whether the arrays that are already in-memory should be copied. + + # Example + # ------- + + # .. code:: python + + # import anndata + # backed = anndata.read_h5ad("file.h5ad", backed="r") + # mem = backed[backed.obs["cluster"] == "a", :].to_memory() + # """ + # new = {} + # for attr_name in [ + # "X", + # "obs", + # "var", + # "obsm", + # "varm", + # "obsp", + # "varp", + # "layers", + # "uns", + # ]: + # attr = getattr(self, attr_name, None) + # if attr is not None: + # new[attr_name] = to_memory(attr, copy) + + # if self.raw is not None: + # new["raw"] = { + # "X": to_memory(self.raw.X, copy), + # "var": to_memory(self.raw.var, copy), + # "varm": to_memory(self.raw.varm, copy), + # } + + # if self.isbacked: + # self.file.close() + + # # return AnnData(**new) + + # def concatenate( + # self, + # *adatas: "AnnData", + # join: str = "inner", + # batch_key: str = "batch", + # batch_categories: Sequence[Any] = None, + # uns_merge: Optional[str] = None, + # index_unique: Optional[str] = "-", + # fill_value=None, + # ) -> "AnnData": + # """\ + # Concatenate along the observations axis. + + # The :attr:`uns`, :attr:`varm` and :attr:`obsm` attributes are ignored. + + # Currently, this works only in `'memory'` mode. + + # .. note:: + + # For more flexible and efficient concatenation, see: :func:`~anndata.concat`. + + # Parameters + # ---------- + # adatas + # AnnData matrices to concatenate with. Each matrix is referred to as + # a “batch”. + # join + # Use intersection (`'inner'`) or union (`'outer'`) of variables. + # batch_key + # Add the batch annotation to :attr:`obs` using this key. + # batch_categories + # Use these as categories for the batch annotation. By default, use increasing numbers. + # uns_merge + # Strategy to use for merging entries of uns. These strategies are applied recusivley. + # Currently implemented strategies include: + + # * `None`: The default. The concatenated object will just have an empty dict for `uns`. + # * `"same"`: Only entries which have the same value in all AnnData objects are kept. + # * `"unique"`: Only entries which have one unique value in all AnnData objects are kept. + # * `"first"`: The first non-missing value is used. + # * `"only"`: A value is included if only one of the AnnData objects has a value at this + # path. + # index_unique + # Make the index unique by joining the existing index names with the + # batch category, using `index_unique='-'`, for instance. Provide + # `None` to keep existing indices. + # fill_value + # Scalar value to fill newly missing values in arrays with. 
Note: only applies to arrays + # and sparse matrices (not dataframes) and will only be used if `join="outer"`. + + # .. note:: + # If not provided, the default value is `0` for sparse matrices and `np.nan` + # for numpy arrays. See the examples below for more information. + + # Returns + # ------- + # :class:`~anndata.AnnData` + # The concatenated :class:`~anndata.AnnData`, where `adata.obs[batch_key]` + # stores a categorical variable labeling the batch. + + # Notes + # ----- + + # .. warning:: + + # If you use `join='outer'` this fills 0s for sparse data when + # variables are absent in a batch. Use this with care. Dense data is + # filled with `NaN`. See the examples. + + # Examples + # -------- + # Joining on intersection of variables. + + # >>> adata1 = AnnData( + # ... np.array([[1, 2, 3], [4, 5, 6]]), + # ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), + # ... dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]), + # ... ) + # >>> adata2 = AnnData( + # ... np.array([[1, 2, 3], [4, 5, 6]]), + # ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), + # ... dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]), + # ... ) + # >>> adata3 = AnnData( + # ... np.array([[1, 2, 3], [4, 5, 6]]), + # ... dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']), + # ... dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]), + # ... ) + # >>> adata = adata1.concatenate(adata2, adata3) + # >>> adata + # AnnData object with n_obs × n_vars = 6 × 2 + # obs: 'anno1', 'anno2', 'batch' + # var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' + # >>> adata.X + # array([[2, 3], + # [5, 6], + # [3, 2], + # [6, 5], + # [3, 2], + # [6, 5]]) + # >>> adata.obs + # anno1 anno2 batch + # s1-0 c1 NaN 0 + # s2-0 c2 NaN 0 + # s3-1 c3 NaN 1 + # s4-1 c4 NaN 1 + # s1-2 NaN d3 2 + # s2-2 NaN d4 2 + # >>> adata.var.T + # b c + # annoA-0 1 2 + # annoA-1 2 1 + # annoA-2 3 2 + # annoB-2 2 1 + + # Joining on the union of variables. 
+ + # >>> outer = adata1.concatenate(adata2, adata3, join='outer') + # >>> outer + # AnnData object with n_obs × n_vars = 6 × 4 + # obs: 'anno1', 'anno2', 'batch' + # var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' + # >>> outer.var.T + # a b c d + # annoA-0 0.0 1.0 2.0 NaN + # annoA-1 NaN 2.0 1.0 0.0 + # annoA-2 NaN 3.0 2.0 0.0 + # annoB-2 NaN 2.0 1.0 0.0 + # >>> outer.var_names + # Index(['a', 'b', 'c', 'd'], dtype='object') + # >>> outer.X + # array([[ 1., 2., 3., nan], + # [ 4., 5., 6., nan], + # [nan, 3., 2., 1.], + # [nan, 6., 5., 4.], + # [nan, 3., 2., 1.], + # [nan, 6., 5., 4.]]) + # >>> outer.X.sum(axis=0) + # array([nan, 25., 23., nan]) + # >>> import pandas as pd + # >>> Xdf = pd.DataFrame(outer.X, columns=outer.var_names) + # >>> Xdf + # a b c d + # 0 1.0 2.0 3.0 NaN + # 1 4.0 5.0 6.0 NaN + # 2 NaN 3.0 2.0 1.0 + # 3 NaN 6.0 5.0 4.0 + # 4 NaN 3.0 2.0 1.0 + # 5 NaN 6.0 5.0 4.0 + # >>> Xdf.sum() + # a 5.0 + # b 25.0 + # c 23.0 + # d 10.0 + # dtype: float64 + + # One way to deal with missing values is to use masked arrays: + + # >>> from numpy import ma + # >>> outer.X = ma.masked_invalid(outer.X) + # >>> outer.X + # masked_array( + # data=[[1.0, 2.0, 3.0, --], + # [4.0, 5.0, 6.0, --], + # [--, 3.0, 2.0, 1.0], + # [--, 6.0, 5.0, 4.0], + # [--, 3.0, 2.0, 1.0], + # [--, 6.0, 5.0, 4.0]], + # mask=[[False, False, False, True], + # [False, False, False, True], + # [ True, False, False, False], + # [ True, False, False, False], + # [ True, False, False, False], + # [ True, False, False, False]], + # fill_value=1e+20) + # >>> outer.X.sum(axis=0).data + # array([ 5., 25., 23., 10.]) + + # The masked array is not saved but has to be reinstantiated after saving. + + # >>> outer.write('./test.h5ad') + # >>> from anndata import read_h5ad + # >>> outer = read_h5ad('./test.h5ad') + # >>> outer.X + # array([[ 1., 2., 3., nan], + # [ 4., 5., 6., nan], + # [nan, 3., 2., 1.], + # [nan, 6., 5., 4.], + # [nan, 3., 2., 1.], + # [nan, 6., 5., 4.]]) + + # For sparse data, everything behaves similarly, + # except that for `join='outer'`, zeros are added. + + # >>> from scipy.sparse import csr_matrix + # >>> adata1 = AnnData( + # ... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32), + # ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), + # ... dict(var_names=['a', 'b', 'c']), + # ... ) + # >>> adata2 = AnnData( + # ... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32), + # ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), + # ... dict(var_names=['d', 'c', 'b']), + # ... ) + # >>> adata3 = AnnData( + # ... csr_matrix([[1, 2, 0], [0, 5, 6]], dtype=np.float32), + # ... dict(obs_names=['s5', 's6'], anno2=['d3', 'd4']), + # ... dict(var_names=['d', 'c', 'b']), + # ... ) + # >>> adata = adata1.concatenate(adata2, adata3, join='outer') + # >>> adata.var_names + # Index(['a', 'b', 'c', 'd'], dtype='object') + # >>> adata.X.toarray() + # array([[0., 2., 3., 0.], + # [0., 5., 6., 0.], + # [0., 3., 2., 0.], + # [0., 6., 5., 0.], + # [0., 0., 2., 1.], + # [0., 6., 5., 0.]], dtype=float32) + # """ + # from .merge import concat, merge_outer, merge_dataframes, merge_same + + # warnings.warn( + # "The AnnData.concatenate method is deprecated in favour of the " + # "anndata.concat function. 
Please use anndata.concat instead.\n\n" + # "See the tutorial for concat at: " + # "https://anndata.readthedocs.io/en/latest/concatenation.html", + # FutureWarning, + # ) + + # if self.isbacked: + # raise ValueError("Currently, concatenate only works in memory mode.") + + # if len(adatas) == 0: + # return self.copy() + # elif len(adatas) == 1 and not isinstance(adatas[0], AnnData): + # adatas = adatas[0] # backwards compatibility + # all_adatas = (self,) + tuple(adatas) + + # out = concat( + # all_adatas, + # axis=0, + # join=join, + # label=batch_key, + # keys=batch_categories, + # uns_merge=uns_merge, + # fill_value=fill_value, + # index_unique=index_unique, + # pairwise=False, + # ) + + # # Backwards compat (some of this could be more efficient) + # # obs used to always be an outer join + # out.obs = concat( + # [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], + # axis=0, + # join="outer", + # label=batch_key, + # keys=batch_categories, + # index_unique=index_unique, + # ).obs + # # Removing varm + # del out.varm + # # Implementing old-style merging of var + # if batch_categories is None: + # batch_categories = np.arange(len(all_adatas)).astype(str) + # pat = rf"-({'|'.join(batch_categories)})$" + # out.var = merge_dataframes( + # [a.var for a in all_adatas], + # out.var_names, + # partial(merge_outer, batch_keys=batch_categories, merge=merge_same), + # ) + # out.var = out.var.iloc[ + # :, + # ( + # out.var.columns.str.extract(pat, expand=False) + # .fillna("") + # .argsort(kind="stable") + # ), + # ] + + # return out + + # def var_names_make_unique(self, join: str = "-"): + # # Important to go through the setter so obsm dataframes are updated too + # self.var_names = utils.make_index_unique(self.var.index, join) + + # var_names_make_unique.__doc__ = utils.make_index_unique.__doc__ + + # def obs_names_make_unique(self, join: str = "-"): + # # Important to go through the setter so obsm dataframes are updated too + # self.obs_names = utils.make_index_unique(self.obs.index, join) + + # obs_names_make_unique.__doc__ = utils.make_index_unique.__doc__ + + def __contains__(self, key: Any): + raise AttributeError( + "AnnData has no attribute __contains__, don;t check `in adata`." + ) + + # def _check_dimensions(self, key=None): + # if key is None: + # key = {"obs", "var", "obsm", "varm"} + # else: + # key = {key} + # if "obs" in key and len(self._obs) != self._n_obs: + # raise ValueError( + # "Observations annot. `obs` must have number of rows of `X`" + # f" ({self._n_obs}), but has {self._obs.shape[0]} rows." + # ) + # if "var" in key and len(self._var) != self._n_vars: + # raise ValueError( + # "Variables annot. `var` must have number of columns of `X`" + # f" ({self._n_vars}), but has {self._var.shape[0]} rows." + # ) + # if "obsm" in key: + # obsm = self._obsm + # if ( + # not all([o.shape[0] == self._n_obs for o in obsm.values()]) + # and len(obsm.dim_names) != self._n_obs + # ): + # raise ValueError( + # "Observations annot. `obsm` must have number of rows of `X`" + # f" ({self._n_obs}), but has {len(obsm)} rows." + # ) + # if "varm" in key: + # varm = self._varm + # if ( + # not all([v.shape[0] == self._n_vars for v in varm.values()]) + # and len(varm.dim_names) != self._n_vars + # ): + # raise ValueError( + # "Variables annot. `varm` must have number of columns of `X`" + # f" ({self._n_vars}), but has {len(varm)} rows." 
+ # ) + + +def read_remote(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: + if isinstance(store, Path): + store = str(store) + + f = zarr.open(store, mode="r") + + def callback(func, elem_name: str, elem, iospec): + if iospec.encoding_type == "anndata" or elem_name.endswith('/'): + return AnnData( + **{k: read_dispatched(v, callback) for k, v in elem.items()} + ) + elif elem_name.startswith("raw."): + return None + elif elem_name in {"obs", "var"}: + # override to only return AxisArray that will be accessed specially via our special AnnData object + return {k: func(v) for k, v in elem.items()} + return func(elem) + + adata = read_dispatched(f, callback=callback) + + return adata \ No newline at end of file diff --git a/anndata/experimental/read_remote/utils.py b/anndata/experimental/read_remote/utils.py new file mode 100644 index 000000000..79d801ef6 --- /dev/null +++ b/anndata/experimental/read_remote/utils.py @@ -0,0 +1,18 @@ +from types import MappingProxyType + +from anndata._io.specs import read_elem, write_elem + +def read_dispatched(store, callback): + from anndata._io.specs import Reader, _REGISTRY + + reader = Reader(_REGISTRY, callback=callback) + + return reader.read_elem(store) + + +def write_dispatched(store, key, elem, callback, dataset_kwargs=MappingProxyType({})): + from anndata._io.specs import Writer, _REGISTRY + + writer = Writer(_REGISTRY, callback=callback) + + writer.write_elem(store, key, elem, dataset_kwargs=dataset_kwargs) \ No newline at end of file From 0b8723037cacaaf856a58894d7ad9583af41607d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 6 Feb 2023 12:11:51 +0100 Subject: [PATCH 006/125] (chore): rename --- anndata/experimental/read_remote/read_remote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 7bcf78503..5870ece62 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -14,14 +14,14 @@ from ..._core import AnnData, AxisArrays from .utils import read_dispatched -class AxisArraysRemote(AxisArrays): +class SingleDimensionAxisArraysRemote(AxisArrays): def __getitem__(self, key: str): return self._data[key][()] def __getattr__(self, __name: str): # If we a method has been accessed that is not here, try the pandas implementation if hasattr(pd.DataFrame, __name): - return self.to_df().__getattribute(__name) + return self.to_df().__getattribute__(__name) return object.__getattribute__(self, __name) From f2de515bde5b27e827d9d1405d5b914a9d081efe Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 15 Feb 2023 11:25:27 +0100 Subject: [PATCH 007/125] (chore): `venv` to `.gitignore` --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 573beaaaa..71cea7fa8 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,5 @@ test.h5ad /.idea/ /.vscode/ +/venv/ + From 7bc0f760823699e7e6694e40ccc1e0e3550a39c7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 29 Nov 2022 17:01:45 +0100 Subject: [PATCH 008/125] (fix): `concatenation` test --- docs/concatenation.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/concatenation.rst b/docs/concatenation.rst index 0b4828057..bcc4e4c10 100644 --- a/docs/concatenation.rst +++ b/docs/concatenation.rst @@ -26,7 +26,7 @@ Let's start off with an example: AnnData object with n_obs × n_vars = 700 × 765 obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 
'S_score', 'G2M_score', 'phase', 'louvain'
         var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
-        uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
+        uns: 'bulk_labels_categories', 'bulk_labels_colors', 'louvain', 'louvain_categories', 'louvain_colors', 'neighbors', 'pca', 'phase_categories', 'rank_genes_groups'
         obsm: 'X_pca', 'X_umap'
         varm: 'PCs'
         obsp: 'distances', 'connectivities'

From 7a12515d12684fb0eca69090fc5128cee8b6d40c Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Wed, 25 Jan 2023 16:02:36 +0100
Subject: [PATCH 009/125] Revert changes to some backwards compat tests

---
 docs/concatenation.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/concatenation.rst b/docs/concatenation.rst
index bcc4e4c10..0b4828057 100644
--- a/docs/concatenation.rst
+++ b/docs/concatenation.rst
@@ -26,7 +26,7 @@ Let's start off with an example:
     AnnData object with n_obs × n_vars = 700 × 765
         obs: 'bulk_labels', 'n_genes', 'percent_mito', 'n_counts', 'S_score', 'G2M_score', 'phase', 'louvain'
         var: 'n_counts', 'means', 'dispersions', 'dispersions_norm', 'highly_variable'
-        uns: 'bulk_labels_categories', 'bulk_labels_colors', 'louvain', 'louvain_categories', 'louvain_colors', 'neighbors', 'pca', 'phase_categories', 'rank_genes_groups'
+        uns: 'bulk_labels_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca', 'rank_genes_groups'
         obsm: 'X_pca', 'X_umap'
         varm: 'PCs'
         obsp: 'distances', 'connectivities'

From 49e8069e16f24c985ca953113e3975381b57881b Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Wed, 25 Jan 2023 17:56:56 +0100
Subject: [PATCH 010/125] Start fixing error reporting

To fix error reporting, I've put the attempt to catch an error during IO
on top of the `read_elem` method.

Since the decorator is sometimes used on functions, I modified it to be
able to handle the signature of both a method and a function.

What's weird is that sometimes the decorator is being passed the
arguments of a method by something that is named like a method but is
actually a function. So that still needs to be fixed.
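
A likely explanation for that last point: `inspect.ismethod` returns
True only for bound methods, so at decoration time, inside the class
body, `Reader.read_elem` is still a plain function and the
`ismethod(func)` branch is never taken. A minimal sketch of the pitfall,
using a stand-in class rather than the actual anndata `Reader`:

    from inspect import ismethod

    class Reader:
        def read_elem(self, elem):
            ...

    print(ismethod(Reader.read_elem))    # False: still a plain function here
    print(ismethod(Reader().read_elem))  # True: only once it is bound

That is why the decorated method still lands in the function branch, with
a `Reader` instance showing up as its first positional argument.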
--- anndata/_io/utils.py | 54 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/anndata/_io/utils.py b/anndata/_io/utils.py index 388f9f050..b661f99cd 100644 --- a/anndata/_io/utils.py +++ b/anndata/_io/utils.py @@ -179,6 +179,7 @@ def report_read_key_on_error(func): >>> z["X"] = [1, 2, 3] >>> read_arr(z["X"]) # doctest: +SKIP """ + from inspect import ismethod def re_raise_error(e, elem): if isinstance(e, AnnDataReadError): @@ -203,7 +204,34 @@ def func_wrapper(*args, **kwargs): except Exception as e: re_raise_error(e, elem) - return func_wrapper + if ismethod(func): + + @wraps(func) + def method_wrapper(self, elem, *args, **kwargs): + try: + return func(self, elem, *args, **kwargs) + except Exception as e: + re_raise_error(e, elem) + + return method_wrapper + + else: + + # TODO: sometimes, something that looks an awful lot like a method is reaching here + # It's being passed an instance of a Reader, has a signature Reader.read_elem, but is a function + @wraps(func) + def func_wrapper(*args, **kwargs): + from anndata._io.specs import Reader + + for elem in args: + if not isinstance(elem, Reader): + break + try: + return func(*args, **kwargs) + except Exception as e: + re_raise_error(e, elem) + + return func_wrapper def report_write_key_on_error(func): @@ -220,6 +248,30 @@ def report_write_key_on_error(func): >>> X = [1, 2, 3] >>> write_arr(z, "X", X) # doctest: +SKIP """ + from inspect import ismethod + + def re_raise_error(e, elem, key): + if "Above error raised while writing key" in format(e): + raise + else: + parent = _get_parent(elem) + raise type(e)( + f"{e}\n\n" + f"Above error raised while writing key {key!r} of {type(elem)} " + f"to {parent}" + ) from e + + # Need to specialize for method signature + if ismethod(func): + + @wraps(func) + def func_wrapper(self, elem, key, val, *args, **kwargs): + try: + return func(self, elem, key, val, *args, **kwargs) + except Exception as e: + re_raise_error(e, elem, key) + + else: def re_raise_error(e, elem, key): if "Above error raised while writing key" in format(e): From c3a5e07a961bcb0b69d6a465e7560fa23d1f4aff Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 8 Feb 2023 17:16:22 +0100 Subject: [PATCH 011/125] Fixes after merge --- anndata/_io/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/_io/utils.py b/anndata/_io/utils.py index b661f99cd..4a310ae32 100644 --- a/anndata/_io/utils.py +++ b/anndata/_io/utils.py @@ -216,7 +216,6 @@ def method_wrapper(self, elem, *args, **kwargs): return method_wrapper else: - # TODO: sometimes, something that looks an awful lot like a method is reaching here # It's being passed an instance of a Reader, has a signature Reader.read_elem, but is a function @wraps(func) From f22660d54591ab366a95ab252536974871d9e16f Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 14 Feb 2023 14:03:17 +0100 Subject: [PATCH 012/125] Clean up error reporting + remove commented out code --- anndata/_io/utils.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/anndata/_io/utils.py b/anndata/_io/utils.py index 4a310ae32..d4c2ba535 100644 --- a/anndata/_io/utils.py +++ b/anndata/_io/utils.py @@ -179,7 +179,6 @@ def report_read_key_on_error(func): >>> z["X"] = [1, 2, 3] >>> read_arr(z["X"]) # doctest: +SKIP """ - from inspect import ismethod def re_raise_error(e, elem): if isinstance(e, AnnDataReadError): @@ -222,15 +221,16 @@ def method_wrapper(self, elem, *args, **kwargs): def func_wrapper(*args, **kwargs): 
from anndata._io.specs import Reader - for elem in args: - if not isinstance(elem, Reader): - break - try: - return func(*args, **kwargs) - except Exception as e: - re_raise_error(e, elem) + # Figure out signature (method vs function) by going through args + for elem in args: + if not isinstance(elem, Reader): + break + try: + return func(*args, **kwargs) + except Exception as e: + re_raise_error(e, elem) - return func_wrapper + return func_wrapper def report_write_key_on_error(func): @@ -247,7 +247,6 @@ def report_write_key_on_error(func): >>> X = [1, 2, 3] >>> write_arr(z, "X", X) # doctest: +SKIP """ - from inspect import ismethod def re_raise_error(e, elem, key): if "Above error raised while writing key" in format(e): From 93b87787cb771055be6e2418d0dd62882328271e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 17 Feb 2023 10:38:41 +0100 Subject: [PATCH 013/125] (wip): semi-working demo? --- anndata/_core/__init__.py | 2 + anndata/_core/sparse_dataset.py | 6 +- anndata/_io/specs/methods.py | 2 +- .../experimental/read_remote/read_remote.py | 59 ++++++++++++------- 4 files changed, 43 insertions(+), 26 deletions(-) diff --git a/anndata/_core/__init__.py b/anndata/_core/__init__.py index e69de29bb..31018446e 100644 --- a/anndata/_core/__init__.py +++ b/anndata/_core/__init__.py @@ -0,0 +1,2 @@ +from .anndata import AnnData +from .aligned_mapping import AxisArrays \ No newline at end of file diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 8654e8dd7..e9c01d8f8 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -376,9 +376,9 @@ def to_backed(self) -> BackedSparseMatrix: def to_memory(self) -> ss.spmatrix: format_class = get_memory_class(self.format_str) mtx = format_class(self.shape, dtype=self.dtype) - mtx.data = self.group["data"][...] - mtx.indices = self.group["indices"][...] - mtx.indptr = self.group["indptr"][...] 
+ mtx.data = self.group["data"] + mtx.indices = self.group["indices"] + mtx.indptr = self.group["indptr"] return mtx diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 9584f82d1..caa34e0bc 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -320,7 +320,7 @@ def write_basic_dask(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) def read_array(elem, _reader): - return elem[()] + return elem @_REGISTRY.register_read_partial(H5Array, IOSpec("array", "0.2.0")) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 5870ece62..000cf2562 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -1,8 +1,9 @@ from collections import OrderedDict, abc as cabc from pathlib import Path -from typing import Any, MutableMapping, Union, List +from typing import Any, MutableMapping, Union, List, Sequence from anndata._core.aligned_mapping import Layers, PairwiseArrays -from anndata._core.anndata import StorageType, _check_2d_shape +from anndata._core.anndata import StorageType, _check_2d_shape, _gen_dataframe +from anndata._core.file_backing import AnnDataFileManager from anndata._core.index import Index from anndata._core.raw import Raw from anndata.compat import _move_adj_mtx @@ -44,10 +45,10 @@ def __init__( filemode = None, asview = False, *, - obsp, - varp, - oidx, - vidx, + obsp = None, + varp = None, + oidx = None, + vidx = None, ): # view attributes @@ -61,6 +62,11 @@ def __init__( # ---------------------------------------------------------------------- # check data type of X + if filename is not None: + self.file = AnnDataFileManager(self, filename, filemode) + else: + self.file = AnnDataFileManager(self, None) + if X is not None: for s_type in StorageType: if isinstance(X, s_type.value): @@ -96,9 +102,11 @@ def __init__( if self._n_vars != shape[1]: raise ValueError("`shape` is inconsistent with `var`") - # annotations + # annotations - need names already for AxisArrays to work. + self.obs_names = pd.Index(obs['index'][()]) + self.var_names = pd.Index(var['index'][()]) self._obs = AxisArrays(self, 0, vals=convert_to_dict(obs)) - self._var = AxisArrays(self, 0, vals=convert_to_dict(var)) + self._var = AxisArrays(self, 1, vals=convert_to_dict(var)) # now we can verify if indices match! # for attr_name, x_name, idx in x_indices: @@ -151,16 +159,6 @@ def __eq__(self, other): "instead compare the desired attributes." 
) - @property - def obs_names(self) -> pd.Index: - """Names of observations (alias for `.obs.index`).""" - return pd.Index(self.obs['_index']) - - @property - def var_names(self) -> pd.Index: - """Names of variables (alias for `.var.index`).""" - return pd.Index(self.var['_index']) - def obs_keys(self) -> List[str]: """List keys of observation annotation :attr:`obs`.""" return self._obs.keys() @@ -180,6 +178,22 @@ def __delitem__(self, index: Index): del X[obs, var] self._set_backed("X", X) + @property + def obs_names(self) -> pd.Index: + return self._obs_names + + @property + def var_names(self) -> pd.Index: + return self._var_names + + @obs_names.setter + def obs_names(self, names: Sequence[str]): + self._obs_names = names + + @var_names.setter + def var_names(self, names: Sequence[str]): + self._var_names = names + # def obs_vector(self, k: str, *, layer: Optional[str] = None) -> np.ndarray: # """\ # Convenience function for returning a 1 dimensional ndarray of values @@ -631,18 +645,19 @@ def read_remote(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: if isinstance(store, Path): store = str(store) - f = zarr.open(store, mode="r") + f = zarr.open_consolidated(store, mode="r") def callback(func, elem_name: str, elem, iospec): + print(elem_name) if iospec.encoding_type == "anndata" or elem_name.endswith('/'): - return AnnData( + return AnnDataRemote( **{k: read_dispatched(v, callback) for k, v in elem.items()} ) elif elem_name.startswith("raw."): return None - elif elem_name in {"obs", "var"}: + elif elem_name in {"/obs", "/var"}: # override to only return AxisArray that will be accessed specially via our special AnnData object - return {k: func(v) for k, v in elem.items()} + return {k: read_dispatched(v, callback) for k, v in elem.items()} return func(elem) adata = read_dispatched(f, callback=callback) From 6d32d8e9c3e9b1671ab15052a0c0a5d74a704374 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Feb 2023 15:51:38 +0100 Subject: [PATCH 014/125] (chore): compat for old index key --- anndata/experimental/read_remote/read_remote.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 000cf2562..ec9dd6d18 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -103,8 +103,8 @@ def __init__( raise ValueError("`shape` is inconsistent with `var`") # annotations - need names already for AxisArrays to work. 
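        # The hunk below also has to cope with older stores that wrote the
        # index column as "index" rather than "_index"; a hedged sketch of the
        # fallback (names taken from the surrounding diff):
        # >>> key = "index" if "index" in obs else "_index"
        # >>> obs_names = pd.Index(obs[key][()])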
- self.obs_names = pd.Index(obs['index'][()]) - self.var_names = pd.Index(var['index'][()]) + self.obs_names = pd.Index((obs['index'] if 'index' in obs else obs['_index'])[()]) + self.var_names = pd.Index((var['index'] if 'index' in var else var['_index'])[()]) self._obs = AxisArrays(self, 0, vals=convert_to_dict(obs)) self._var = AxisArrays(self, 1, vals=convert_to_dict(var)) @@ -648,12 +648,11 @@ def read_remote(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: f = zarr.open_consolidated(store, mode="r") def callback(func, elem_name: str, elem, iospec): - print(elem_name) if iospec.encoding_type == "anndata" or elem_name.endswith('/'): return AnnDataRemote( **{k: read_dispatched(v, callback) for k, v in elem.items()} ) - elif elem_name.startswith("raw."): + elif elem_name.startswith("/raw"): return None elif elem_name in {"/obs", "/var"}: # override to only return AxisArray that will be accessed specially via our special AnnData object From 3cf70364eff60944e4187087dab16d50ca633c0e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Feb 2023 15:51:50 +0100 Subject: [PATCH 015/125] (chore): only use `backed` --- anndata/_io/specs/methods.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index caa34e0bc..7cf38d65b 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -90,7 +90,7 @@ def read_basic(elem, _reader): if isinstance(elem, Mapping): # Backwards compat sparse arrays if "h5sparse_format" in elem.attrs: - return SparseDataset(elem).to_memory() + return SparseDataset(elem).to_backed() return {k: _reader.read_elem(v) for k, v in elem.items()} elif isinstance(elem, h5py.Dataset): return h5ad.read_dataset(elem) # TODO: Handle legacy @@ -110,7 +110,7 @@ def read_basic_zarr(elem, _reader): if isinstance(elem, Mapping): # Backwards compat sparse arrays if "h5sparse_format" in elem.attrs: - return SparseDataset(elem).to_memory() + return SparseDataset(elem).to_backed() return {k: _reader.read_elem(v) for k, v in elem.items()} elif isinstance(elem, ZarrArray): return zarr.read_dataset(elem) # TODO: Handle legacy @@ -497,7 +497,7 @@ def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({} @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse(elem, _reader): - return SparseDataset(elem).to_memory() + return SparseDataset(elem).to_backed() @_REGISTRY.register_read_partial(H5Group, IOSpec("csc_matrix", "0.1.0")) @@ -688,7 +688,7 @@ def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem, _reader): +def read_categorical(elem, _reader): # TODO: Going to need a lazy version of this return pd.Categorical.from_codes( codes=_reader.read_elem(elem["codes"]), categories=_reader.read_elem(elem["categories"]), From d99dd56ff3a1811d3411a72e24b506361149b4a5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Feb 2023 15:59:55 +0100 Subject: [PATCH 016/125] (feat): add custom `to_df` method --- .../experimental/read_remote/read_remote.py | 75 +++++++++++-------- 1 file changed, 42 insertions(+), 33 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index ec9dd6d18..536022036 100644 --- 
a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -15,42 +15,48 @@
 from ..._core import AnnData, AxisArrays
 from .utils import read_dispatched
 
-class SingleDimensionAxisArraysRemote(AxisArrays):
+
+class AxisArraysRemote(AxisArrays):
     def __getitem__(self, key: str):
-        return self._data[key][()]
-    
+        return self._data[key]
+
     def __getattr__(self, __name: str):
         # If a method has been accessed that is not defined here, fall back to the pandas implementation
         if hasattr(pd.DataFrame, __name):
            return self.to_df().__getattribute__(__name)
        return object.__getattribute__(self, __name)
 
+    def to_df(self) -> pd.DataFrame:
+        """Convert to pandas dataframe."""
+        df = pd.DataFrame(index=self.dim_names)
+        for key in self.keys():
+            value = self[key][()]
+            df[key] = value
+        return df
 
 class AnnDataRemote(AnnData):
-    def __init__(
         self,
-        X = None,
-        obs = None,
-        var = None,
-        uns = None,
-        obsm = None,
-        varm = None,
-        layers = None,
-        raw = None,
-        dtype = None,
-        shape = None,
-        filename = None,
-        filemode = None,
-        asview = False,
+        X=None,
+        obs=None,
+        var=None,
+        uns=None,
+        obsm=None,
+        varm=None,
+        layers=None,
+        raw=None,
+        dtype=None,
+        shape=None,
+        filename=None,
+        filemode=None,
+        asview=False,
         *,
-        obsp = None,
-        varp = None,
-        oidx = None,
-        vidx = None,
+        obsp=None,
+        varp=None,
+        oidx=None,
+        vidx=None,
     ):
-        # view attributes
         self._is_view = False
         self._adata_ref = None
@@ -103,10 +109,14 @@ def __init__(
             raise ValueError("`shape` is inconsistent with `var`")
 
         # annotations - need names already for AxisArrays to work.
-        self.obs_names = pd.Index((obs['index'] if 'index' in obs else obs['_index'])[()])
-        self.var_names = pd.Index((var['index'] if 'index' in var else var['_index'])[()])
-        self._obs = AxisArrays(self, 0, vals=convert_to_dict(obs))
-        self._var = AxisArrays(self, 1, vals=convert_to_dict(var))
+        self.obs_names = pd.Index(
+            (obs["index"] if "index" in obs else obs["_index"])[()]
+        )
+        self.var_names = pd.Index(
+            (var["index"] if "index" in var else var["_index"])[()]
+        )
+        self._obs = AxisArraysRemote(self, 0, vals=convert_to_dict(obs))
+        self._var = AxisArraysRemote(self, 1, vals=convert_to_dict(var))
 
         # now we can verify if indices match!
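        # A hedged usage sketch of the `to_df` fallback defined above (an
        # AxisArraysRemote with zarr-backed columns is assumed):
        # >>> adata.obs.to_df().head()  # materializes each column via self[key][()]
        # >>> adata.obs.mean()          # unknown attributes defer to the DataFrame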
# for attr_name, x_name, idx in x_indices: @@ -120,8 +130,8 @@ def __init__( self.uns = uns or OrderedDict() # TODO: Think about consequences of making obsm a group in hdf - self._obsm = AxisArrays(self, 0, vals=convert_to_dict(obsm)) - self._varm = AxisArrays(self, 1, vals=convert_to_dict(varm)) + self._obsm = AxisArraysRemote(self, 0, vals=convert_to_dict(obsm)) + self._varm = AxisArraysRemote(self, 1, vals=convert_to_dict(varm)) self._obsp = PairwiseArrays(self, 0, vals=convert_to_dict(obsp)) self._varp = PairwiseArrays(self, 1, vals=convert_to_dict(varp)) @@ -151,7 +161,6 @@ def __init__( # layers self._layers = Layers(self, layers) - def __eq__(self, other): """Equality testing""" raise NotImplementedError( @@ -181,11 +190,11 @@ def __delitem__(self, index: Index): @property def obs_names(self) -> pd.Index: return self._obs_names - + @property def var_names(self) -> pd.Index: return self._var_names - + @obs_names.setter def obs_names(self, names: Sequence[str]): self._obs_names = names @@ -648,7 +657,7 @@ def read_remote(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: f = zarr.open_consolidated(store, mode="r") def callback(func, elem_name: str, elem, iospec): - if iospec.encoding_type == "anndata" or elem_name.endswith('/'): + if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnDataRemote( **{k: read_dispatched(v, callback) for k, v in elem.items()} ) @@ -661,4 +670,4 @@ def callback(func, elem_name: str, elem, iospec): adata = read_dispatched(f, callback=callback) - return adata \ No newline at end of file + return adata From 83aa3ab58c1d37b7449f2382007469bcb3bb1d6d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Feb 2023 16:40:02 +0100 Subject: [PATCH 017/125] (feat): get dataframe access working properly --- anndata/_io/specs/methods.py | 6 +----- anndata/experimental/read_remote/read_remote.py | 16 +++++++++++----- anndata/utils.py | 3 +++ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 7cf38d65b..46e3c20ff 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -689,11 +689,7 @@ def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) def read_categorical(elem, _reader): # TODO: Going to need a lazy version of this - return pd.Categorical.from_codes( - codes=_reader.read_elem(elem["codes"]), - categories=_reader.read_elem(elem["categories"]), - ordered=bool(_read_attr(elem.attrs, "ordered")), - ) + return elem @_REGISTRY.register_read_partial(H5Group, IOSpec("categorical", "0.2.0")) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 536022036..0dcf89f46 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -6,7 +6,8 @@ from anndata._core.file_backing import AnnDataFileManager from anndata._core.index import Index from anndata._core.raw import Raw -from anndata.compat import _move_adj_mtx +from anndata._io.specs.registry import read_elem +from anndata.compat import _move_adj_mtx, _read_attr from anndata.utils import convert_to_dict import zarr @@ -17,9 +18,6 @@ class AxisArraysRemote(AxisArrays): - def __getitem__(self, key: str): - return self._data[key] - def __getattr__(self, __name: str): # If we a method has been accessed that is not here, try 
the pandas implementation if hasattr(pd.DataFrame, __name): @@ -30,7 +28,15 @@ def to_df(self) -> pd.DataFrame: """Convert to pandas dataframe.""" df = pd.DataFrame(index=self.dim_names) for key in self.keys(): - value = self[key][()] + z = self[key] + if isinstance(z, zarr.Group): + value = pd.Categorical.from_codes( + codes=read_elem(z["codes"]), + categories=read_elem(z["categories"]), + ordered=bool(_read_attr(z.attrs, "ordered")), + ) + else: + value = z[()] df[key] = value return df diff --git a/anndata/utils.py b/anndata/utils.py index 2ab0a6b36..d0e820cee 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -6,6 +6,7 @@ import pandas as pd import numpy as np from scipy import sparse +import zarr from .logging import get_logger from ._core.sparse_dataset import SparseDataset @@ -66,6 +67,8 @@ def dim_len(x, axis): Returns None if `x` is an awkward array with variable length in the requested dimension. """ + if isinstance(x, zarr.Group): + return x["codes"].shape[axis] return x.shape[axis] From 66c86fe3c50b0e47a993271ed9f4aaccc5d933d3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Feb 2023 16:41:54 +0100 Subject: [PATCH 018/125] (chore): remove TODO --- anndata/_io/specs/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 46e3c20ff..2bcee4882 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -688,7 +688,7 @@ def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) -def read_categorical(elem, _reader): # TODO: Going to need a lazy version of this +def read_categorical(elem, _reader): return elem From fca6fe5fc47f2392b496cda15115abd1e5ef86ea Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Feb 2023 16:46:35 +0100 Subject: [PATCH 019/125] (chore): write up to-do's --- anndata/experimental/read_remote/read_remote.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 0dcf89f46..6f1f3b2a4 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -42,6 +42,12 @@ def to_df(self) -> pd.DataFrame: class AnnDataRemote(AnnData): + # TODO's here: + # 1. Get an in-place copying system running + # 2. Get a better sparse access pattern + # 3. Re-write dataset with better chunking + # 4. Custom Zarr Group for categorical data? + def __init__( self, X=None, From 2f31f91c90a8e837f607b37bf17b8f24b67dcbb1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 23 Feb 2023 16:51:47 +0100 Subject: [PATCH 020/125] (chore): add head method --- anndata/experimental/read_remote/read_remote.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 6f1f3b2a4..4716a80c8 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -47,6 +47,7 @@ class AnnDataRemote(AnnData): # 2. Get a better sparse access pattern # 3. Re-write dataset with better chunking # 4. Custom Zarr Group for categorical data? + # 5. 
a `head` method

     def __init__(
         self,

From 2fad98e9c010f34891adce3a814174694d245588 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 27 Feb 2023 10:33:56 +0100
Subject: [PATCH 021/125] (chore): add better check for `to_df`

---
 anndata/experimental/read_remote/read_remote.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 4716a80c8..88b4de48e 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -29,7 +29,7 @@ def to_df(self) -> pd.DataFrame:
         df = pd.DataFrame(index=self.dim_names)
         for key in self.keys():
             z = self[key]
-            if isinstance(z, zarr.Group):
+            if isinstance(z, zarr.Group) and "codes" in z:  # categorical
                 value = pd.Categorical.from_codes(
                     codes=read_elem(z["codes"]),
                     categories=read_elem(z["categories"]),
                     ordered=bool(_read_attr(z.attrs, "ordered")),
                 )

From c07e71a91a25e6156b3f744bac9ed4ef5822823e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 27 Feb 2023 14:05:45 +0100
Subject: [PATCH 022/125] (feat): categorical zarr array.

---
 .../experimental/read_remote/read_remote.py | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 88b4de48e..8d9df9876 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -17,6 +17,34 @@ from .utils import read_dispatched
 
+# Initialization from something like
+# CategoricalZarrArray(adata_local.obs['cell_type']['codes'].store, 'obs/cell_type/codes')
+# Will need to work out the API better once I understand what the best practice here is.
+class CategoricalZarrArray(zarr.core.Array):
+    def __init__(self, store, path, *args, **kwargs):
+        super().__init__(store.store, path, *args, **kwargs)
+        root_path = (store.store.path + "/" + path).replace("/codes", "")
+        self.categories = zarr.open(root_path + "/categories")[()]
+        root = zarr.open(root_path)
+        self.ordered = bool(
+            _read_attr(root.attrs, "ordered") if "ordered" in root.attrs else False
+        )
+
+    def __array__(self, *args):  # may need to override this, copied for now
+        a = self[...]
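+        # numpy's __array__ protocol may pass a dtype as its only argument;
+        # honor it below by casting the materialized array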
+ if args: + a = a.astype(args[0]) + return a + + def __getitem__(self, selection): + result = super().__getitem__(selection) + return pd.Categorical.from_codes( + codes=result, + categories=self.categories, + ordered=self.ordered, + ) + + class AxisArraysRemote(AxisArrays): def __getattr__(self, __name: str): # If we a method has been accessed that is not here, try the pandas implementation From a785310ce3dca26bf585056d97f99b6a34bd2e25 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 27 Feb 2023 09:41:47 +0000 Subject: [PATCH 023/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/_core/__init__.py | 2 +- anndata/experimental/read_remote/__init__.py | 2 +- anndata/experimental/read_remote/utils.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/anndata/_core/__init__.py b/anndata/_core/__init__.py index 31018446e..1fe157178 100644 --- a/anndata/_core/__init__.py +++ b/anndata/_core/__init__.py @@ -1,2 +1,2 @@ from .anndata import AnnData -from .aligned_mapping import AxisArrays \ No newline at end of file +from .aligned_mapping import AxisArrays diff --git a/anndata/experimental/read_remote/__init__.py b/anndata/experimental/read_remote/__init__.py index 3d20b9dc1..67597aa49 100644 --- a/anndata/experimental/read_remote/__init__.py +++ b/anndata/experimental/read_remote/__init__.py @@ -1 +1 @@ -from .utils import read_dispatched, write_dispatched \ No newline at end of file +from .utils import read_dispatched, write_dispatched diff --git a/anndata/experimental/read_remote/utils.py b/anndata/experimental/read_remote/utils.py index 79d801ef6..1d62956bf 100644 --- a/anndata/experimental/read_remote/utils.py +++ b/anndata/experimental/read_remote/utils.py @@ -2,6 +2,7 @@ from anndata._io.specs import read_elem, write_elem + def read_dispatched(store, callback): from anndata._io.specs import Reader, _REGISTRY @@ -15,4 +16,4 @@ def write_dispatched(store, key, elem, callback, dataset_kwargs=MappingProxyType writer = Writer(_REGISTRY, callback=callback) - writer.write_elem(store, key, elem, dataset_kwargs=dataset_kwargs) \ No newline at end of file + writer.write_elem(store, key, elem, dataset_kwargs=dataset_kwargs) From 85a900645a76cfe6efca0da221d00919cb31a028 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 1 Mar 2023 10:37:50 +0100 Subject: [PATCH 024/125] (feat): add categorical array to the `read_remote` --- anndata/experimental/read_remote/read_remote.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 8d9df9876..f723d3ab7 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -21,14 +21,11 @@ # CategoricalZarrArray(adata_local.obs['cell_type']['codes'].store, 'obs/cell_type/codes') # Will need to work out the API better once I understand what the best practice here is. 
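# A hedged sketch of the group-based construction introduced below (the store
# layout is assumed, not part of this diff):
# >>> g = zarr.open("adata.zarr")["obs/cell_type"]
# >>> arr = CategoricalZarrArray(g)
# >>> arr[:5]  # codes are decoded into a pd.Categorical on access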
class CategoricalZarrArray(zarr.core.Array): - def __init__(self, store, path, *args, **kwargs): - super().__init__(store.store, path, *args, **kwargs) - root_path = (store.store.path + "/" + path).replace("/codes", "") - self.categories = zarr.open(root_path + "/categories")[()] - root = zarr.open(root_path) - self.ordered = bool( - _read_attr(root.attrs, "ordered") if "ordered" in root.attrs else False - ) + def __init__(self, group, *args, **kwargs): + codes_path = group.path + "/codes" + super().__init__(group.store.store, codes_path, *args, **kwargs) + self.categories = group["categories"][()] + self.ordered = bool(_read_attr(group.attrs, "ordered")) def __array__(self, *args): # may need to override this, copied for now a = self[...] @@ -707,6 +704,8 @@ def callback(func, elem_name: str, elem, iospec): elif elem_name in {"/obs", "/var"}: # override to only return AxisArray that will be accessed specially via our special AnnData object return {k: read_dispatched(v, callback) for k, v in elem.items()} + elif iospec.encoding_type == "categorical": + return CategoricalZarrArray(elem) return func(elem) adata = read_dispatched(f, callback=callback) From 568241acd1592dcfcdf3f8ad507431304531ca8e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 1 Mar 2023 10:38:09 +0100 Subject: [PATCH 025/125] (chore): remove todo --- anndata/experimental/read_remote/read_remote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index f723d3ab7..419489520 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -71,7 +71,6 @@ class AnnDataRemote(AnnData): # 1. Get an in-place copying system running # 2. Get a better sparse access pattern # 3. Re-write dataset with better chunking - # 4. Custom Zarr Group for categorical data? # 5. a `head` method def __init__( From a5bd7dcdebf52e2220cfd5f8b80c5c639c6fa1cd Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 1 Mar 2023 10:42:55 +0100 Subject: [PATCH 026/125] (chore): remove commented out parts --- .../experimental/read_remote/read_remote.py | 446 ------------------ 1 file changed, 446 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 419489520..e6de4c41d 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -240,452 +240,6 @@ def obs_names(self, names: Sequence[str]): def var_names(self, names: Sequence[str]): self._var_names = names - # def obs_vector(self, k: str, *, layer: Optional[str] = None) -> np.ndarray: - # """\ - # Convenience function for returning a 1 dimensional ndarray of values - # from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. - - # Made for convenience, not performance. - # Intentionally permissive about arguments, for easy iterative use. - - # Params - # ------ - # k - # Key to use. Should be in :attr:`var_names` or :attr:`obs`\\ `.columns`. - # layer - # What layer values should be returned from. If `None`, :attr:`X` is used. - - # Returns - # ------- - # A one dimensional nd array, with values for each obs in the same order - # as :attr:`obs_names`. - # """ - # if layer == "X": - # if "X" in self.layers: - # pass - # else: - # warnings.warn( - # "In a future version of AnnData, access to `.X` by passing" - # " `layer='X'` will be removed. 
Instead pass `layer=None`.", - # FutureWarning, - # ) - # layer = None - # return get_vector(self, k, "obs", "var", layer=layer) - - # def var_vector(self, k, *, layer: Optional[str] = None) -> np.ndarray: - # """\ - # Convenience function for returning a 1 dimensional ndarray of values - # from :attr:`X`, :attr:`layers`\\ `[k]`, or :attr:`obs`. - - # Made for convenience, not performance. Intentionally permissive about - # arguments, for easy iterative use. - - # Params - # ------ - # k - # Key to use. Should be in :attr:`obs_names` or :attr:`var`\\ `.columns`. - # layer - # What layer values should be returned from. If `None`, :attr:`X` is used. - - # Returns - # ------- - # A one dimensional nd array, with values for each var in the same order - # as :attr:`var_names`. - # """ - # if layer == "X": - # if "X" in self.layers: - # pass - # else: - # warnings.warn( - # "In a future version of AnnData, access to `.X` by passing " - # "`layer='X'` will be removed. Instead pass `layer=None`.", - # FutureWarning, - # ) - # layer = None - # # return get_vector(self, k, "var", "obs", layer=layer) - - # def to_memory(self, copy=True) -> "AnnData": - # """Return a new AnnData object with all backed arrays loaded into memory. - - # Params - # ------ - # copy: - # Whether the arrays that are already in-memory should be copied. - - # Example - # ------- - - # .. code:: python - - # import anndata - # backed = anndata.read_h5ad("file.h5ad", backed="r") - # mem = backed[backed.obs["cluster"] == "a", :].to_memory() - # """ - # new = {} - # for attr_name in [ - # "X", - # "obs", - # "var", - # "obsm", - # "varm", - # "obsp", - # "varp", - # "layers", - # "uns", - # ]: - # attr = getattr(self, attr_name, None) - # if attr is not None: - # new[attr_name] = to_memory(attr, copy) - - # if self.raw is not None: - # new["raw"] = { - # "X": to_memory(self.raw.X, copy), - # "var": to_memory(self.raw.var, copy), - # "varm": to_memory(self.raw.varm, copy), - # } - - # if self.isbacked: - # self.file.close() - - # # return AnnData(**new) - - # def concatenate( - # self, - # *adatas: "AnnData", - # join: str = "inner", - # batch_key: str = "batch", - # batch_categories: Sequence[Any] = None, - # uns_merge: Optional[str] = None, - # index_unique: Optional[str] = "-", - # fill_value=None, - # ) -> "AnnData": - # """\ - # Concatenate along the observations axis. - - # The :attr:`uns`, :attr:`varm` and :attr:`obsm` attributes are ignored. - - # Currently, this works only in `'memory'` mode. - - # .. note:: - - # For more flexible and efficient concatenation, see: :func:`~anndata.concat`. - - # Parameters - # ---------- - # adatas - # AnnData matrices to concatenate with. Each matrix is referred to as - # a “batch”. - # join - # Use intersection (`'inner'`) or union (`'outer'`) of variables. - # batch_key - # Add the batch annotation to :attr:`obs` using this key. - # batch_categories - # Use these as categories for the batch annotation. By default, use increasing numbers. - # uns_merge - # Strategy to use for merging entries of uns. These strategies are applied recusivley. - # Currently implemented strategies include: - - # * `None`: The default. The concatenated object will just have an empty dict for `uns`. - # * `"same"`: Only entries which have the same value in all AnnData objects are kept. - # * `"unique"`: Only entries which have one unique value in all AnnData objects are kept. - # * `"first"`: The first non-missing value is used. 
- # * `"only"`: A value is included if only one of the AnnData objects has a value at this - # path. - # index_unique - # Make the index unique by joining the existing index names with the - # batch category, using `index_unique='-'`, for instance. Provide - # `None` to keep existing indices. - # fill_value - # Scalar value to fill newly missing values in arrays with. Note: only applies to arrays - # and sparse matrices (not dataframes) and will only be used if `join="outer"`. - - # .. note:: - # If not provided, the default value is `0` for sparse matrices and `np.nan` - # for numpy arrays. See the examples below for more information. - - # Returns - # ------- - # :class:`~anndata.AnnData` - # The concatenated :class:`~anndata.AnnData`, where `adata.obs[batch_key]` - # stores a categorical variable labeling the batch. - - # Notes - # ----- - - # .. warning:: - - # If you use `join='outer'` this fills 0s for sparse data when - # variables are absent in a batch. Use this with care. Dense data is - # filled with `NaN`. See the examples. - - # Examples - # -------- - # Joining on intersection of variables. - - # >>> adata1 = AnnData( - # ... np.array([[1, 2, 3], [4, 5, 6]]), - # ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), - # ... dict(var_names=['a', 'b', 'c'], annoA=[0, 1, 2]), - # ... ) - # >>> adata2 = AnnData( - # ... np.array([[1, 2, 3], [4, 5, 6]]), - # ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), - # ... dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]), - # ... ) - # >>> adata3 = AnnData( - # ... np.array([[1, 2, 3], [4, 5, 6]]), - # ... dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']), - # ... dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]), - # ... ) - # >>> adata = adata1.concatenate(adata2, adata3) - # >>> adata - # AnnData object with n_obs × n_vars = 6 × 2 - # obs: 'anno1', 'anno2', 'batch' - # var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' - # >>> adata.X - # array([[2, 3], - # [5, 6], - # [3, 2], - # [6, 5], - # [3, 2], - # [6, 5]]) - # >>> adata.obs - # anno1 anno2 batch - # s1-0 c1 NaN 0 - # s2-0 c2 NaN 0 - # s3-1 c3 NaN 1 - # s4-1 c4 NaN 1 - # s1-2 NaN d3 2 - # s2-2 NaN d4 2 - # >>> adata.var.T - # b c - # annoA-0 1 2 - # annoA-1 2 1 - # annoA-2 3 2 - # annoB-2 2 1 - - # Joining on the union of variables. 
- - # >>> outer = adata1.concatenate(adata2, adata3, join='outer') - # >>> outer - # AnnData object with n_obs × n_vars = 6 × 4 - # obs: 'anno1', 'anno2', 'batch' - # var: 'annoA-0', 'annoA-1', 'annoA-2', 'annoB-2' - # >>> outer.var.T - # a b c d - # annoA-0 0.0 1.0 2.0 NaN - # annoA-1 NaN 2.0 1.0 0.0 - # annoA-2 NaN 3.0 2.0 0.0 - # annoB-2 NaN 2.0 1.0 0.0 - # >>> outer.var_names - # Index(['a', 'b', 'c', 'd'], dtype='object') - # >>> outer.X - # array([[ 1., 2., 3., nan], - # [ 4., 5., 6., nan], - # [nan, 3., 2., 1.], - # [nan, 6., 5., 4.], - # [nan, 3., 2., 1.], - # [nan, 6., 5., 4.]]) - # >>> outer.X.sum(axis=0) - # array([nan, 25., 23., nan]) - # >>> import pandas as pd - # >>> Xdf = pd.DataFrame(outer.X, columns=outer.var_names) - # >>> Xdf - # a b c d - # 0 1.0 2.0 3.0 NaN - # 1 4.0 5.0 6.0 NaN - # 2 NaN 3.0 2.0 1.0 - # 3 NaN 6.0 5.0 4.0 - # 4 NaN 3.0 2.0 1.0 - # 5 NaN 6.0 5.0 4.0 - # >>> Xdf.sum() - # a 5.0 - # b 25.0 - # c 23.0 - # d 10.0 - # dtype: float64 - - # One way to deal with missing values is to use masked arrays: - - # >>> from numpy import ma - # >>> outer.X = ma.masked_invalid(outer.X) - # >>> outer.X - # masked_array( - # data=[[1.0, 2.0, 3.0, --], - # [4.0, 5.0, 6.0, --], - # [--, 3.0, 2.0, 1.0], - # [--, 6.0, 5.0, 4.0], - # [--, 3.0, 2.0, 1.0], - # [--, 6.0, 5.0, 4.0]], - # mask=[[False, False, False, True], - # [False, False, False, True], - # [ True, False, False, False], - # [ True, False, False, False], - # [ True, False, False, False], - # [ True, False, False, False]], - # fill_value=1e+20) - # >>> outer.X.sum(axis=0).data - # array([ 5., 25., 23., 10.]) - - # The masked array is not saved but has to be reinstantiated after saving. - - # >>> outer.write('./test.h5ad') - # >>> from anndata import read_h5ad - # >>> outer = read_h5ad('./test.h5ad') - # >>> outer.X - # array([[ 1., 2., 3., nan], - # [ 4., 5., 6., nan], - # [nan, 3., 2., 1.], - # [nan, 6., 5., 4.], - # [nan, 3., 2., 1.], - # [nan, 6., 5., 4.]]) - - # For sparse data, everything behaves similarly, - # except that for `join='outer'`, zeros are added. - - # >>> from scipy.sparse import csr_matrix - # >>> adata1 = AnnData( - # ... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32), - # ... dict(obs_names=['s1', 's2'], anno1=['c1', 'c2']), - # ... dict(var_names=['a', 'b', 'c']), - # ... ) - # >>> adata2 = AnnData( - # ... csr_matrix([[0, 2, 3], [0, 5, 6]], dtype=np.float32), - # ... dict(obs_names=['s3', 's4'], anno1=['c3', 'c4']), - # ... dict(var_names=['d', 'c', 'b']), - # ... ) - # >>> adata3 = AnnData( - # ... csr_matrix([[1, 2, 0], [0, 5, 6]], dtype=np.float32), - # ... dict(obs_names=['s5', 's6'], anno2=['d3', 'd4']), - # ... dict(var_names=['d', 'c', 'b']), - # ... ) - # >>> adata = adata1.concatenate(adata2, adata3, join='outer') - # >>> adata.var_names - # Index(['a', 'b', 'c', 'd'], dtype='object') - # >>> adata.X.toarray() - # array([[0., 2., 3., 0.], - # [0., 5., 6., 0.], - # [0., 3., 2., 0.], - # [0., 6., 5., 0.], - # [0., 0., 2., 1.], - # [0., 6., 5., 0.]], dtype=float32) - # """ - # from .merge import concat, merge_outer, merge_dataframes, merge_same - - # warnings.warn( - # "The AnnData.concatenate method is deprecated in favour of the " - # "anndata.concat function. 
Please use anndata.concat instead.\n\n" - # "See the tutorial for concat at: " - # "https://anndata.readthedocs.io/en/latest/concatenation.html", - # FutureWarning, - # ) - - # if self.isbacked: - # raise ValueError("Currently, concatenate only works in memory mode.") - - # if len(adatas) == 0: - # return self.copy() - # elif len(adatas) == 1 and not isinstance(adatas[0], AnnData): - # adatas = adatas[0] # backwards compatibility - # all_adatas = (self,) + tuple(adatas) - - # out = concat( - # all_adatas, - # axis=0, - # join=join, - # label=batch_key, - # keys=batch_categories, - # uns_merge=uns_merge, - # fill_value=fill_value, - # index_unique=index_unique, - # pairwise=False, - # ) - - # # Backwards compat (some of this could be more efficient) - # # obs used to always be an outer join - # out.obs = concat( - # [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas], - # axis=0, - # join="outer", - # label=batch_key, - # keys=batch_categories, - # index_unique=index_unique, - # ).obs - # # Removing varm - # del out.varm - # # Implementing old-style merging of var - # if batch_categories is None: - # batch_categories = np.arange(len(all_adatas)).astype(str) - # pat = rf"-({'|'.join(batch_categories)})$" - # out.var = merge_dataframes( - # [a.var for a in all_adatas], - # out.var_names, - # partial(merge_outer, batch_keys=batch_categories, merge=merge_same), - # ) - # out.var = out.var.iloc[ - # :, - # ( - # out.var.columns.str.extract(pat, expand=False) - # .fillna("") - # .argsort(kind="stable") - # ), - # ] - - # return out - - # def var_names_make_unique(self, join: str = "-"): - # # Important to go through the setter so obsm dataframes are updated too - # self.var_names = utils.make_index_unique(self.var.index, join) - - # var_names_make_unique.__doc__ = utils.make_index_unique.__doc__ - - # def obs_names_make_unique(self, join: str = "-"): - # # Important to go through the setter so obsm dataframes are updated too - # self.obs_names = utils.make_index_unique(self.obs.index, join) - - # obs_names_make_unique.__doc__ = utils.make_index_unique.__doc__ - - def __contains__(self, key: Any): - raise AttributeError( - "AnnData has no attribute __contains__, don;t check `in adata`." - ) - - # def _check_dimensions(self, key=None): - # if key is None: - # key = {"obs", "var", "obsm", "varm"} - # else: - # key = {key} - # if "obs" in key and len(self._obs) != self._n_obs: - # raise ValueError( - # "Observations annot. `obs` must have number of rows of `X`" - # f" ({self._n_obs}), but has {self._obs.shape[0]} rows." - # ) - # if "var" in key and len(self._var) != self._n_vars: - # raise ValueError( - # "Variables annot. `var` must have number of columns of `X`" - # f" ({self._n_vars}), but has {self._var.shape[0]} rows." - # ) - # if "obsm" in key: - # obsm = self._obsm - # if ( - # not all([o.shape[0] == self._n_obs for o in obsm.values()]) - # and len(obsm.dim_names) != self._n_obs - # ): - # raise ValueError( - # "Observations annot. `obsm` must have number of rows of `X`" - # f" ({self._n_obs}), but has {len(obsm)} rows." - # ) - # if "varm" in key: - # varm = self._varm - # if ( - # not all([v.shape[0] == self._n_vars for v in varm.values()]) - # and len(varm.dim_names) != self._n_vars - # ): - # raise ValueError( - # "Variables annot. `varm` must have number of columns of `X`" - # f" ({self._n_vars}), but has {len(varm)} rows." 
- # ) - def read_remote(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: if isinstance(store, Path): From c1b090cb998be3ab541ce012f116d9ddb41ad071 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 1 Mar 2023 10:49:32 +0100 Subject: [PATCH 027/125] (chore): remove more unused methods --- anndata/experimental/read_remote/read_remote.py | 17 +---------------- 1 file changed, 1 insertion(+), 16 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index e6de4c41d..87e62510b 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -198,22 +198,7 @@ def __init__( # layers self._layers = Layers(self, layers) - def __eq__(self, other): - """Equality testing""" - raise NotImplementedError( - "Equality comparisons are not supported for AnnData objects, " - "instead compare the desired attributes." - ) - - def obs_keys(self) -> List[str]: - """List keys of observation annotation :attr:`obs`.""" - return self._obs.keys() - - def var_keys(self) -> List[str]: - """List keys of variable annotation :attr:`var`.""" - return self._var.keys() - - # TODO: this is not quite complete... + # TODO: this is not quite complete in the original but also here, what do we do about this? def __delitem__(self, index: Index): obs, var = self._normalize_indices(index) # TODO: does this really work? From ef3dd22029d6be85bc614fe8adfac5cf61c6afe7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 1 Mar 2023 10:59:22 +0100 Subject: [PATCH 028/125] (chore): more cleanup --- anndata/_io/specs/methods.py | 8 ++++++-- anndata/experimental/read_remote/read_remote.py | 2 ++ anndata/utils.py | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 2bcee4882..ef4766e7c 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -320,7 +320,7 @@ def write_basic_dask(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(ZarrArray, IOSpec("array", "0.2.0")) @_REGISTRY.register_read(ZarrArray, IOSpec("string-array", "0.2.0")) def read_array(elem, _reader): - return elem + return elem[()] @_REGISTRY.register_read_partial(H5Array, IOSpec("array", "0.2.0")) @@ -689,7 +689,11 @@ def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("categorical", "0.2.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("categorical", "0.2.0")) def read_categorical(elem, _reader): - return elem + return pd.Categorical.from_codes( + codes=_reader.read_elem(elem["codes"]), + categories=_reader.read_elem(elem["categories"]), + ordered=bool(_read_attr(elem.attrs, "ordered")), + ) @_REGISTRY.register_read_partial(H5Group, IOSpec("categorical", "0.2.0")) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 87e62510b..a6565a917 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -244,6 +244,8 @@ def callback(func, elem_name: str, elem, iospec): return {k: read_dispatched(v, callback) for k, v in elem.items()} elif iospec.encoding_type == "categorical": return CategoricalZarrArray(elem) + elif iospec.encoding_type in {"array", "string_array"}: + return elem return func(elem) adata = read_dispatched(f, callback=callback) diff --git a/anndata/utils.py b/anndata/utils.py index d0e820cee..7fa35ca1d 100644 --- 
a/anndata/utils.py +++ b/anndata/utils.py @@ -8,6 +8,8 @@ from scipy import sparse import zarr +from anndata.experimental.read_remote.read_remote import CategoricalZarrArray + from .logging import get_logger from ._core.sparse_dataset import SparseDataset @@ -67,8 +69,6 @@ def dim_len(x, axis): Returns None if `x` is an awkward array with variable length in the requested dimension. """ - if isinstance(x, zarr.Group): - return x["codes"].shape[axis] return x.shape[axis] From 3b9b83899ac69fdc2d98c21e4d3ab5e66e6b2da9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 1 Mar 2023 11:00:49 +0100 Subject: [PATCH 029/125] (chore): remove unused imports from `utils` --- anndata/utils.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/anndata/utils.py b/anndata/utils.py index 7fa35ca1d..2ab0a6b36 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -6,9 +6,6 @@ import pandas as pd import numpy as np from scipy import sparse -import zarr - -from anndata.experimental.read_remote.read_remote import CategoricalZarrArray from .logging import get_logger from ._core.sparse_dataset import SparseDataset From 1827e26f584c3e487042e8b7a7c4cd88d4382a00 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 1 Mar 2023 11:15:46 +0100 Subject: [PATCH 030/125] (chore): refactor to use `cached_property` --- .../experimental/read_remote/read_remote.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index a6565a917..4c87acfc3 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -1,4 +1,5 @@ from collections import OrderedDict, abc as cabc +from functools import cached_property from pathlib import Path from typing import Any, MutableMapping, Union, List, Sequence from anndata._core.aligned_mapping import Layers, PairwiseArrays @@ -17,15 +18,23 @@ from .utils import read_dispatched -# Initialization from something like -# CategoricalZarrArray(adata_local.obs['cell_type']['codes'].store, 'obs/cell_type/codes') -# Will need to work out the API better once I understand what the best practice here is. +# TODO: Do we really need to subclass the Array class here? Ryan Abernathy seems to say "no" +# but I don't really want to mess with the methods. The downside is that (for some reason), it's +# reading the `zarray` of the `codes` path which should not have to happen, but I can't figure out a way around it. class CategoricalZarrArray(zarr.core.Array): def __init__(self, group, *args, **kwargs): codes_path = group.path + "/codes" super().__init__(group.store.store, codes_path, *args, **kwargs) - self.categories = group["categories"][()] - self.ordered = bool(_read_attr(group.attrs, "ordered")) + self._categories = group["categories"] + self._group_attrs = group.attrs + + @cached_property + def categories(self): + return self._categories[()] + + @cached_property + def ordered(self): + return bool(_read_attr(self._group_attrs, "ordered")) def __array__(self, *args): # may need to override this, copied for now a = self[...] 
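The `cached_property` refactor above defers reading `categories` and the
`ordered` flag until first access and then caches the result for the lifetime
of the object. A minimal standalone sketch of the same pattern (the class and
argument names here are illustrative, not the API above):

    from functools import cached_property

    class LazyCategories:
        def __init__(self, categories_arr, attrs):
            self._categories_arr = categories_arr  # e.g. a zarr array, not yet read
            self._attrs = dict(attrs)

        @cached_property
        def categories(self):
            # the first access reads the backing array; later accesses hit the cache
            return self._categories_arr[()]

        @cached_property
        def ordered(self):
            return bool(self._attrs.get("ordered", False))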
From 4c5dcbe3cd64f4b98492ac7ab84010d6906bae9a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 3 Mar 2023 11:33:20 +0100 Subject: [PATCH 031/125] (chore): more rebase cleanup --- anndata/_io/utils.py | 50 -------------------- anndata/experimental/read_remote/__init__.py | 2 +- anndata/experimental/read_remote/utils.py | 19 -------- 3 files changed, 1 insertion(+), 70 deletions(-) delete mode 100644 anndata/experimental/read_remote/utils.py diff --git a/anndata/_io/utils.py b/anndata/_io/utils.py index d4c2ba535..388f9f050 100644 --- a/anndata/_io/utils.py +++ b/anndata/_io/utils.py @@ -203,33 +203,6 @@ def func_wrapper(*args, **kwargs): except Exception as e: re_raise_error(e, elem) - if ismethod(func): - - @wraps(func) - def method_wrapper(self, elem, *args, **kwargs): - try: - return func(self, elem, *args, **kwargs) - except Exception as e: - re_raise_error(e, elem) - - return method_wrapper - - else: - # TODO: sometimes, something that looks an awful lot like a method is reaching here - # It's being passed an instance of a Reader, has a signature Reader.read_elem, but is a function - @wraps(func) - def func_wrapper(*args, **kwargs): - from anndata._io.specs import Reader - - # Figure out signature (method vs function) by going through args - for elem in args: - if not isinstance(elem, Reader): - break - try: - return func(*args, **kwargs) - except Exception as e: - re_raise_error(e, elem) - return func_wrapper @@ -248,29 +221,6 @@ def report_write_key_on_error(func): >>> write_arr(z, "X", X) # doctest: +SKIP """ - def re_raise_error(e, elem, key): - if "Above error raised while writing key" in format(e): - raise - else: - parent = _get_parent(elem) - raise type(e)( - f"{e}\n\n" - f"Above error raised while writing key {key!r} of {type(elem)} " - f"to {parent}" - ) from e - - # Need to specialize for method signature - if ismethod(func): - - @wraps(func) - def func_wrapper(self, elem, key, val, *args, **kwargs): - try: - return func(self, elem, key, val, *args, **kwargs) - except Exception as e: - re_raise_error(e, elem, key) - - else: - def re_raise_error(e, elem, key): if "Above error raised while writing key" in format(e): raise diff --git a/anndata/experimental/read_remote/__init__.py b/anndata/experimental/read_remote/__init__.py index 67597aa49..5cce3164a 100644 --- a/anndata/experimental/read_remote/__init__.py +++ b/anndata/experimental/read_remote/__init__.py @@ -1 +1 @@ -from .utils import read_dispatched, write_dispatched +from read_remote import read_remote diff --git a/anndata/experimental/read_remote/utils.py b/anndata/experimental/read_remote/utils.py deleted file mode 100644 index 1d62956bf..000000000 --- a/anndata/experimental/read_remote/utils.py +++ /dev/null @@ -1,19 +0,0 @@ -from types import MappingProxyType - -from anndata._io.specs import read_elem, write_elem - - -def read_dispatched(store, callback): - from anndata._io.specs import Reader, _REGISTRY - - reader = Reader(_REGISTRY, callback=callback) - - return reader.read_elem(store) - - -def write_dispatched(store, key, elem, callback, dataset_kwargs=MappingProxyType({})): - from anndata._io.specs import Writer, _REGISTRY - - writer = Writer(_REGISTRY, callback=callback) - - writer.write_elem(store, key, elem, dataset_kwargs=dataset_kwargs) From 009a4265f70a3f7b4cb0f21469b8a9c77d97b3e5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 3 Mar 2023 12:25:52 +0100 Subject: [PATCH 032/125] (fix): correct imports --- anndata/experimental/read_remote/__init__.py | 2 +- 
anndata/experimental/read_remote/read_remote.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/experimental/read_remote/__init__.py b/anndata/experimental/read_remote/__init__.py index 5cce3164a..1622afcb5 100644 --- a/anndata/experimental/read_remote/__init__.py +++ b/anndata/experimental/read_remote/__init__.py @@ -1 +1 @@ -from read_remote import read_remote +from .read_remote import read_remote diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 4c87acfc3..600cc5ee6 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -15,7 +15,7 @@ import pandas as pd from ..._core import AnnData, AxisArrays -from .utils import read_dispatched +from .. import read_dispatched # TODO: Do we really need to subclass the Array class here? Ryan Abernathy seems to say "no" From 289447e8a171f4e8e93ff10e03cb02f6197f48b2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 3 Mar 2023 12:51:41 +0100 Subject: [PATCH 033/125] (feat): begin base `AnnData` class --- anndata/_core/anndata.py | 8 +--- anndata/_core/anndata_base.py | 83 +++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+), 6 deletions(-) create mode 100644 anndata/_core/anndata_base.py diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index e845b045d..ab8ddee9a 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -28,6 +28,7 @@ from .index import _normalize_indices, _subset, Index, Index1D, get_vector from .file_backing import AnnDataFileManager, to_memory from .access import ElementRef +from .anndata_base import AnnDataBase from .aligned_mapping import ( AxisArrays, AxisArraysView, @@ -125,7 +126,7 @@ def _(anno, length, index_names): raise ValueError(f"Cannot convert {type(anno)} to DataFrame") -class AnnData(metaclass=utils.DeprecationMixinMeta): +class AnnData(AnnDataBase): """\ An annotated data matrix. @@ -581,11 +582,6 @@ def __eq__(self, other): "instead compare the desired attributes." 
)
 
-    @property
-    def shape(self) -> Tuple[int, int]:
-        """Shape of data matrix (:attr:`n_obs`, :attr:`n_vars`)."""
-        return self.n_obs, self.n_vars
-
     @property
     def X(self) -> Optional[Union[np.ndarray, sparse.spmatrix, ArrayView]]:
         """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`."""
diff --git a/anndata/_core/anndata_base.py b/anndata/_core/anndata_base.py
new file mode 100644
index 000000000..7a17faa9a
--- /dev/null
+++ b/anndata/_core/anndata_base.py
@@ -0,0 +1,83 @@
+from abc import abstractmethod
+from typing import Tuple
+from ..utils import DeprecationMixinMeta
+
+
+class AnnDataBase(metaclass=DeprecationMixinMeta):
+    @abstractmethod
+    def _init_as_actual(self, *args, **kwargs):
+        pass
+
+    @abstractmethod
+    def _init_as_view(self, *args, **kwargs):
+        pass
+
+    @property
+    @abstractmethod
+    def X(self):
+        pass
+
+    @property
+    @abstractmethod
+    def obs(self):
+        pass
+
+    @property
+    @abstractmethod
+    def obsm(self):
+        pass
+
+    @property
+    @abstractmethod
+    def obsp(self):
+        pass
+
+    @property
+    @abstractmethod
+    def var(self):
+        pass
+
+    @property
+    @abstractmethod
+    def uns(self):
+        pass
+
+    @property
+    @abstractmethod
+    def varm(self):
+        pass
+
+    @property
+    @abstractmethod
+    def varp(self):
+        pass
+
+    @property
+    @abstractmethod
+    def n_obs(self) -> int:
+        pass
+
+    @property
+    @abstractmethod
+    def obs_names(self):
+        pass
+
+    @property
+    @abstractmethod
+    def n_vars(self) -> int:
+        pass
+
+    @property
+    @abstractmethod
+    def var_names(self):
+        pass
+
+    @property
+    @abstractmethod
+    def is_view(self) -> bool:
+        pass
+
+    @property
+    def shape(self) -> Tuple[int, int]:
+        """Shape of data matrix (:attr:`n_obs`, :attr:`n_vars`)."""
+        return self.n_obs, self.n_vars

From 796839f426c0411e3ecdb89d9fa1d46b4ee3ebf1 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 7 Mar 2023 11:10:38 +0100
Subject: [PATCH 034/125] (feat): begin in-place view mechanism

---
 anndata/_core/anndata.py                      |  2 +-
 .../experimental/read_remote/read_remote.py   | 90 +++++++++++++++++--
 2 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py
index ab8ddee9a..0cfdb8548 100644
--- a/anndata/_core/anndata.py
+++ b/anndata/_core/anndata.py
@@ -279,7 +279,7 @@ def __init__(
         vidx: Index1D = None,
     ):
         if asview:
-            if not isinstance(X, AnnData):
+            if not issubclass(type(X), AnnData):
                 raise ValueError("`X` has to be an AnnData object.")
             self._init_as_view(X, oidx, vidx)
         else:
diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 600cc5ee6..6e151ad7e 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -1,4 +1,5 @@
 from collections import OrderedDict, abc as cabc
+from copy import copy
 from functools import cached_property
 from pathlib import Path
 from typing import Any, MutableMapping, Union, List, Sequence
@@ -7,12 +8,14 @@
 from anndata._core.file_backing import AnnDataFileManager
 from anndata._core.index import Index
 from anndata._core.raw import Raw
+from anndata._core.views import _resolve_idxs
 from anndata._io.specs.registry import read_elem
 from anndata.compat import _move_adj_mtx, _read_attr
 from anndata.utils import convert_to_dict
 
 import zarr
 import pandas as pd
+import numpy as np
 
 from ..._core import AnnData, AxisArrays
 from ..
import read_dispatched @@ -74,6 +77,14 @@ def to_df(self) -> pd.DataFrame: df[key] = value return df + @property + def iloc(self): + class IlocDispatch: + def __getitem__(self_iloc, idx): + return self._view(self.parent, (idx,)) + + return IlocDispatch() + class AnnDataRemote(AnnData): # TODO's here: @@ -82,7 +93,7 @@ class AnnDataRemote(AnnData): # 3. Re-write dataset with better chunking # 5. a `head` method - def __init__( + def _init_as_actual( self, X=None, obs=None, @@ -90,18 +101,14 @@ def __init__( uns=None, obsm=None, varm=None, - layers=None, + varp=None, + obsp=None, raw=None, + layers=None, dtype=None, shape=None, filename=None, filemode=None, - asview=False, - *, - obsp=None, - varp=None, - oidx=None, - vidx=None, ): # view attributes self._is_view = False @@ -207,6 +214,68 @@ def __init__( # layers self._layers = Layers(self, layers) + def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index): + if adata_ref.isbacked and adata_ref.is_view: + raise ValueError( + "Currently, you cannot index repeatedly into a backed AnnData, " + "that is, you cannot make a view of a view." + ) + self._is_view = True + if isinstance(oidx, (int, np.integer)): + if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs): + raise IndexError(f"Observation index `{oidx}` is out of range.") + oidx += adata_ref.n_obs * (oidx < 0) + oidx = slice(oidx, oidx + 1, 1) + if isinstance(vidx, (int, np.integer)): + if not (-adata_ref.n_vars <= vidx < adata_ref.n_vars): + raise IndexError(f"Variable index `{vidx}` is out of range.") + vidx += adata_ref.n_vars * (vidx < 0) + vidx = slice(vidx, vidx + 1, 1) + if adata_ref.is_view: + prev_oidx, prev_vidx = adata_ref._oidx, adata_ref._vidx + adata_ref = adata_ref._adata_ref + oidx, vidx = _resolve_idxs((prev_oidx, prev_vidx), (oidx, vidx), adata_ref) + # self._adata_ref is never a view + self._adata_ref = adata_ref + self._oidx = oidx + self._vidx = vidx + # the file is the same as of the reference object + self.file = adata_ref.file + # views on attributes of adata_ref + obs_sub = adata_ref.obs.iloc[oidx] + var_sub = adata_ref.var.iloc[vidx] + self._obsm = adata_ref.obsm._view(self, (oidx,)) + self._varm = adata_ref.varm._view(self, (vidx,)) + self._layers = adata_ref.layers._view(self, (oidx, vidx)) + self._obsp = adata_ref.obsp._view(self, oidx) + self._varp = adata_ref.varp._view(self, vidx) + # fix categories + uns = copy(adata_ref._uns) + self._remove_unused_categories(adata_ref.obs, obs_sub, uns) + self._remove_unused_categories(adata_ref.var, var_sub, uns) + # set attributes + self._obs = adata_ref.obs._view(self, (oidx,)) + self._var = adata_ref.var._view(self, (vidx,)) + self._uns = uns + self._n_obs = len( + self.obs["index"] if "index" in self.obs else self.obs["_index"] + ) + self._n_vars = len( + self.var["index"] if "index" in self.var else self.var["_index"] + ) + + # set data + if self.isbacked: + self._X = None + + # set raw, easy, as it’s immutable anyways... + if adata_ref._raw is not None: + # slicing along variables axis is ignored + self._raw = adata_ref.raw[oidx] + self._raw._adata = self + else: + self._raw = None + # TODO: this is not quite complete in the original but also here, what do we do about this? 
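    # A hedged sketch of the slicing flow wired up above (a zarr-backed
    # AnnDataRemote `adata` is assumed; `__getitem__` below produces the view):
    # >>> view = adata[0:100]
    # >>> view.is_view, view.n_obs
    # (True, 100)
    # Note that `_resolve_idxs` composes indices so `_adata_ref` always points
    # at a non-view base object.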
def __delitem__(self, index: Index): obs, var = self._normalize_indices(index) @@ -218,6 +287,11 @@ def __delitem__(self, index: Index): del X[obs, var] self._set_backed("X", X) + def __getitem__(self, index: Index) -> "AnnData": + """Returns a sliced view of the object.""" + oidx, vidx = self._normalize_indices(index) + return AnnDataRemote(self, oidx=oidx, vidx=vidx, asview=True) + @property def obs_names(self) -> pd.Index: return self._obs_names From f2f4acaa7c98cf8a4e39172046f4ecb390f66db5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 7 Mar 2023 11:11:21 +0100 Subject: [PATCH 035/125] (chore): remove from to-do list, at least for now --- anndata/experimental/read_remote/read_remote.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 6e151ad7e..3f874c72f 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -88,7 +88,6 @@ def __getitem__(self_iloc, idx): class AnnDataRemote(AnnData): # TODO's here: - # 1. Get an in-place copying system running # 2. Get a better sparse access pattern # 3. Re-write dataset with better chunking # 5. a `head` method From e8f654bf7b9dea69df101eb60d17a52f690d25c1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 7 Mar 2023 12:11:45 +0100 Subject: [PATCH 036/125] (feat): abstract `_init_as_actual` --- anndata/_core/anndata_base.py | 115 +++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 2 deletions(-) diff --git a/anndata/_core/anndata_base.py b/anndata/_core/anndata_base.py index 7a17faa9a..598a0a800 100644 --- a/anndata/_core/anndata_base.py +++ b/anndata/_core/anndata_base.py @@ -4,12 +4,123 @@ class AnnDataBase(metaclass=DeprecationMixinMeta): + def _init_as_actual( + self, + X=None, + obs=None, + var=None, + uns=None, + obsm=None, + varm=None, + varp=None, + obsp=None, + raw=None, + layers=None, + dtype=None, + shape=None, + filename=None, + filemode=None, + ): + ( + X, + obs, + var, + uns, + obsm, + varm, + obsp, + varp, + layers, + raw, + ) = self._reformat_axes_args_from_X( + self, + X, + obs, + var, + uns, + obsm, + varm, + obsp, + varp, + layers, + raw, + ) + self._assign_X(self, X, shape, dtype) + + self._initialize_indices() + assert self.n_obs == len( + self.obs_names + ) # after initializing indices, these should be True + assert self.n_obs == self.shape[0] + assert self.n_vars == len(self.var_names) + assert self.n_vars == self.shape[1] + if self.X is not None: + assert self.n_obs == self.X.shape[0] + assert self.n_vars == self.X.shape[1] + + self._assign_obs(obs) + self._assign_var(var) + self._assign_obsm(obsm) + self._assign_varm(varm) + self._assign_obsp(obsp) + self._assign_varp(varp) + self._assign_layers(layers) + self._run_checks() + self._cleanup() + @abstractmethod - def _init_as_actual(self, *args, **kwargs): + def _init_as_view(self, *args, **kwargs): pass @abstractmethod - def _init_as_view(self, *args, **kwargs): + def _assign_X(self, X, shape, dtype): + pass + + def _reformat_axes_args_from_X(self, *args): + return args + + @abstractmethod + def _initialize_indices(self, *args): + pass + + @abstractmethod + def _initialize_indices(self, *args): + pass + + @abstractmethod + def _assign_obs(self, obs): + pass + + @abstractmethod + def _assign_var(self, var): + pass + + @abstractmethod + def _assign_obsm(self, obsm): + pass + + @abstractmethod + def _assign_varm(self, varm): + pass + + @abstractmethod + def _assign_obsp(self, obsp): + pass + 
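[Note, not part of the patch series: `_init_as_view` above collapses chained
subsetting with `_resolve_idxs` so that a view always references a non-view
`_adata_ref`. A minimal, self-contained sketch of that index composition —
hypothetical variable names, plain NumPy:]

    import numpy as np

    x = np.arange(10) * 100          # stand-in for one axis of an on-disk matrix
    first = np.arange(2, 8)          # first subset: rows 2..7
    second = np.array([0, 2, 3])     # a subset of the subset

    # Resolve the chained selection into one index against the original object,
    # which is what lets a "view of a view" point straight at the original.
    composed = first[second]         # -> rows 2, 4, 5 of the original
    assert np.array_equal(x[first][second], x[composed])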
From f2f4acaa7c98cf8a4e39172046f4ecb390f66db5 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 7 Mar 2023 11:11:21 +0100
Subject: [PATCH 035/125] (chore): remove from to-do list, at least for now

---
 anndata/experimental/read_remote/read_remote.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 6e151ad7e..3f874c72f 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -88,7 +88,6 @@ def __getitem__(self_iloc, idx):

 class AnnDataRemote(AnnData):
     # TODO's here:
-    # 1. Get an in-place copying system running
     # 2. Get a better sparse access pattern
     # 3. Re-write dataset with better chunking
     # 5. a `head` method

From e8f654bf7b9dea69df101eb60d17a52f690d25c1 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 7 Mar 2023 12:11:45 +0100
Subject: [PATCH 036/125] (feat): abstract `_init_as_actual`

---
 anndata/_core/anndata_base.py | 115 +++++++++++++++++++++++++++++++++-
 1 file changed, 113 insertions(+), 2 deletions(-)

diff --git a/anndata/_core/anndata_base.py b/anndata/_core/anndata_base.py
index 7a17faa9a..598a0a800 100644
--- a/anndata/_core/anndata_base.py
+++ b/anndata/_core/anndata_base.py
@@ -4,12 +4,123 @@


 class AnnDataBase(metaclass=DeprecationMixinMeta):
+    def _init_as_actual(
+        self,
+        X=None,
+        obs=None,
+        var=None,
+        uns=None,
+        obsm=None,
+        varm=None,
+        varp=None,
+        obsp=None,
+        raw=None,
+        layers=None,
+        dtype=None,
+        shape=None,
+        filename=None,
+        filemode=None,
+    ):
+        (
+            X,
+            obs,
+            var,
+            uns,
+            obsm,
+            varm,
+            obsp,
+            varp,
+            layers,
+            raw,
+        ) = self._reformat_axes_args_from_X(
+            self,
+            X,
+            obs,
+            var,
+            uns,
+            obsm,
+            varm,
+            obsp,
+            varp,
+            layers,
+            raw,
+        )
+        self._assign_X(self, X, shape, dtype)
+
+        self._initialize_indices()
+        assert self.n_obs == len(
+            self.obs_names
+        )  # after initializing indices, these should be True
+        assert self.n_obs == self.shape[0]
+        assert self.n_vars == len(self.var_names)
+        assert self.n_vars == self.shape[1]
+        if self.X is not None:
+            assert self.n_obs == self.X.shape[0]
+            assert self.n_vars == self.X.shape[1]
+
+        self._assign_obs(obs)
+        self._assign_var(var)
+        self._assign_obsm(obsm)
+        self._assign_varm(varm)
+        self._assign_obsp(obsp)
+        self._assign_varp(varp)
+        self._assign_layers(layers)
+        self._run_checks()
+        self._cleanup()
+
     @abstractmethod
-    def _init_as_actual(self, *args, **kwargs):
+    def _init_as_view(self, *args, **kwargs):
         pass

     @abstractmethod
-    def _init_as_view(self, *args, **kwargs):
+    def _assign_X(self, X, shape, dtype):
+        pass
+
+    def _reformat_axes_args_from_X(self, *args):
+        return args
+
+    @abstractmethod
+    def _initialize_indices(self, *args):
+        pass
+
+    @abstractmethod
+    def _initialize_indices(self, *args):
+        pass
+
+    @abstractmethod
+    def _assign_obs(self, obs):
+        pass
+
+    @abstractmethod
+    def _assign_var(self, var):
+        pass
+
+    @abstractmethod
+    def _assign_obsm(self, obsm):
+        pass
+
+    @abstractmethod
+    def _assign_varm(self, varm):
+        pass
+
+    @abstractmethod
+    def _assign_obsp(self, obsp):
+        pass
+
+    @abstractmethod
+    def _assign_varp(self, varp):
+        pass
+
+    @abstractmethod
+    def _assign_layers(self, layers):
+        pass
+
+    @abstractmethod
+    def _run_checks(self, *args):
+        pass
+
+    @abstractmethod
+    def _cleanup(self, *args):
         pass

     @property

From 63aabc54ee33572ee7d9873e3c0bff0b884a6e51 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 7 Mar 2023 13:29:13 +0100
Subject: [PATCH 037/125] (feat): begin reorganizing anndata initialization

---
 anndata/_core/anndata.py      | 136 +++++++++++++++------------------
 anndata/_core/anndata_base.py |  71 +++++++++++------
 2 files changed, 105 insertions(+), 102 deletions(-)

diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py
index 0cfdb8548..74fcbfd90 100644
--- a/anndata/_core/anndata.py
+++ b/anndata/_core/anndata.py
@@ -358,74 +358,44 @@ def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index):
         else:
             self._raw = None

-    def _init_as_actual(
-        self,
-        X=None,
-        obs=None,
-        var=None,
-        uns=None,
-        obsm=None,
-        varm=None,
-        varp=None,
-        obsp=None,
-        raw=None,
-        layers=None,
-        dtype=None,
-        shape=None,
-        filename=None,
-        filemode=None,
+    def _reformat_axes_args_from_X(
+        self, X, obs, var, uns, obsm, varm, obsp, varp, layers, raw
     ):
-        # view attributes
-        self._is_view = False
-        self._adata_ref = None
-        self._oidx = None
-        self._vidx = None
-
-        # ----------------------------------------------------------------------
-        # various ways of initializing the data
-        # ----------------------------------------------------------------------
-
-        # If X is a data frame, we store its indices for verification
         x_indices = []
-
-        # init from file
-        if filename is not None:
-            self.file = AnnDataFileManager(self, filename, filemode)
-        else:
-            self.file = AnnDataFileManager(self, None)
-
-        # init from AnnData
-        if isinstance(X, AnnData):
-            if any((obs, var, uns, obsm, varm, obsp, varp)):
-                raise ValueError(
-                    "If `X` is a dict no further arguments must be provided."
+        # init from AnnData
+        if isinstance(X, AnnData):
+            if any((obs, var, uns, obsm, varm, obsp, varp)):
+                raise ValueError(
+                    "If `X` is a dict no further arguments must be provided."
                 )
+            X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = (
+                X._X,
+                X.obs,
+                X.var,
+                X.uns,
+                X.obsm,
+                X.varm,
+                X.obsp,
+                X.varp,
+                X.layers,
+                X.raw,
+            )
-            X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = (
-                X._X,
-                X.obs,
-                X.var,
-                X.uns,
-                X.obsm,
-                X.varm,
-                X.obsp,
-                X.varp,
-                X.layers,
-                X.raw,
-            )

-        # init from DataFrame
-        elif isinstance(X, pd.DataFrame):
-            # to verify index matching, we wait until obs and var are DataFrames
-            if obs is None:
-                obs = pd.DataFrame(index=X.index)
-            elif not isinstance(X.index, pd.RangeIndex):
-                x_indices.append(("obs", "index", X.index))
-            if var is None:
-                var = pd.DataFrame(index=X.columns)
-            elif not isinstance(X.columns, pd.RangeIndex):
-                x_indices.append(("var", "columns", X.columns))
-            X = ensure_df_homogeneous(X, "X")
-
+        # init from DataFrame
+        elif isinstance(X, pd.DataFrame):
+            # to verify index matching, we wait until obs and var are DataFrames
+            if obs is None:
+                obs = pd.DataFrame(index=X.index)
+            elif not isinstance(X.index, pd.RangeIndex):
+                x_indices.append(("obs", "index", X.index))
+            if var is None:
+                var = pd.DataFrame(index=X.columns)
+            elif not isinstance(X.columns, pd.RangeIndex):
+                x_indices.append(("var", "columns", X.columns))
+            X = ensure_df_homogeneous(X, "X")
+        return (X, obs, var, uns, obsm, varm, obsp, varp, layers, raw, x_indices)
+
+    def _assign_X(self, X, shape, dtype):
         # ----------------------------------------------------------------------
         # actually process the data
         # ----------------------------------------------------------------------
@@ -430,9 +430,18 @@
             X = np.array(X, dtype, copy=False)
             # data matrix and shape
             self._X = X
-            self._n_obs, self._n_vars = self._X.shape
         else:
             self._X = None
+
+    def _initialize_indices(self, shape, obs, var):
+        # ----------------------------------------------------------------------
+        # actually process the data
+        # ----------------------------------------------------------------------
+
+        # check data type of X
+        if self.X is not None:
+            self._n_obs, self._n_vars = self._X.shape
+        else:
             self._n_obs = len([] if obs is None else obs)
             self._n_vars = len([] if var is None else var)
             # check consistency with shape
@@ -478,34 +457,38 @@
             if self._n_vars != shape[1]:
                 raise ValueError("`shape` is inconsistent with `var`")

-        # annotations
+    # annotations
+    def _assign_obs(self, obs):
         self._obs = _gen_dataframe(obs, self._n_obs, ["obs_names", "row_names"])
-        self._var = _gen_dataframe(var, self._n_vars, ["var_names", "col_names"])

-        # now we can verify if indices match!
-        for attr_name, x_name, idx in x_indices:
-            attr = getattr(self, attr_name)
-            if isinstance(attr.index, pd.RangeIndex):
-                attr.index = idx
-            elif not idx.equals(attr.index):
-                raise ValueError(f"Index of {attr_name} must match {x_name} of X.")
+    def _assign_var(self, var):
+        self._var = _gen_dataframe(var, self._n_vars, ["var_names", "col_names"])

-        # unstructured annotations
+    # unstructured annotations
+    def _assign_uns(self, uns):
         self.uns = uns or OrderedDict()

-        # TODO: Think about consequences of making obsm a group in hdf
+    # TODO: Think about consequences of making obsm a group in hdf
+    def _assign_obsm(self, obsm):
         self._obsm = AxisArrays(self, 0, vals=convert_to_dict(obsm))
+
+    def _assign_varm(self, varm):
         self._varm = AxisArrays(self, 1, vals=convert_to_dict(varm))

+    def _assign_obsp(self, obsp):
         self._obsp = PairwiseArrays(self, 0, vals=convert_to_dict(obsp))
+
+    def _assign_varp(self, varp):
         self._varp = PairwiseArrays(self, 1, vals=convert_to_dict(varp))

+    def _run_checks(self):
         # Backwards compat for connectivities matrices in uns["neighbors"]
         _move_adj_mtx({"uns": self._uns, "obsp": self._obsp})

         self._check_dimensions()
         self._check_uniqueness()

+    def _cleanup_raw_and_uns(self, raw, uns):
         if self.filename:
             assert not isinstance(
                 raw, Raw
@@ -522,6 +505,7 @@
         # clean up old formats
         self._clean_up_old_format(uns)

+    def _assign_layers(self, layers):
         # layers
         self._layers = Layers(self, layers)

diff --git a/anndata/_core/anndata_base.py b/anndata/_core/anndata_base.py
index 598a0a800..00c65b151 100644
--- a/anndata/_core/anndata_base.py
+++ b/anndata/_core/anndata_base.py
@@ -1,5 +1,7 @@
 from abc import abstractmethod
 from typing import Tuple
+
+from .file_backing import AnnDataFileManager
 from ..utils import DeprecationMixinMeta


@@ -21,33 +23,33 @@ def _init_as_actual(
         filename=None,
         filemode=None,
     ):
-        (
-            X,
-            obs,
-            var,
-            uns,
-            obsm,
-            varm,
-            obsp,
-            varp,
-            layers,
-            raw,
-        ) = self._reformat_axes_args_from_X(
-            self,
-            X,
-            obs,
-            var,
-            uns,
-            obsm,
-            varm,
-            obsp,
-            varp,
-            layers,
-            raw,
-        )
+        # view attributes
+        self._is_view = False
+        self._adata_ref = None
+        self._oidx = None
+        self._vidx = None
+        if filename is not None:
+            self.file = AnnDataFileManager(self, filename, filemode)
+        else:
+            self.file = AnnDataFileManager(self, None)
+        (
+            X,
+            obs,
+            var,
+            uns,
+            obsm,
+            varm,
+            obsp,
+            varp,
+            layers,
+            raw,
+            x_indices,
+        ) = self._reformat_axes_args_from_X(
+            self, X, obs, var, uns, obsm, varm, obsp, varp, layers, raw
+        )
         self._assign_X(self, X, shape, dtype)

-        self._initialize_indices()
+        self._initialize_indices(shape, obs, var)
         assert self.n_obs == len(
             self.obs_names
         )  # after initializing indices, these should be True
@@ -60,13 +62,22 @@

         self._assign_obs(obs)
         self._assign_var(var)
+        # now we can verify if indices match!
+        for attr_name, x_name, idx in x_indices:
+            attr = getattr(self, attr_name)
+            if isinstance(attr.index, pd.RangeIndex):
+                attr.index = idx
+            elif not idx.equals(attr.index):
+                raise ValueError(f"Index of {attr_name} must match {x_name} of X.")
+
+        self._assign_uns(uns)
         self._assign_obsm(obsm)
         self._assign_varm(varm)
         self._assign_obsp(obsp)
         self._assign_varp(varp)
         self._assign_layers(layers)
         self._run_checks()
-        self._cleanup()
+        self._cleanup_raw_and_uns(raw, uns)

     @abstractmethod
     def _init_as_view(self, *args, **kwargs):
@@ -95,6 +106,14 @@ def _assign_obs(self, obs):
     def _assign_var(self, var):
         pass

+    @abstractmethod
+    def _assign_layers(self, layers):
+        pass
+
+    @abstractmethod
+    def _assign_uns(self, uns):
+        pass
+
     @abstractmethod
     def _assign_obsm(self, obsm):
         pass

From 57760951d1bb59521160fc4f04103684dfa3b633 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 8 Mar 2023 14:48:23 +0100
Subject: [PATCH 038/125] (fix): fix args + checks so that tests run

---
 anndata/_core/anndata_base.py | 13 +++----------
 anndata/_core/raw.py          |  2 +-
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/anndata/_core/anndata_base.py b/anndata/_core/anndata_base.py
index 00c65b151..03faa3469 100644
--- a/anndata/_core/anndata_base.py
+++ b/anndata/_core/anndata_base.py
@@ -1,5 +1,6 @@
 from abc import abstractmethod
 from typing import Tuple
+import pandas as pd

 from .file_backing import AnnDataFileManager
 from ..utils import DeprecationMixinMeta
@@ -45,16 +46,12 @@ def _init_as_actual(
             raw,
             x_indices,
         ) = self._reformat_axes_args_from_X(
-            self, X, obs, var, uns, obsm, varm, obsp, varp, layers, raw
+            X, obs, var, uns, obsm, varm, obsp, varp, layers, raw
         )
-        self._assign_X(self, X, shape, dtype)
+        self._assign_X(X, shape, dtype)

         self._initialize_indices(shape, obs, var)
-        assert self.n_obs == len(
-            self.obs_names
-        )  # after initializing indices, these should be True
         assert self.n_obs == self.shape[0]
-        assert self.n_vars == len(self.var_names)
         assert self.n_vars == self.shape[1]
         if self.X is not None:
             assert self.n_obs == self.X.shape[0]
@@ -94,10 +91,6 @@ def _reformat_axes_args_from_X(self, *args):
     def _initialize_indices(self, *args):
         pass

-    @abstractmethod
-    def _initialize_indices(self, *args):
-        pass
-
     @abstractmethod
     def _assign_obs(self, obs):
         pass
diff --git a/anndata/_core/raw.py b/anndata/_core/raw.py
index 39cee1c4e..0a5dcd1f1 100644
--- a/anndata/_core/raw.py
+++ b/anndata/_core/raw.py
@@ -44,7 +44,7 @@ def _get_X(self, layer=None):

     @property
     def X(self) -> Union[SparseDataset, np.ndarray, sparse.spmatrix]:
-        # TODO: Handle unsorted array of integer indices for h5py.Datasets
+        # w Handle unsorted array of integer indices for h5py.Datasets
         if not self._adata.isbacked:
             return self._X
         if not self._adata.file.is_open:

From 87cf635f39bbb6f651d0aa8d84ed3422a8f07941 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 8 Mar 2023 14:57:39 +0100
Subject: [PATCH 039/125] (fix): check _X for backed

---
 anndata/_core/anndata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py
index 74fcbfd90..25809c67e 100644
--- a/anndata/_core/anndata.py
+++ b/anndata/_core/anndata.py
@@ -439,7 +439,7 @@ def _initialize_indices(self, shape, obs, var):
         # ----------------------------------------------------------------------

         # check data type of X
-        if self.X is not None:
+        if self._X is not None:
             self._n_obs, self._n_vars = self._X.shape
         else:
             self._n_obs = len([] if obs is None else obs)
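[Note, not part of the patch series: patches 036–039 turn `_init_as_actual`
into a template method — the base class fixes the initialization order and
subclasses supply the `_assign_*` hooks. A compact sketch of the pattern,
with illustrative class and hook names only:]

    from abc import ABC, abstractmethod

    class Base(ABC):
        def init(self, x):
            # fixed skeleton: delegate the steps to subclass hooks, in order
            self._assign_x(x)
            self._run_checks()

        @abstractmethod
        def _assign_x(self, x):
            ...

        def _run_checks(self):
            # optional hook with a no-op default
            pass

    class InMemory(Base):
        def _assign_x(self, x):
            self.x = x

    InMemory().init([1, 2, 3])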
From c50d9de08898b551e31d796350979e12960ee7f1 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 8 Mar 2023 14:57:52 +0100
Subject: [PATCH 040/125] (fix): ensure x_indices exists

---
 anndata/_core/anndata_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/anndata/_core/anndata_base.py b/anndata/_core/anndata_base.py
index 03faa3469..a0b051fd6 100644
--- a/anndata/_core/anndata_base.py
+++ b/anndata/_core/anndata_base.py
@@ -29,6 +29,7 @@ def _init_as_actual(
         self._adata_ref = None
         self._oidx = None
         self._vidx = None
+        x_indices = []
         if filename is not None:
             self.file = AnnDataFileManager(self, filename, filemode)
         else:

From e384a19b0e30199066846c2d85c8c3adcdb9d1ac Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 8 Mar 2023 15:39:46 +0100
Subject: [PATCH 041/125] (feat): refactor `_init_as_actual` on remote anndata

---
 .../experimental/read_remote/read_remote.py | 110 ++++--------------
 1 file changed, 24 insertions(+), 86 deletions(-)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 3f874c72f..43aed25c0 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -87,44 +87,7 @@ def __getitem__(self_iloc, idx):


 class AnnDataRemote(AnnData):
-    # TODO's here:
-    # 2. Get a better sparse access pattern
-    # 3. Re-write dataset with better chunking
-    # 5. a `head` method
-
-    def _init_as_actual(
-        self,
-        X=None,
-        obs=None,
-        var=None,
-        uns=None,
-        obsm=None,
-        varm=None,
-        varp=None,
-        obsp=None,
-        raw=None,
-        layers=None,
-        dtype=None,
-        shape=None,
-        filename=None,
-        filemode=None,
-    ):
-        # view attributes
-        self._is_view = False
-        self._adata_ref = None
-        self._oidx = None
-        self._vidx = None
-
-        # ----------------------------------------------------------------------
-        # various ways of initializing the data
-        # ----------------------------------------------------------------------
-
-        # check data type of X
-        if filename is not None:
-            self.file = AnnDataFileManager(self, filename, filemode)
-        else:
-            self.file = AnnDataFileManager(self, None)
-
+    def _assign_X(self, X, shape, dtype):
         if X is not None:
             for s_type in StorageType:
                 if isinstance(X, s_type.value):
@@ -145,8 +108,20 @@ def _init_as_actual(
             self._n_obs, self._n_vars = self._X.shape
         else:
             self._X = None
-            self._n_obs = len([] if obs is None else obs)
-            self._n_vars = len([] if var is None else var)
+
+    def _initialize_indices(self, shape, obs, var):
+        # annotations - need names already for AxisArrays to work.
+        self.obs_names = pd.Index(
+            (obs["index"] if "index" in obs else obs["_index"])[()]
+        )
+        self.var_names = pd.Index(
+            (var["index"] if "index" in var else var["_index"])[()]
+        )
+        if self._X is not None:
+            self._n_obs, self._n_vars = self._X.shape
+        else:
+            self._n_obs = len([] if obs is None else self.obs_names)
+            self._n_vars = len([] if var is None else self.var_names)
         # check consistency with shape
         if shape is not None:
             if self._n_obs == 0:
@@ -160,58 +135,21 @@
             if self._n_vars != shape[1]:
                 raise ValueError("`shape` is inconsistent with `var`")

-        # annotations - need names already for AxisArrays to work.
-        self.obs_names = pd.Index(
-            (obs["index"] if "index" in obs else obs["_index"])[()]
-        )
-        self.var_names = pd.Index(
-            (var["index"] if "index" in var else var["_index"])[()]
-        )
+    # annotations
+    def _assign_obs(self, obs):
         self._obs = AxisArraysRemote(self, 0, vals=convert_to_dict(obs))
-        self._var = AxisArraysRemote(self, 1, vals=convert_to_dict(var))
-
-        # now we can verify if indices match!
-        # for attr_name, x_name, idx in x_indices:
-        #     attr = getattr(self, attr_name)
-        #     if isinstance(attr.index, pd.RangeIndex):
-        #         attr.index = idx
-        #     elif not idx.equals(attr.index):
-        #         raise ValueError(f"Index of {attr_name} must match {x_name} of X.")

-        # unstructured annotations
-        self.uns = uns or OrderedDict()
+    def _assign_var(self, var):
+        self._var = AxisArraysRemote(self, 1, vals=convert_to_dict(var))

-        # TODO: Think about consequences of making obsm a group in hdf
+    def _assign_obsm(self, obsm):
         self._obsm = AxisArraysRemote(self, 0, vals=convert_to_dict(obsm))
-        self._varm = AxisArraysRemote(self, 1, vals=convert_to_dict(varm))
-
-        self._obsp = PairwiseArrays(self, 0, vals=convert_to_dict(obsp))
-        self._varp = PairwiseArrays(self, 1, vals=convert_to_dict(varp))
-
-        # Backwards compat for connectivities matrices in uns["neighbors"]
-        _move_adj_mtx({"uns": self._uns, "obsp": self._obsp})
-
-        # self._check_dimensions()
-        # self._check_uniqueness()

-        if self.filename:
-            assert not isinstance(
-                raw, Raw
-            ), "got raw from other adata but also filename?"
-            if {"raw", "raw.X"} & set(self.file):
-                raw = dict(X=None, **raw)
-        if not raw:
-            self._raw = None
-        elif isinstance(raw, cabc.Mapping):
-            self._raw = Raw(self, **raw)
-        else:  # is a Raw from another AnnData
-            self._raw = Raw(self, raw._X, raw.var, raw.varm)
-
-        # clean up old formats
-        self._clean_up_old_format(uns)
+    def _assign_varm(self, varm):
+        self._varm = AxisArraysRemote(self, 1, vals=convert_to_dict(varm))

-        # layers
-        self._layers = Layers(self, layers)
+    def _run_checks(self):
+        pass  # for now

     def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index):
         if adata_ref.isbacked and adata_ref.is_view:

From 4120ed327f15b94935b0f14c68092046cae53e15 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Wed, 8 Mar 2023 15:59:49 +0100
Subject: [PATCH 042/125] (fix): revert sparse changes.

---
 anndata/_core/sparse_dataset.py                 | 6 +++---
 anndata/_io/specs/methods.py                    | 6 +++---
 anndata/experimental/read_remote/read_remote.py | 3 +++
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py
index e9c01d8f8..8654e8dd7 100644
--- a/anndata/_core/sparse_dataset.py
+++ b/anndata/_core/sparse_dataset.py
@@ -376,9 +376,9 @@ def to_backed(self) -> BackedSparseMatrix:
     def to_memory(self) -> ss.spmatrix:
         format_class = get_memory_class(self.format_str)
         mtx = format_class(self.shape, dtype=self.dtype)
-        mtx.data = self.group["data"]
-        mtx.indices = self.group["indices"]
-        mtx.indptr = self.group["indptr"]
+        mtx.data = self.group["data"][...]
+        mtx.indices = self.group["indices"][...]
+        mtx.indptr = self.group["indptr"][...]
         return mtx

diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py
index ef4766e7c..9584f82d1 100644
--- a/anndata/_io/specs/methods.py
+++ b/anndata/_io/specs/methods.py
@@ -90,7 +90,7 @@ def read_basic(elem, _reader):
     if isinstance(elem, Mapping):
         # Backwards compat sparse arrays
         if "h5sparse_format" in elem.attrs:
-            return SparseDataset(elem).to_backed()
+            return SparseDataset(elem).to_memory()
         return {k: _reader.read_elem(v) for k, v in elem.items()}
     elif isinstance(elem, h5py.Dataset):
         return h5ad.read_dataset(elem)  # TODO: Handle legacy
@@ -110,7 +110,7 @@ def read_basic_zarr(elem, _reader):
     if isinstance(elem, Mapping):
         # Backwards compat sparse arrays
         if "h5sparse_format" in elem.attrs:
-            return SparseDataset(elem).to_backed()
+            return SparseDataset(elem).to_memory()
         return {k: _reader.read_elem(v) for k, v in elem.items()}
     elif isinstance(elem, ZarrArray):
         return zarr.read_dataset(elem)  # TODO: Handle legacy
@@ -497,7 +497,7 @@ def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({}
 @_REGISTRY.register_read(ZarrGroup, IOSpec("csc_matrix", "0.1.0"))
 @_REGISTRY.register_read(ZarrGroup, IOSpec("csr_matrix", "0.1.0"))
 def read_sparse(elem, _reader):
-    return SparseDataset(elem).to_backed()
+    return SparseDataset(elem).to_memory()


 @_REGISTRY.register_read_partial(H5Group, IOSpec("csc_matrix", "0.1.0"))
diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 43aed25c0..f1d7d73c7 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -8,6 +8,7 @@
 from anndata._core.file_backing import AnnDataFileManager
 from anndata._core.index import Index
 from anndata._core.raw import Raw
+from anndata._core.sparse_dataset import SparseDataset
 from anndata._core.views import _resolve_idxs
 from anndata._io.specs.registry import read_elem
 from anndata.compat import _move_adj_mtx, _read_attr
@@ -266,6 +267,8 @@ def callback(func, elem_name: str, elem, iospec):
             return CategoricalZarrArray(elem)
         elif iospec.encoding_type in {"array", "string_array"}:
             return elem
+        elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}:
+            return SparseDataset(elem).to_backed()
         return func(elem)

     adata = read_dispatched(f, callback=callback)
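[Note, not part of the patch series: the revert above hinges on the difference
between holding a handle to the on-disk array (`group["data"]`, lazy) and
materializing it (`group["data"][...]`, a NumPy array). A small zarr-only
demonstration, assuming a local store:]

    import numpy as np
    import zarr

    g = zarr.open_group("demo.zarr", mode="w")
    g.create_dataset("data", data=np.arange(6))

    lazy = g["data"]        # zarr.Array: stays on disk, sliced on demand
    eager = g["data"][...]  # np.ndarray: fully read into memory

    assert isinstance(eager, np.ndarray)
    assert lazy[2:4].tolist() == [2, 3]  # only this slice is read from the store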
From 6d13c82702e37016c6ed2c09b09916f39b51294f Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 9 Mar 2023 15:22:19 +0100
Subject: [PATCH 043/125] (chore): revert erroneous comment change

---
 anndata/_core/raw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anndata/_core/raw.py b/anndata/_core/raw.py
index 0a5dcd1f1..39cee1c4e 100644
--- a/anndata/_core/raw.py
+++ b/anndata/_core/raw.py
@@ -44,7 +44,7 @@ def _get_X(self, layer=None):

     @property
     def X(self) -> Union[SparseDataset, np.ndarray, sparse.spmatrix]:
-        # w Handle unsorted array of integer indices for h5py.Datasets
+        # TODO: Handle unsorted array of integer indices for h5py.Datasets
         if not self._adata.isbacked:
             return self._X
         if not self._adata.file.is_open:

From 7c38b301397107c41fae6b8cfe8a17fd81f42954 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 9 Mar 2023 16:24:35 +0100
Subject: [PATCH 044/125] (feat): consolidate metadata by default

---
 anndata/_io/zarr.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/anndata/_io/zarr.py b/anndata/_io/zarr.py
index d65379e63..f57e12755 100644
--- a/anndata/_io/zarr.py
+++ b/anndata/_io/zarr.py
@@ -48,6 +48,8 @@ def callback(func, s, k, elem, dataset_kwargs, iospec):
             func(s, k, elem, dataset_kwargs=dataset_kwargs)

     write_dispatched(f, "/", adata, callback=callback, dataset_kwargs=ds_kwargs)
+    print(f)
+    zarr.consolidate_metadata(f.store)


 def read_zarr(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData:
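[Note, not part of the patch series: consolidating metadata rolls every
element's `.zattrs`/`.zarray` into a single `.zmetadata` key, so a reader
needs one request instead of one per element — the main win for remote
stores. A minimal sketch with plain zarr, assuming a local path:]

    import zarr

    g = zarr.open_group("adata_demo.zarr", mode="w")
    g.create_dataset("X", data=[[1, 2], [3, 4]])
    zarr.consolidate_metadata(g.store)  # writes the ".zmetadata" key

    # Later (or remotely): one metadata read instead of many
    g2 = zarr.open_consolidated("adata_demo.zarr", mode="r")
    print(g2["X"][...])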
From 515a6d2a7f23bdac158bd3f6974f5a904cdcd4e7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 9 Mar 2023 16:33:59 +0100
Subject: [PATCH 045/125] (feat): begin unit tests

---
 anndata/tests/test_read_remote.py | 79 +++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)
 create mode 100644 anndata/tests/test_read_remote.py

diff --git a/anndata/tests/test_read_remote.py b/anndata/tests/test_read_remote.py
new file mode 100644
index 000000000..56511a0ff
--- /dev/null
+++ b/anndata/tests/test_read_remote.py
@@ -0,0 +1,79 @@
+from pathlib import Path
+import re
+
+import joblib
+import pytest
+import numpy as np
+from scipy import sparse
+
+import anndata as ad
+from anndata.tests.helpers import (
+    as_dense_dask_array,
+    GEN_ADATA_DASK_ARGS,
+    gen_adata,
+    assert_equal,
+    subset_func,
+)
+from anndata.experimental.read_remote import read_remote
+from anndata.utils import asarray
+
+subset_func2 = subset_func
+# -------------------------------------------------------------------------------
+# Some test data
+# -------------------------------------------------------------------------------
+
+
+@pytest.fixture
+def adata():
+    X_list = [
+        [1, 2, 3],
+        [4, 5, 6],
+        [7, 8, 9],
+    ]  # data matrix of shape n_obs x n_vars
+    X = np.array(X_list)
+    obs_dict = dict(  # annotation of observations / rows
+        row_names=["name1", "name2", "name3"],  # row annotation
+        oanno1=["cat1", "cat2", "cat2"],  # categorical annotation
+        oanno2=["o1", "o2", "o3"],  # string annotation
+        oanno3=[2.1, 2.2, 2.3],  # float annotation
+    )
+    var_dict = dict(vanno1=[3.1, 3.2, 3.3])  # annotation of variables / columns
+    uns_dict = dict(  # unstructured annotation
+        oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"]
+    )
+    return ad.AnnData(
+        X,
+        obs=obs_dict,
+        var=var_dict,
+        uns=uns_dict,
+        obsm=dict(o1=np.zeros((X.shape[0], 10))),
+        varm=dict(v1=np.ones((X.shape[1], 20))),
+        layers=dict(float=X.astype(float), sparse=sparse.csr_matrix(X)),
+    )
+
+
+@pytest.fixture(
+    params=[sparse.csr_matrix, sparse.csc_matrix, np.array, as_dense_dask_array],
+    ids=["scipy-csr", "scipy-csc", "np-array", "dask_array"],
+)
+def mtx_format(request):
+    return request.param
+
+
+@pytest.fixture(params=[sparse.csr_matrix, sparse.csc_matrix])
+def sparse_format(request):
+    return request.param
+
+
+def test_read_write_X(tmp_path, mtx_format):
+    base_pth = Path(tmp_path)
+    orig_pth = base_pth / "orig.zarr"
+    # remote_pth = base_pth / "backed.zarr"
+
+    orig = ad.AnnData(mtx_format(asarray(sparse.random(10, 10, format="csr"))))
+    orig.write_zarr(orig_pth)
+
+    remote = read_remote(orig_pth)
+    # remote.write_zarr(remote_pth)  # need to implement writing!
+
+    assert np.all(asarray(orig.X) == asarray(remote.X))

From f4f5c7cdd5c06150acf460db518f928492011173 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 9 Mar 2023 16:49:46 +0100
Subject: [PATCH 046/125] (fix): fix index on `obs.to_df()`

---
 .../experimental/read_remote/read_remote.py | 21 ++++++++++---------
 anndata/tests/test_read_remote.py           | 12 +++++++++++
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index f1d7d73c7..045e3bd83 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -66,16 +66,17 @@ def to_df(self) -> pd.DataFrame:
         """Convert to pandas dataframe."""
         df = pd.DataFrame(index=self.dim_names)
         for key in self.keys():
-            z = self[key]
-            if isinstance(z, zarr.Group) and "codes" in z:  # catrgoricql
-                value = pd.Categorical.from_codes(
-                    codes=read_elem(z["codes"]),
-                    categories=read_elem(z["categories"]),
-                    ordered=bool(_read_attr(z.attrs, "ordered")),
-                )
-            else:
-                value = z[()]
-            df[key] = value
+            if "index" not in key:
+                z = self[key]
+                if isinstance(z, zarr.Group) and "codes" in z:  # catrgoricql
+                    value = pd.Categorical.from_codes(
+                        codes=read_elem(z["codes"]),
+                        categories=read_elem(z["categories"]),
+                        ordered=bool(_read_attr(z.attrs, "ordered")),
+                    )
+                else:
+                    value = z[()]
+                df[key] = value
         return df

     @property
diff --git a/anndata/tests/test_read_remote.py b/anndata/tests/test_read_remote.py
index 56511a0ff..1f8c97dec 100644
--- a/anndata/tests/test_read_remote.py
+++ b/anndata/tests/test_read_remote.py
@@ -77,3 +77,15 @@ def test_read_write_X(tmp_path, mtx_format):
     # remote.write_zarr(remote_pth)  # need to implement writing!

     assert np.all(asarray(orig.X) == asarray(remote.X))
+    assert (orig.obs == remote.obs.to_df()).all().all()
+    assert (orig.var == remote.var.to_df()).all().all()
+
+
+def test_read_write_full(adata, tmp_path):
+    base_pth = Path(tmp_path)
+    orig_pth = base_pth / "orig.zarr"
+    adata.write_zarr(orig_pth)
+    remote = read_remote(orig_pth)
+    assert np.all(asarray(adata.X) == asarray(remote.X))
+    assert (adata.obs == remote.obs.to_df()).all().all()
+    assert (adata.var == remote.var.to_df()).all().all()

From 60fae255818d1e78abcd121d88000810bd1303e6 Mon Sep 17 00:00:00 2001
From: Ilan Gold
Date: Thu, 16 Mar 2023 10:05:29 +0100
Subject: [PATCH 047/125] (feat): swap out `zarr` categorical array for
 `xarray` (#946)

* (feat): swap out `zarr` categorical array for `xarray`
* (chore): simplify `to_df`
* (chore): add xarray dep
* (fix): don't add `index` to df twice
* (chore): add `LazyCategoricalArray` test
* (feat): add full array `__eq__`
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci
* (chore): split tests
* (chore): fully remove unused variables
* (chore): remove auto-generated typing
* (chore): return reading in categories at __init__
* (fix): use `any` not `some`
* (feat): add cache workaround
* [pre-commit.ci] auto fixes from pre-commit.com hooks

  for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 anndata/experimental/read_remote/__init__.py |  2 +-
 .../experimental/read_remote/read_remote.py  | 78 ++++++++++---------
 anndata/tests/test_read_remote.py            | 35 +++++++--
 pyproject.toml                               |  1 +
 4 files changed, 72 insertions(+), 44 deletions(-)

diff --git a/anndata/experimental/read_remote/__init__.py b/anndata/experimental/read_remote/__init__.py
index 1622afcb5..54657205f 100644
--- a/anndata/experimental/read_remote/__init__.py
+++ b/anndata/experimental/read_remote/__init__.py
@@ -1 +1 @@
-from .read_remote import read_remote
+from .read_remote import read_remote, LazyCategoricalArray
diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 045e3bd83..8c5f9ce28 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -2,7 +2,7 @@
 from copy import copy
 from functools import cached_property
 from pathlib import Path
-from typing import Any, MutableMapping, Union, List, Sequence
+from typing import Any, MutableMapping, Union, List, Sequence, Tuple
 from anndata._core.aligned_mapping import Layers, PairwiseArrays
 from anndata._core.anndata import StorageType, _check_2d_shape, _gen_dataframe
 from anndata._core.file_backing import AnnDataFileManager
@@ -17,43 +17,60 @@
 import zarr
 import pandas as pd
 import numpy as np
+from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin

 from ..._core import AnnData, AxisArrays
 from .. import read_dispatched


-# TODO: Do we really need to subclass the Array class here? Ryan Abernathy seems to say "no"
-# but I don't really want to mess with the methods. The downside is that (for some reason), it's
-# reading the `zarray` of the `codes` path which should not have to happen, but I can't figure out a way around it.
-class CategoricalZarrArray(zarr.core.Array):
+class LazyCategoricalArray(ExplicitlyIndexedNDArrayMixin):
+    __slots__ = ("codes", "attrs", "_categories", "_categories_cache")
+
     def __init__(self, group, *args, **kwargs):
-        codes_path = group.path + "/codes"
-        super().__init__(group.store.store, codes_path, *args, **kwargs)
+        """Class for lazily reading categorical data from formatted zarr group
+
+        Args:
+            group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr
+        """
+        self.codes = group["codes"]
         self._categories = group["categories"]
-        self._group_attrs = group.attrs
+        self._categories_cache = None
+        self.attrs = dict(group.attrs)
+
+    @property
+    def categories(self):  # __slots__ and cached_property are incompatible
+        if self._categories_cache is None:
+            self._categories_cache = self._categories[...]
+        return self._categories_cache

-    @cached_property
-    def categories(self):
-        return self._categories[()]
+    @property
+    def dtype(self) -> pd.CategoricalDtype:
+        return pd.CategoricalDtype(self.categories, self.ordered)

-    @cached_property
-    def ordered(self):
-        return bool(_read_attr(self._group_attrs, "ordered"))
+    @property
+    def shape(self) -> Tuple[int, ...]:
+        return self.codes.shape

-    def __array__(self, *args):  # may need to override this, copied for now
-        a = self[...]
-        if args:
-            a = a.astype(args[0])
-        return a
+    @property
+    def ordered(self):
+        return bool(self.attrs["ordered"])

-    def __getitem__(self, selection):
-        result = super().__getitem__(selection)
+    def __getitem__(self, selection) -> pd.Categorical:
+        codes = self.codes.oindex[selection]
+        if codes.shape == ():  # handle 0d case
+            codes = np.array([codes])
         return pd.Categorical.from_codes(
-            codes=result,
+            codes=codes,
             categories=self.categories,
             ordered=self.ordered,
         )

+    def __repr__(self) -> str:
+        return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})"
+
+    def __eq__(self, __o) -> bool:
+        return self[()] == __o
+

 class AxisArraysRemote(AxisArrays):
     def __getattr__(self, __name: str):
@@ -67,16 +84,7 @@ def to_df(self) -> pd.DataFrame:
         df = pd.DataFrame(index=self.dim_names)
         for key in self.keys():
             if "index" not in key:
-                z = self[key]
-                if isinstance(z, zarr.Group) and "codes" in z:  # catrgoricql
-                    value = pd.Categorical.from_codes(
-                        codes=read_elem(z["codes"]),
-                        categories=read_elem(z["categories"]),
-                        ordered=bool(_read_attr(z.attrs, "ordered")),
-                    )
-                else:
-                    value = z[()]
-                df[key] = value
+                df[key] = self[key][()]
         return df

     @property
@@ -181,8 +189,6 @@ def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index):
         # the file is the same as of the reference object
         self.file = adata_ref.file
         # views on attributes of adata_ref
-        obs_sub = adata_ref.obs.iloc[oidx]
-        var_sub = adata_ref.var.iloc[vidx]
         self._obsm = adata_ref.obsm._view(self, (oidx,))
         self._varm = adata_ref.varm._view(self, (vidx,))
         self._layers = adata_ref.layers._view(self, (oidx, vidx))
@@ -190,8 +196,6 @@ def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index):
         self._varp = adata_ref.varp._view(self, vidx)
         # fix categories
         uns = copy(adata_ref._uns)
-        self._remove_unused_categories(adata_ref.obs, obs_sub, uns)
-        self._remove_unused_categories(adata_ref.var, var_sub, uns)
         # set attributes
         self._obs = adata_ref.obs._view(self, (oidx,))
         self._var = adata_ref.var._view(self, (vidx,))
diff --git a/anndata/tests/test_read_remote.py b/anndata/tests/test_read_remote.py
index 1f8c97dec..73d1fbd93 100644
--- a/anndata/tests/test_read_remote.py
+++ b/anndata/tests/test_read_remote.py
@@ -1,20 +1,17 @@
 from pathlib import Path
-import re

-import joblib
 import pytest
 import numpy as np
+import pandas as pd
 from scipy import sparse
+import zarr

 import anndata as ad
 from anndata.tests.helpers import (
     as_dense_dask_array,
-    GEN_ADATA_DASK_ARGS,
-    gen_adata,
-    assert_equal,
     subset_func,
 )
-from anndata.experimental.read_remote import read_remote
+from anndata.experimental.read_remote import read_remote, LazyCategoricalArray
 from anndata.utils import asarray

 subset_func2 = subset_func
@@ -65,6 +62,17 @@ def sparse_format(request):
     return request.param


+@pytest.fixture()
+def categorical_zarr_group(tmp_path_factory):
+    base_path = tmp_path_factory.getbasetemp()
+    z = zarr.open_group(base_path, mode="w")
+    z["codes"] = [0, 1, 0, 1, 1, 2, 2]
+    z["categories"] = ["foo", "bar", "jazz"]
+    z.attrs["ordered"] = False
+    z = zarr.open(base_path)
+    return z
+
+
 def test_read_write_X(tmp_path, mtx_format):
     base_pth = Path(tmp_path)
     orig_pth = base_pth / "orig.zarr"
@@ -89,3 +97,18 @@ def test_read_write_full(adata, tmp_path):
     assert np.all(asarray(adata.X) == asarray(remote.X))
     assert (adata.obs == remote.obs.to_df()).all().all()
     assert (adata.var == remote.var.to_df()).all().all()
+
+
+def test_lazy_categorical_array_properties(categorical_zarr_group):
+    arr = LazyCategoricalArray(categorical_zarr_group)
+    assert len(arr[0:3]) == 3
+    assert type(arr[0:3]) == pd.Categorical
+    assert len(arr[()]) == len(arr)
+    assert type(arr[()]) == pd.Categorical
+
+
+def test_lazy_categorical_array_equality(categorical_zarr_group):
+    arr = LazyCategoricalArray(categorical_zarr_group)
+    assert (arr[0] == "foo").all()
+    assert (arr[3:5] == "bar").all()
+    assert (arr == "foo").any()
diff --git a/pyproject.toml b/pyproject.toml
index 844652641..dc87ef1d1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -62,6 +62,7 @@ dev = [
     # static checking
     "black>=20.8b1",
     "docutils",
+    "xarray>=2023.1.0"
 ]
 doc = [
     "sphinx>=4.4",
From 7d2d39f64b24f39ef51d74e369e8a9e2321303fc Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 21 Mar 2023 12:44:12 +0100
Subject: [PATCH 048/125] (fix): implement `__ne__`

---
 anndata/experimental/read_remote/read_remote.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 8c5f9ce28..270418433 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -68,9 +68,11 @@ def __repr__(self) -> str:
         return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})"

-    def __eq__(self, __o) -> bool:
+    def __eq__(self, __o) -> np.ndarray:
         return self[()] == __o
-
+
+    def __ne__(self, __o) -> np.ndarray:
+        return ~(self == __o)

 class AxisArraysRemote(AxisArrays):
     def __getattr__(self, __name: str):

From 7988c4b1478294699e96e730087351bfd177987f Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 21 Mar 2023 12:44:33 +0100
Subject: [PATCH 049/125] (feat): try not reading in index

---
 anndata/experimental/read_remote/read_remote.py | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 270418433..97408c05c 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -123,12 +123,8 @@ def _assign_X(self, X, shape, dtype):

     def _initialize_indices(self, shape, obs, var):
         # annotations - need names already for AxisArrays to work.
-        self.obs_names = pd.Index(
-            (obs["index"] if "index" in obs else obs["_index"])[()]
-        )
-        self.var_names = pd.Index(
-            (var["index"] if "index" in var else var["_index"])[()]
-        )
+        self.obs_names = obs["index"] if "index" in obs else obs["_index"]
+        self.var_names = var["index"] if "index" in var else var["_index"]
         if self._X is not None:
             self._n_obs, self._n_vars = self._X.shape
         else:

From 70165bb6c17a343a8d0b02ea2b43248d451b4425 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 21 Mar 2023 12:44:46 +0100
Subject: [PATCH 050/125] (fix): `string_array` -> `string-array`

---
 anndata/experimental/read_remote/read_remote.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 97408c05c..fec095c5c 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -268,7 +268,7 @@ def callback(func, elem_name: str, elem, iospec):
             return {k: read_dispatched(v, callback) for k, v in elem.items()}
         elif iospec.encoding_type == "categorical":
             return LazyCategoricalArray(elem)
-        elif iospec.encoding_type in {"array", "string_array"}:
+        elif iospec.encoding_type in {"array", "string-array"}:
             return elem
         elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}:
             return SparseDataset(elem).to_backed()

From 5f19a37269f05b0d39019fed583a881ec2fab6e1 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 21 Mar 2023 11:58:43 +0000
Subject: [PATCH 051/125] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 anndata/experimental/read_remote/read_remote.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index fec095c5c..4ef1fc230 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -70,9 +70,10 @@ def __repr__(self) -> str:

     def __eq__(self, __o) -> np.ndarray:
         return self[()] == __o
-
+
     def __ne__(self, __o) -> np.ndarray:
-        return ~(self == __o)
+        return ~(self == __o)
+

 class AxisArraysRemote(AxisArrays):
     def __getattr__(self, __name: str):

From d159805e62b4dc859c544362b1477ab669d38d1a Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 21 Mar 2023 13:07:09 +0100
Subject: [PATCH 052/125] (chore): remove large comment

---
 anndata/_core/sparse_dataset.py | 156 --------------------------------
 1 file changed, 156 deletions(-)

diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py
index ddfa30c99..206e0e307 100644
--- a/anndata/_core/sparse_dataset.py
+++ b/anndata/_core/sparse_dataset.py
@@ -392,162 +392,6 @@ class CSRDataset(BaseCompressedSparseDataset):
 class CSCDataset(BaseCompressedSparseDataset):
     format_str = "csc"

-
-# class SparseDataset:
-#     """Analogous to :class:`h5py.Dataset `, but for sparse matrices."""
-
-#     def __init__(self, group: h5py.Group):
-#         self.group = group
-
-#     @property
-#     def dtype(self) -> np.dtype:
-#         return self.group["data"].dtype
-
-#     @property
-#     def format_str(self) -> str:
-#         if "h5sparse_format" in self.group.attrs:
-#             return _read_attr(self.group.attrs, "h5sparse_format")
-#         else:
-#             # Should this be an extra field?
-#             return _read_attr(self.group.attrs, "encoding-type").replace("_matrix", "")
-
-#     @property
-#     def h5py_group(self) -> h5py.Group:
-#         warn(
-#             "Attribute `h5py_group` of SparseDatasets is deprecated. "
-#             "Use `group` instead.",
-#             DeprecationWarning,
-#         )
-#         return self.group
-
-#     @property
-#     def name(self) -> str:
-#         return self.group.name
-
-#     @property
-#     def file(self) -> h5py.File:
-#         return self.group.file
-
-#     @property
-#     def shape(self) -> Tuple[int, int]:
-#         shape = self.group.attrs.get("h5sparse_shape")
-#         return tuple(self.group.attrs["shape"] if shape is None else shape)
-
-#     @property
-#     def value(self) -> ss.spmatrix:
-#         return self.to_memory()
-
-#     def __repr__(self) -> str:
-#         return (
-#             f"'
-#         )
-
-#     def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]:
-#         row, col = self._normalize_index(index)
-#         mtx = self.to_backed()
-#         sub = mtx[row, col]
-#         # If indexing is array x array it returns a backed_sparse_matrix
-#         # Not sure what the performance is on that operation
-#         if isinstance(sub, BackedSparseMatrix):
-#             return get_memory_class(self.format_str)(sub)
-#         else:
-#             return sub
-
-#     def __setitem__(self, index: Union[Index, Tuple[()]], value):
-#         row, col = self._normalize_index(index)
-#         mock_matrix = self.to_backed()
-#         mock_matrix[row, col] = value
-
-#     def _normalize_index(
-#         self, index: Union[Index, Tuple[()]]
-#     ) -> Tuple[np.ndarray, np.ndarray]:
-#         if index == ():
-#             index = slice(None)
-#         row, col = unpack_index(index)
-#         if all(isinstance(x, cabc.Iterable) for x in (row, col)):
-#             row, col = np.ix_(row, col)
-#         return row, col
-
-#     def append(self, sparse_matrix: ss.spmatrix):
-#         # Prep variables
-#         shape = self.shape
-#         if isinstance(sparse_matrix, SparseDataset):
-#             sparse_matrix = sparse_matrix.to_backed()
-
-#         # Check input
-#         if not ss.isspmatrix(sparse_matrix):
-#             raise NotImplementedError(
-#                 "Currently, only sparse matrices of equivalent format can be "
-#                 "appended to a SparseDataset."
-#             )
-#         if self.format_str not in {"csr", "csc"}:
-#             raise NotImplementedError(
-#                 f"The append method for format {self.format_str} "
-#                 f"is not implemented."
-#             )
-#         if self.format_str != get_format_str(sparse_matrix):
-#             raise ValueError(
-#                 f"Matrices must have same format. Currently are "
-#                 f"{self.format_str!r} and {get_format_str(sparse_matrix)!r}"
-#             )
-
-#         # shape
-#         if self.format_str == "csr":
-#             assert (
-#                 shape[1] == sparse_matrix.shape[1]
-#             ), "CSR matrices must have same size of dimension 1 to be appended."
-#             new_shape = (shape[0] + sparse_matrix.shape[0], shape[1])
-#         elif self.format_str == "csc":
-#             assert (
-#                 shape[0] == sparse_matrix.shape[0]
-#             ), "CSC matrices must have same size of dimension 0 to be appended."
-#             new_shape = (shape[0], shape[1] + sparse_matrix.shape[1])
-#         else:
-#             assert False, "We forgot to update this branching to a new format"
-#         if "h5sparse_shape" in self.group.attrs:
-#             del self.group.attrs["h5sparse_shape"]
-#         self.group.attrs["shape"] = new_shape
-
-#         # data
-#         data = self.group["data"]
-#         orig_data_size = data.shape[0]
-#         data.resize((orig_data_size + sparse_matrix.data.shape[0],))
-#         data[orig_data_size:] = sparse_matrix.data
-
-#         # indptr
-#         indptr = self.group["indptr"]
-#         orig_data_size = indptr.shape[0]
-#         append_offset = indptr[-1]
-#         indptr.resize((orig_data_size + sparse_matrix.indptr.shape[0] - 1,))
-#         indptr[orig_data_size:] = (
-#             sparse_matrix.indptr[1:].astype(np.int64) + append_offset
-#         )
-
-#         # indices
-#         indices = self.group["indices"]
-#         orig_data_size = indices.shape[0]
-#         indices.resize((orig_data_size + sparse_matrix.indices.shape[0],))
-#         indices[orig_data_size:] = sparse_matrix.indices
-
-#     def _to_backed(self) -> BackedSparseMatrix:
-#         format_class = get_backed_class(self.format_str)
-#         mtx = format_class(self.shape, dtype=self.dtype)
-#         mtx.data = self.group["data"]
-#         mtx.indices = self.group["indices"]
-#         mtx.indptr = self.group["indptr"][:]
-#         return mtx
-
-#     def to_memory(self) -> ss.spmatrix:
-#         format_class = get_memory_class(self.format_str)
-#         mtx = format_class(self.shape, dtype=self.dtype)
-#         mtx.data = self.group["data"][...]
-#         mtx.indices = self.group["indices"][...]
-#         mtx.indptr = self.group["indptr"][...]
-#         return mtx
-
-
 def sparse_dataset(group) -> BaseCompressedSparseDataset:
     # encoding_type = _read_attr(group, "encoding-type")
     encoding_type = _get_group_format(group)

From ff2fdfaf8ef2e5a6837907ecba0d1fc8b7fdeecf Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 21 Mar 2023 13:08:58 +0100
Subject: [PATCH 053/125] (style): `_to_backed` -> `to_backed`

---
 anndata/_core/sparse_dataset.py | 6 +++---
 anndata/_io/specs/methods.py    | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py
index 206e0e307..112d154cc 100644
--- a/anndata/_core/sparse_dataset.py
+++ b/anndata/_core/sparse_dataset.py
@@ -281,7 +281,7 @@ def __repr__(self) -> str:

     def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]:
         row, col = self._normalize_index(index)
-        mtx = self._to_backed()
+        mtx = self.to_backed()
         sub = mtx[row, col]
         # If indexing is array x array it returns a backed_sparse_matrix
         # Not sure what the performance is on that operation
@@ -311,7 +311,7 @@ def append(self, sparse_matrix: ss.spmatrix):
         # Prep variables
         shape = self.shape
         if isinstance(sparse_matrix, BaseCompressedSparseDataset):
-            sparse_matrix = sparse_matrix._to_backed()
+            sparse_matrix = sparse_matrix.to_backed()

         # Check input
         if not ss.isspmatrix(sparse_matrix):
@@ -368,7 +368,7 @@ def append(self, sparse_matrix: ss.spmatrix):
         indices.resize((orig_data_size + sparse_matrix.indices.shape[0],))
         indices[orig_data_size:] = sparse_matrix.indices

-    def _to_backed(self) -> BackedSparseMatrix:
+    def to_backed(self) -> BackedSparseMatrix:
         format_class = get_backed_class(self.format_str)
         mtx = format_class(self.shape, dtype=self.dtype)
         mtx.data = self.group["data"]
diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py
index 808bece83..ab37035cb 100644
--- a/anndata/_io/specs/methods.py
+++ b/anndata/_io/specs/methods.py
@@ -471,7 +471,7 @@ def write_sparse_compressed(
 @_REGISTRY.register_write(ZarrGroup, CSCDataset, IOSpec("", "0.1.0"))
 def write_sparse_dataset(f, k, elem, dataset_kwargs=MappingProxyType({})):
     write_sparse_compressed(
-        f, k, elem._to_backed(), fmt=elem.format_str, dataset_kwargs=dataset_kwargs
+        f, k, elem.to_backed(), fmt=elem.format_str, dataset_kwargs=dataset_kwargs
     )
     # TODO: Cleaner way to do this
     f[k].attrs["encoding-type"] = f"{elem.format_str}_matrix"
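[Note, not part of the patch series: after patch 053 the backed matrix is
part of the wrapper's public surface. A usage sketch against this series'
API — names as defined in the diffs above, treat it as illustrative:]

    # Assumes a zarr group already written by anndata with a sparse X,
    # e.g. via adata.write_zarr("adata.zarr").
    import zarr
    from anndata._core.sparse_dataset import sparse_dataset

    group = zarr.open("adata.zarr", mode="r")["X"]
    ds = sparse_dataset(group)

    backed = ds.to_backed()  # data/indices stay on disk, sliced on demand
    row = ds[0, :]           # an in-memory scipy matrix for just this row
    full = ds.to_memory()    # materialize the whole matrix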
From 01089fcaa2ce339cb6d20a508d19304b9ecb113e Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 21 Mar 2023 13:12:43 +0100
Subject: [PATCH 054/125] (fix): revert backed test

---
 anndata/tests/test_backed_sparse.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py
index a4f95080a..2f8e2eab1 100644
--- a/anndata/tests/test_backed_sparse.py
+++ b/anndata/tests/test_backed_sparse.py
@@ -146,4 +146,4 @@ def test_wrong_formats(tmp_path):
     post_checks = disk_mtx.to_memory()

     # Check nothing changed
-    assert not np.any((pre_checks != post_checks).toarray())
+    assert not np.any((pre_checks != post_checks).toarray())
\ No newline at end of file

From 28f0218e52d59e9dc4a0596bdcd3ef3e618543e7 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 21 Mar 2023 13:53:09 +0100
Subject: [PATCH 055/125] (fix): add basic zarr backed reading for test

---
 anndata/tests/test_backed_sparse.py | 40 +++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 5 deletions(-)

diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py
index 2f8e2eab1..cc3ba2608 100644
--- a/anndata/tests/test_backed_sparse.py
+++ b/anndata/tests/test_backed_sparse.py
@@ -4,8 +4,13 @@
 from scipy import sparse

 import anndata as ad
+from anndata._core.anndata import AnnData
 from anndata._core.sparse_dataset import sparse_dataset
+from anndata._io.zarr import read_dataframe
 from anndata.tests.helpers import assert_equal, subset_func
+from anndata.experimental import read_dispatched
+
+import zarr

 subset_func2 = subset_func
@@ -21,7 +26,6 @@ def ondisk_equivalent_adata(tmp_path, diskfmt):
     csc_path = tmp_path / f"csc.{diskfmt}"
     dense_path = tmp_path / f"dense.{diskfmt}"

-    read = lambda x, **kwargs: getattr(ad, f"read_{diskfmt}")(x, **kwargs)
     write = lambda x, pth, **kwargs: getattr(x, f"write_{diskfmt}")(pth, **kwargs)

     csr_mem = ad.AnnData(X=sparse.random(50, 50, format="csr", density=0.1))
@@ -32,10 +36,36 @@
     write(csc_mem, csc_path)
     # write(csr_mem, dense_path, as_dense="X")
     write(dense_mem, dense_path)
-
-    csr_disk = read(csr_path, backed="r")
-    csc_disk = read(csc_path, backed="r")
-    dense_disk = read(dense_path, backed="r")
+    if diskfmt == "h5ad":
+        csr_disk = ad.read_h5ad(csr_path, backed="r")
+        csc_disk = ad.read_h5ad(csc_path, backed="r")
+        dense_disk = ad.read_h5ad(dense_path, backed="r")
+    else:
+
+        def read_zarr_backed(path):
+            path = str(path)
+
+            f = zarr.open(path, mode="r")
+
+            # Read with handling for backwards compat
+            def callback(func, elem_name, elem, iospec):
+                if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
+                    return AnnData(
+                        **{
+                            k: read_dispatched(v, callback)
+                            for k, v in elem.items()
+                        }
+                    )
+                if iospec.encoding_type in {"csc_matrix", "csr_matrix"}:
+                    return sparse_dataset(elem).to_backed()
+                return func(elem)
+
+            adata = read_dispatched(f, callback=callback)
+
+            return adata
+
+        csr_disk = read_zarr_backed(csr_path)
+        csc_disk = read_zarr_backed(csc_path)
+        dense_disk = read_zarr_backed(dense_path)

     return csr_mem, csr_disk, csc_disk, dense_disk

From c552a73237f637d827697c25601c707b6905c8ff Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Thu, 23 Mar 2023 17:32:14 +0100
Subject: [PATCH 056/125] (feat): add support for non-consolidated stores

---
 .../experimental/read_remote/read_remote.py | 23 ++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py
index 4ef1fc230..7f00fa1ed 100644
--- a/anndata/experimental/read_remote/read_remote.py
+++ b/anndata/experimental/read_remote/read_remote.py
@@ -255,18 +255,35 @@ def read_remote(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData:
     if isinstance(store, Path):
         store = str(store)

-    f = zarr.open_consolidated(store, mode="r")
+    is_consolidated = True
+    try:
+        f = zarr.open_consolidated(store, mode="r")
+    except KeyError:
+        is_consolidated = False
+        f = zarr.open(store, mode="r")

     def callback(func, elem_name: str, elem, iospec):
         if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
+            cols = ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw"]
+            iter_object = (
+                elem.items()
+                if is_consolidated
+                else [(k, elem[k]) for k in cols if k in elem]
+            )
             return AnnDataRemote(
-                **{k: read_dispatched(v, callback) for k, v in elem.items()}
+                **{k: read_dispatched(v, callback) for k, v in iter_object}
             )
         elif elem_name.startswith("/raw"):
            return None
        elif elem_name in {"/obs", "/var"}:
            # override to only return AxisArray that will be accessed specially via our special AnnData object
-            return {k: read_dispatched(v, callback) for k, v in elem.items()}
+            iter_object = (
+                elem.items()
+                if is_consolidated
+                else [(k, elem[k]) for k in elem.attrs["column-order"]]
+                + [(elem.attrs["_index"], elem[elem.attrs["_index"]])]
+            )
+            return {k: read_dispatched(v, callback) for k, v in iter_object}
         elif iospec.encoding_type == "categorical":
             return LazyCategoricalArray(elem)
         elif iospec.encoding_type in {"array", "string-array"}:
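[Note, not part of the patch series: patch 056's fallback, collected into one
reusable helper — a sketch assuming zarr's v2 behaviour of raising `KeyError`
when the ".zmetadata" key is absent, which is exactly what the patch catches:]

    import zarr

    def open_any(store):
        # Prefer the single-request consolidated read; fall back for stores
        # written without consolidated metadata (pre-patch-044 outputs).
        try:
            return zarr.open_consolidated(store, mode="r"), True
        except KeyError:
            return zarr.open(store, mode="r"), False

    g, is_consolidated = open_any("adata_demo.zarr")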
+ return self._view_class(self, parent, subset_idx) + return self @deprecated("dict(obj)") def as_dict(self) -> dict: diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 3a3206aab..236284d8a 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -22,13 +22,13 @@ from pandas.api.types import infer_dtype, is_string_dtype, is_categorical_dtype from scipy import sparse from scipy.sparse import issparse, csr_matrix +from anndata._core.anndata_base import AbstractAnnData from anndata._warnings import ImplicitModificationWarning from .raw import Raw from .index import _normalize_indices, _subset, Index, Index1D, get_vector from .file_backing import AnnDataFileManager, to_memory from .access import ElementRef -from .anndata_base import AbstractAnnData from .aligned_mapping import ( AxisArrays, AxisArraysView, @@ -279,7 +279,7 @@ def __init__( vidx: Index1D = None, ): if asview: - if not issubclass(type(X), AnnData): + if not issubclass(type(X), AbstractAnnData): raise ValueError("`X` has to be an AnnData object.") self._init_as_view(X, oidx, vidx) else: @@ -358,44 +358,74 @@ def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index): else: self._raw = None - def _reformat_axes_args_from_X( - self, X, obs, var, uns, obsm, varm, obsp, varp, layers, raw + def _init_as_actual( + self, + X=None, + obs=None, + var=None, + uns=None, + obsm=None, + varm=None, + varp=None, + obsp=None, + raw=None, + layers=None, + dtype=None, + shape=None, + filename=None, + filemode=None, ): + # view attributes + self._is_view = False + self._adata_ref = None + self._oidx = None + self._vidx = None + + # ---------------------------------------------------------------------- + # various ways of initializing the data + # ---------------------------------------------------------------------- + + # If X is a data frame, we store its indices for verification x_indices = [] - # init from AnnData - if isinstance(X, AnnData): - if any((obs, var, uns, obsm, varm, obsp, varp)): - raise ValueError( - "If `X` is a dict no further arguments must be provided." + + # init from file + if filename is not None: + self.file = AnnDataFileManager(self, filename, filemode) + else: + self.file = AnnDataFileManager(self, None) + + # init from AnnData + if issubclass(type(X), AbstractAnnData): + if any((obs, var, uns, obsm, varm, obsp, varp)): + raise ValueError( + "If `X` is a dict no further arguments must be provided." 
+ ) + X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( + X._X, + X.obs, + X.var, + X.uns, + X.obsm, + X.varm, + X.obsp, + X.varp, + X.layers, + X.raw, ) - X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( - X._X, - X.obs, - X.var, - X.uns, - X.obsm, - X.varm, - X.obsp, - X.varp, - X.layers, - X.raw, - ) - # init from DataFrame - elif isinstance(X, pd.DataFrame): - # to verify index matching, we wait until obs and var are DataFrames - if obs is None: - obs = pd.DataFrame(index=X.index) - elif not isinstance(X.index, pd.RangeIndex): - x_indices.append(("obs", "index", X.index)) - if var is None: - var = pd.DataFrame(index=X.columns) - elif not isinstance(X.columns, pd.RangeIndex): - x_indices.append(("var", "columns", X.columns)) - X = ensure_df_homogeneous(X, "X") - return (X, obs, var, uns, obsm, varm, obsp, varp, layers, raw, x_indices) - - def _assign_X(self, X, shape, dtype): + # init from DataFrame + elif isinstance(X, pd.DataFrame): + # to verify index matching, we wait until obs and var are DataFrames + if obs is None: + obs = pd.DataFrame(index=X.index) + elif not isinstance(X.index, pd.RangeIndex): + x_indices.append(("obs", "index", X.index)) + if var is None: + var = pd.DataFrame(index=X.columns) + elif not isinstance(X.columns, pd.RangeIndex): + x_indices.append(("var", "columns", X.columns)) + X = ensure_df_homogeneous(X, "X") + # ---------------------------------------------------------------------- # actually process the data # ---------------------------------------------------------------------- @@ -430,18 +460,9 @@ def _assign_X(self, X, shape, dtype): X = np.array(X, dtype, copy=False) # data matrix and shape self._X = X - else: - self._X = None - - def _initialize_indices(self, shape, obs, var): - # ---------------------------------------------------------------------- - # actually process the data - # ---------------------------------------------------------------------- - - # check data type of X - if self._X is not None: self._n_obs, self._n_vars = self._X.shape else: + self._X = None self._n_obs = len([] if obs is None else obs) self._n_vars = len([] if var is None else var) # check consistency with shape @@ -457,38 +478,34 @@ def _initialize_indices(self, shape, obs, var): if self._n_vars != shape[1]: raise ValueError("`shape` is inconsistent with `var`") - # annotations - def _assign_obs(self, obs): + # annotations self._obs = _gen_dataframe(obs, self._n_obs, ["obs_names", "row_names"]) - - def _assign_var(self, var): self._var = _gen_dataframe(var, self._n_vars, ["var_names", "col_names"]) - # unstructured annotations - def _assign_uns(self, uns): + # now we can verify if indices match! 
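        # (x_indices is only populated above when X arrived as a DataFrame;
        # each entry is an (attr_name, X-attribute name, pd.Index) triple
        # pairing the DataFrame's index/columns with the freshly built
        # obs/var, which the loop below checks for agreement.)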
+ for attr_name, x_name, idx in x_indices: + attr = getattr(self, attr_name) + if isinstance(attr.index, pd.RangeIndex): + attr.index = idx + elif not idx.equals(attr.index): + raise ValueError(f"Index of {attr_name} must match {x_name} of X.") + + # unstructured annotations self.uns = uns or OrderedDict() - # TODO: Think about consequences of making obsm a group in hdf - def _assign_obsm(self, obsm): + # TODO: Think about consequences of making obsm a group in hdf self._obsm = AxisArrays(self, 0, vals=convert_to_dict(obsm)) - - def _assign_varm(self, varm): self._varm = AxisArrays(self, 1, vals=convert_to_dict(varm)) - def _assign_obsp(self, obsp): self._obsp = PairwiseArrays(self, 0, vals=convert_to_dict(obsp)) - - def _assign_varp(self, varp): self._varp = PairwiseArrays(self, 1, vals=convert_to_dict(varp)) - def _run_checks(self): # Backwards compat for connectivities matrices in uns["neighbors"] _move_adj_mtx({"uns": self._uns, "obsp": self._obsp}) self._check_dimensions() self._check_uniqueness() - def _cleanup_raw_and_uns(self, raw, uns): if self.filename: assert not isinstance( raw, Raw @@ -505,7 +522,6 @@ def _cleanup_raw_and_uns(self, raw, uns): # clean up old formats self._clean_up_old_format(uns) - def _assign_layers(self, layers): # layers self._layers = Layers(self, layers) @@ -566,6 +582,11 @@ def __eq__(self, other): "instead compare the desired attributes." ) + @property + def shape(self) -> Tuple[int, int]: + """Shape of data matrix (:attr:`n_obs`, :attr:`n_vars`).""" + return self.n_obs, self.n_vars + @property def X(self) -> Optional[Union[np.ndarray, sparse.spmatrix, ArrayView]]: """Data matrix of shape :attr:`n_obs` × :attr:`n_vars`.""" diff --git a/anndata/_core/anndata_base.py b/anndata/_core/anndata_base.py index bbf7b5a86..24d58ca6e 100644 --- a/anndata/_core/anndata_base.py +++ b/anndata/_core/anndata_base.py @@ -7,173 +7,84 @@ class AbstractAnnData(metaclass=DeprecationMixinMeta): - def _init_as_actual( - self, - X=None, - obs=None, - var=None, - uns=None, - obsm=None, - varm=None, - varp=None, - obsp=None, - raw=None, - layers=None, - dtype=None, - shape=None, - filename=None, - filemode=None, - ): - # view attributes - self._is_view = False - self._adata_ref = None - self._oidx = None - self._vidx = None - x_indices = [] - if filename is not None: - self.file = AnnDataFileManager(self, filename, filemode) - else: - self.file = AnnDataFileManager(self, None) - ( - X, - obs, - var, - uns, - obsm, - varm, - obsp, - varp, - layers, - raw, - x_indices, - ) = self._reformat_axes_args_from_X( - X, obs, var, uns, obsm, varm, obsp, varp, layers, raw - ) - self._assign_X(X, shape, dtype) - - self._initialize_indices(shape, obs, var) - assert self.n_obs == self.shape[0] - assert self.n_vars == self.shape[1] - if self.X is not None: - assert self.n_obs == self.X.shape[0] - assert self.n_vars == self.X.shape[1] - - self._assign_obs(obs) - self._assign_var(var) - # now we can verify if indices match! 
- for attr_name, x_name, idx in x_indices: - attr = getattr(self, attr_name) - if isinstance(attr.index, pd.RangeIndex): - attr.index = idx - elif not idx.equals(attr.index): - raise ValueError(f"Index of {attr_name} must match {x_name} of X.") - - self._assign_uns(uns) - self._assign_obsm(obsm) - self._assign_varm(varm) - self._assign_obsp(obsp) - self._assign_varp(varp) - self._assign_layers(layers) - self._run_checks() - self._cleanup_raw_and_uns(raw, uns) - - @abstractmethod - def _init_as_view(self, *args, **kwargs): - pass - - @abstractmethod - def _assign_X(self, X, shape, dtype): - pass - - def _reformat_axes_args_from_X(self, *args): - return args - - @abstractmethod - def _initialize_indices(self, *args): - pass - - @abstractmethod - def _assign_obs(self, obs): - pass - - @abstractmethod - def _assign_var(self, var): - pass - - @abstractmethod - def _assign_layers(self, layers): - pass - + @property @abstractmethod - def _assign_uns(self, uns): + def X(self): pass + @X.setter @abstractmethod - def _assign_obsm(self, obsm): + def X(self, X): pass + @property @abstractmethod - def _assign_varm(self, varm): + def obs(self): pass + @obs.setter @abstractmethod - def _assign_obsp(self, obsp): + def obs(self, obs): pass + @property @abstractmethod - def _assign_varp(self, varp): + def obsm(self): pass + @obsm.setter @abstractmethod - def _assign_layers(self, layers): + def obsm(self, obsm): pass + @property @abstractmethod - def _run_checks(self, *args): + def obsp(self): pass + @obsp.setter @abstractmethod - def _cleanup(self, *args): + def obsp(self, obsp): pass @property @abstractmethod - def X(self): + def var(self): pass - @property + @var.setter @abstractmethod - def obs(self): + def var(self, var): pass @property @abstractmethod - def obsm(self): + def uns(self): pass - @property + @uns.setter @abstractmethod - def obsp(self): + def uns(self, uns): pass @property @abstractmethod - def var(self): + def varm(self): pass - @property + @varm.setter @abstractmethod - def uns(self): + def varm(self, varm): pass @property @abstractmethod - def varm(self): + def varp(self): pass - @property + @varp.setter @abstractmethod - def varp(self): + def varp(self, varp): pass @property diff --git a/anndata/_core/raw.py b/anndata/_core/raw.py index 39cee1c4e..c02306791 100644 --- a/anndata/_core/raw.py +++ b/anndata/_core/raw.py @@ -181,6 +181,7 @@ class _RawViewHack: def __init__(self, raw: Raw, vidx: Union[slice, np.ndarray]): self.parent_raw = raw self.vidx = vidx + self.is_view = True @property def shape(self) -> Tuple[int, int]: diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 7f00fa1ed..d7d5034ec 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -2,14 +2,26 @@ from copy import copy from functools import cached_property from pathlib import Path -from typing import Any, MutableMapping, Union, List, Sequence, Tuple +from typing import ( + Any, + Iterable, + Mapping, + MutableMapping, + Optional, + Union, + List, + Sequence, + Tuple, +) +from anndata._core.access import ElementRef from anndata._core.aligned_mapping import Layers, PairwiseArrays from anndata._core.anndata import StorageType, _check_2d_shape, _gen_dataframe +from anndata._core.anndata_base import AbstractAnnData from anndata._core.file_backing import AnnDataFileManager -from anndata._core.index import Index +from anndata._core.index import Index, _normalize_indices, _subset from anndata._core.raw 
import Raw from anndata._core.sparse_dataset import SparseDataset -from anndata._core.views import _resolve_idxs +from anndata._core.views import _resolve_idxs, as_view from anndata._io.specs.registry import read_elem from anndata.compat import _move_adj_mtx, _read_attr from anndata.utils import convert_to_dict @@ -99,8 +111,51 @@ def __getitem__(self_iloc, idx): return IlocDispatch() -class AnnDataRemote(AnnData): - def _assign_X(self, X, shape, dtype): +class AnnDataRemote(AbstractAnnData): + def __init__( + self, + X=None, + obs=None, + var=None, + uns=None, + obsm=None, + varm=None, + layers=None, + raw=None, + dtype=None, + shape=None, + filename=None, + filemode=None, + *, + obsp=None, + varp=None, + oidx=None, + vidx=None, + ): + self._oidx = oidx + self._vidx = vidx + self._is_view = False + if oidx is not None and vidx is not None: # and or or? + self._is_view = True # hack needed for clean use of views below + # init from AnnData + if issubclass(type(X), AbstractAnnData): + if any((obs, var, uns, obsm, varm, obsp, varp)): + raise ValueError( + "If `X` is a dict no further arguments must be provided." + ) + X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( + X._X, + X.obs, + X.var, + X.uns, + X.obsm, + X.varm, + X.obsp, + X.varp, + X.layers, + X.raw, + ) + if X is not None: for s_type in StorageType: if isinstance(X, s_type.value): @@ -118,121 +173,51 @@ def _assign_X(self, X, shape, dtype): X = X.astype(dtype) # data matrix and shape self._X = X - self._n_obs, self._n_vars = self._X.shape else: self._X = None - def _initialize_indices(self, shape, obs, var): # annotations - need names already for AxisArrays to work. self.obs_names = obs["index"] if "index" in obs else obs["_index"] + if oidx is not None: + self.obs_names = self.obs_names[oidx] self.var_names = var["index"] if "index" in var else var["_index"] - if self._X is not None: - self._n_obs, self._n_vars = self._X.shape - else: - self._n_obs = len([] if obs is None else self.obs_names) - self._n_vars = len([] if var is None else self.var_names) - # check consistency with shape - if shape is not None: - if self._n_obs == 0: - self._n_obs = shape[0] - else: - if self._n_obs != shape[0]: - raise ValueError("`shape` is inconsistent with `obs`") - if self._n_vars == 0: - self._n_vars = shape[1] - else: - if self._n_vars != shape[1]: - raise ValueError("`shape` is inconsistent with `var`") - - # annotations - def _assign_obs(self, obs): - self._obs = AxisArraysRemote(self, 0, vals=convert_to_dict(obs)) - - def _assign_var(self, var): - self._var = AxisArraysRemote(self, 1, vals=convert_to_dict(var)) - - def _assign_obsm(self, obsm): - self._obsm = AxisArraysRemote(self, 0, vals=convert_to_dict(obsm)) - - def _assign_varm(self, varm): - self._varm = AxisArraysRemote(self, 1, vals=convert_to_dict(varm)) - - def _run_checks(self): - pass # for now - - def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index): - if adata_ref.isbacked and adata_ref.is_view: - raise ValueError( - "Currently, you cannot index repeatedly into a backed AnnData, " - "that is, you cannot make a view of a view." 
- ) - self._is_view = True - if isinstance(oidx, (int, np.integer)): - if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs): - raise IndexError(f"Observation index `{oidx}` is out of range.") - oidx += adata_ref.n_obs * (oidx < 0) - oidx = slice(oidx, oidx + 1, 1) - if isinstance(vidx, (int, np.integer)): - if not (-adata_ref.n_vars <= vidx < adata_ref.n_vars): - raise IndexError(f"Variable index `{vidx}` is out of range.") - vidx += adata_ref.n_vars * (vidx < 0) - vidx = slice(vidx, vidx + 1, 1) - if adata_ref.is_view: - prev_oidx, prev_vidx = adata_ref._oidx, adata_ref._vidx - adata_ref = adata_ref._adata_ref - oidx, vidx = _resolve_idxs((prev_oidx, prev_vidx), (oidx, vidx), adata_ref) - # self._adata_ref is never a view - self._adata_ref = adata_ref - self._oidx = oidx - self._vidx = vidx - # the file is the same as of the reference object - self.file = adata_ref.file - # views on attributes of adata_ref - self._obsm = adata_ref.obsm._view(self, (oidx,)) - self._varm = adata_ref.varm._view(self, (vidx,)) - self._layers = adata_ref.layers._view(self, (oidx, vidx)) - self._obsp = adata_ref.obsp._view(self, oidx) - self._varp = adata_ref.varp._view(self, vidx) - # fix categories - uns = copy(adata_ref._uns) - # set attributes - self._obs = adata_ref.obs._view(self, (oidx,)) - self._var = adata_ref.var._view(self, (vidx,)) - self._uns = uns - self._n_obs = len( - self.obs["index"] if "index" in self.obs else self.obs["_index"] + if vidx is not None: + self.var_names = self.var_names[vidx] + + adata_ref = self + if self._is_view: + adata_ref = X # seems to work + self.obs = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obs))._view( + self, (oidx,) ) - self._n_vars = len( - self.var["index"] if "index" in self.var else self.var["_index"] + self.var = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(var))._view( + self, (vidx,) ) - # set data - if self.isbacked: - self._X = None + self.obsm = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obsm))._view( + self, (oidx,) + ) + self.varm = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(varm))._view( + self, (vidx,) + ) - # set raw, easy, as it’s immutable anyways... - if adata_ref._raw is not None: - # slicing along variables axis is ignored - self._raw = adata_ref.raw[oidx] - self._raw._adata = self - else: - self._raw = None - - # TODO: this is not quite complete in the original but also here, what do we do about this? - def __delitem__(self, index: Index): - obs, var = self._normalize_indices(index) - # TODO: does this really work? 
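# Sketch of the slicing path that replaces the deleted __delitem__ above:
# __getitem__ (just below) funnels any supported index through
# _normalize_indices and returns a lazy view. The store path and column
# name here are hypothetical.
from anndata.experimental.read_remote import read_remote

remote = read_remote("adata.zarr")       # hypothetical zarr store
mask = remote.obs["cell_type"] == "B"    # plain boolean mask over obs
view = remote[mask, :]                   # X/layers are not materialized here
assert view.is_view and view.n_obs <= remote.n_obs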
- if not self.isbacked: - del self._X[obs, var] - else: - X = self.file["X"] - del X[obs, var] - self._set_backed("X", X) + self.obsp = PairwiseArrays(adata_ref, 0, vals=convert_to_dict(obsp))._view( + self, oidx + ) + self.varp = PairwiseArrays(adata_ref, 1, vals=convert_to_dict(varp))._view( + self, vidx + ) + + self.layers = Layers(layers)._view(self, (oidx, vidx)) + self.uns = uns or OrderedDict() def __getitem__(self, index: Index) -> "AnnData": """Returns a sliced view of the object.""" oidx, vidx = self._normalize_indices(index) - return AnnDataRemote(self, oidx=oidx, vidx=vidx, asview=True) + return AnnDataRemote(self, oidx=oidx, vidx=vidx) + + def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]: + return _normalize_indices(index, self.obs_names, self.var_names) @property def obs_names(self) -> pd.Index: @@ -250,6 +235,130 @@ def obs_names(self, names: Sequence[str]): def var_names(self, names: Sequence[str]): self._var_names = names + @property + def X(self): + if hasattr(self, f"_X"): + if self.is_view: + return self._X[self._oidx, self._vidx] + return self._X + return None + + @X.setter + def X(self, X): + self._X = X + + @property + def obs(self): + if hasattr(self, f"_obs"): + return self._obs + return None + + @obs.setter + def obs(self, obs): + self._obs = obs + + @property + def obsm(self): + if hasattr(self, f"_obsm"): + return self._obsm + return None + + @obsm.setter + def obsm(self, obsm): + self._obsm = obsm + + @property + def obsp(self): + if hasattr(self, f"_obsp"): + return self._obsp + return None + + @obsp.setter + def obsp(self, obsp): + self._obsp = obsp + + @property + def var(self): + if hasattr(self, f"_var"): + return self._var + return None + + @var.setter + def var(self, var): + self._var = var + + @property + def uns(self): + if hasattr(self, f"_uns"): + return self._uns + return None + + @uns.setter + def uns(self, uns): + self._uns = uns + + @property + def varm(self): + if hasattr(self, f"_varm"): + return self._varm + return None + + @varm.setter + def varm(self, varm): + self._varm = varm + + @property + def varp(self): + if hasattr(self, f"_varp"): + return self._varp + return None + + @varp.setter + def varp(self, varp): + self._varp = varp + + @property + def raw(self): + if hasattr(self, f"_raw"): + return self._raw + return None + + @raw.setter + def raw(self, raw): + self._raw = raw + + @property + def is_view(self) -> bool: + """`True` if object is view of another AnnData object, `False` otherwise.""" + return self._is_view + + @property + def n_vars(self) -> int: + """Number of variables/features.""" + return len(self.var_names) + + @property + def n_obs(self) -> int: + """Number of observations.""" + return len(self.obs_names) + + def __repr__(self): + descr = f"AnnData object with n_obs × n_vars = {self.n_obs} × {self.n_vars}" + for attr in [ + "obs", + "var", + "uns", + "obsm", + "varm", + "layers", + "obsp", + "varp", + ]: + keys = getattr(self, attr).keys() + if len(keys) > 0: + descr += f"\n {attr}: {str(list(keys))[1:-1]}" + return descr + def read_remote(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: if isinstance(store, Path): From 8ecc510de98393e709a03012e40aa04f0dbd888b Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 29 Mar 2023 17:59:25 +0200 Subject: [PATCH 058/125] (feat): add categorical array view functionality --- anndata/experimental/read_remote/read_remote.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/anndata/experimental/read_remote/read_remote.py 
b/anndata/experimental/read_remote/read_remote.py index d7d5034ec..bbe39cfc6 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -87,6 +87,16 @@ def __ne__(self, __o) -> np.ndarray: return ~(self == __o) +@_subset.register(LazyCategoricalArray) +def _subset_lazy_cat(a: LazyCategoricalArray, subset_idx: Index): + return a[subset_idx] + + +@as_view.register(pd.Categorical) +def _subset_lazy_cat(a: pd.Categorical, view_args): + return a + + class AxisArraysRemote(AxisArrays): def __getattr__(self, __name: str): # If we a method has been accessed that is not here, try the pandas implementation From 1e919e4d1327a22b3497f585876e320f69ef19f5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 29 Mar 2023 17:59:32 +0200 Subject: [PATCH 059/125] (feat): allow indexing into a view --- anndata/_core/index.py | 4 +- .../experimental/read_remote/read_remote.py | 68 +++++++++++-------- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 859c1bcdd..112e4a1a5 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from scipy.sparse import spmatrix, issparse -from ..compat import AwkArray, DaskArray, Index, Index1D +from ..compat import AwkArray, DaskArray, Index, Index1D, ZarrArray def _normalize_indices( @@ -121,6 +121,8 @@ def _subset(a: Union[np.ndarray, pd.DataFrame], subset_idx: Index): # Correcting for indexing behaviour of np.ndarray if all(isinstance(x, cabc.Iterable) for x in subset_idx): subset_idx = np.ix_(*subset_idx) + if isinstance(a, ZarrArray): + return a.oindex[subset_idx] return a[subset_idx] diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index bbe39cfc6..57a6d9058 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -14,7 +14,7 @@ Tuple, ) from anndata._core.access import ElementRef -from anndata._core.aligned_mapping import Layers, PairwiseArrays +from anndata._core.aligned_mapping import AlignedMapping, Layers, PairwiseArrays from anndata._core.anndata import StorageType, _check_2d_shape, _gen_dataframe from anndata._core.anndata_base import AbstractAnnData from anndata._core.file_backing import AnnDataFileManager @@ -142,17 +142,26 @@ def __init__( oidx=None, vidx=None, ): - self._oidx = oidx - self._vidx = vidx self._is_view = False if oidx is not None and vidx is not None: # and or or? self._is_view = True # hack needed for clean use of views below + adata_ref = self # init from AnnData if issubclass(type(X), AbstractAnnData): if any((obs, var, uns, obsm, varm, obsp, varp)): raise ValueError( "If `X` is a dict no further arguments must be provided." 
) + if X.is_view: + prev_oidx, prev_vidx = X._oidx, X._vidx + self._oidx, self._vidx = _resolve_idxs( + (prev_oidx, prev_vidx), (oidx, vidx), X._X + ) + else: + self._oidx = oidx + self._vidx = vidx + if self._is_view: + adata_ref = X # seems to work X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( X._X, X.obs, @@ -165,6 +174,9 @@ def __init__( X.layers, X.raw, ) + else: + self._oidx = oidx + self._vidx = vidx if X is not None: for s_type in StorageType: @@ -193,32 +205,30 @@ def __init__( self.var_names = var["index"] if "index" in var else var["_index"] if vidx is not None: self.var_names = self.var_names[vidx] - - adata_ref = self - if self._is_view: - adata_ref = X # seems to work - self.obs = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obs))._view( - self, (oidx,) - ) - self.var = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(var))._view( - self, (vidx,) - ) - - self.obsm = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obsm))._view( - self, (oidx,) - ) - self.varm = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(varm))._view( - self, (vidx,) - ) - - self.obsp = PairwiseArrays(adata_ref, 0, vals=convert_to_dict(obsp))._view( - self, oidx - ) - self.varp = PairwiseArrays(adata_ref, 1, vals=convert_to_dict(varp))._view( - self, vidx - ) - - self.layers = Layers(layers)._view(self, (oidx, vidx)) + self.obs = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obs)) + if self.is_view: + self.obs = self.obs._view(self, (oidx,)) + self.var = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(var)) + if self.is_view: + self.var = self.var._view(self, (vidx,)) + + self.obsm = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obsm)) + if self.is_view: + self.obsm = self.obsm._view(self, (oidx,)) + self.varm = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(varm)) + if self.is_view: + self.varm = self.varm._view(self, (vidx,)) + + self.obsp = PairwiseArrays(adata_ref, 0, vals=convert_to_dict(obsp)) + if self.is_view: + self.obsp = self.obsp._view(self, oidx) + self.varp = PairwiseArrays(adata_ref, 1, vals=convert_to_dict(varp)) + if self.is_view: + self.varp = self.varp._view(self, vidx) + + self.layers = Layers(layers) + if self.is_view: + self.layers = self.layers._view(self, (oidx, vidx)) self.uns = uns or OrderedDict() def __getitem__(self, index: Index) -> "AnnData": From 03757e43f5d3e4dab640ad2d768ddb45f8c0bbdb Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 30 Mar 2023 10:34:16 +0200 Subject: [PATCH 060/125] (chore): remove print statement --- anndata/_io/zarr.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/_io/zarr.py b/anndata/_io/zarr.py index f57e12755..6558886dc 100644 --- a/anndata/_io/zarr.py +++ b/anndata/_io/zarr.py @@ -48,7 +48,6 @@ def callback(func, s, k, elem, dataset_kwargs, iospec): func(s, k, elem, dataset_kwargs=dataset_kwargs) write_dispatched(f, "/", adata, callback=callback, dataset_kwargs=ds_kwargs) - print(f) zarr.consolidate_metadata(f.store) From fef1e3842d33beed881d09b0bc16496c61cf7559 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 30 Mar 2023 10:34:46 +0200 Subject: [PATCH 061/125] (style): batch `is_view` checks --- .../experimental/read_remote/read_remote.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 57a6d9058..e1009dbf0 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -206,28 +206,22 @@ 
def __init__( if vidx is not None: self.var_names = self.var_names[vidx] self.obs = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obs)) - if self.is_view: - self.obs = self.obs._view(self, (oidx,)) self.var = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(var)) - if self.is_view: - self.var = self.var._view(self, (vidx,)) self.obsm = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obsm)) - if self.is_view: - self.obsm = self.obsm._view(self, (oidx,)) self.varm = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(varm)) - if self.is_view: - self.varm = self.varm._view(self, (vidx,)) self.obsp = PairwiseArrays(adata_ref, 0, vals=convert_to_dict(obsp)) - if self.is_view: - self.obsp = self.obsp._view(self, oidx) self.varp = PairwiseArrays(adata_ref, 1, vals=convert_to_dict(varp)) - if self.is_view: - self.varp = self.varp._view(self, vidx) self.layers = Layers(layers) if self.is_view: + self.obs = self.obs._view(self, (oidx,)) + self.var = self.var._view(self, (vidx,)) + self.obsm = self.obsm._view(self, (oidx,)) + self.varm = self.varm._view(self, (vidx,)) + self.obsp = self.obsp._view(self, oidx) + self.varp = self.varp._view(self, vidx) self.layers = self.layers._view(self, (oidx, vidx)) self.uns = uns or OrderedDict() From 6e592b35f02268f1037dc622034ac7f4b9b15c16 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 30 Mar 2023 11:45:58 +0200 Subject: [PATCH 062/125] (fix): use `sparse_dataset` in remote --- anndata/experimental/read_remote/read_remote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index e1009dbf0..1999d7ddf 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -20,7 +20,7 @@ from anndata._core.file_backing import AnnDataFileManager from anndata._core.index import Index, _normalize_indices, _subset from anndata._core.raw import Raw -from anndata._core.sparse_dataset import SparseDataset +from anndata._core.sparse_dataset import sparse_dataset from anndata._core.views import _resolve_idxs, as_view from anndata._io.specs.registry import read_elem from anndata.compat import _move_adj_mtx, _read_attr @@ -412,7 +412,7 @@ def callback(func, elem_name: str, elem, iospec): elif iospec.encoding_type in {"array", "string-array"}: return elem elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: - return SparseDataset(elem).to_backed() + return sparse_dataset(elem).to_backed() return func(elem) adata = read_dispatched(f, callback=callback) From c110b3b1263c21035b0e972c8de6ff86e84d84e0 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 30 Mar 2023 12:24:50 +0200 Subject: [PATCH 063/125] (feat): add support for general index columns --- anndata/experimental/read_remote/read_remote.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 1999d7ddf..d5c1332cf 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -134,8 +134,7 @@ def __init__( raw=None, dtype=None, shape=None, - filename=None, - filemode=None, + file=None, *, obsp=None, varp=None, @@ -199,10 +198,10 @@ def __init__( self._X = None # annotations - need names already for AxisArrays to work. 
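# Sketch of the sparse_dataset factory wired into the callback above: it
# dispatches on the group's encoding metadata and returns the matching
# dataset class. The store path is illustrative; "X" must be a csr/csc-
# encoded group written by anndata.
import zarr
from anndata._core.sparse_dataset import sparse_dataset

x_group = zarr.open("adata.zarr", mode="r")["X"]
backed = sparse_dataset(x_group).to_backed()  # scipy-like, data stays on disk
in_mem = sparse_dataset(x_group).to_memory()  # plain scipy sparse matrix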
- self.obs_names = obs["index"] if "index" in obs else obs["_index"] + self.obs_names = obs[file["obs"].attrs["_index"]] if oidx is not None: self.obs_names = self.obs_names[oidx] - self.var_names = var["index"] if "index" in var else var["_index"] + self.var_names = var[file["var"].attrs["_index"]] if vidx is not None: self.var_names = self.var_names[vidx] self.obs = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obs)) @@ -394,7 +393,7 @@ def callback(func, elem_name: str, elem, iospec): else [(k, elem[k]) for k in cols if k in elem] ) return AnnDataRemote( - **{k: read_dispatched(v, callback) for k, v in iter_object} + **{k: read_dispatched(v, callback) for k, v in iter_object}, file=elem ) elif elem_name.startswith("/raw"): return None From c50e4f691f142dade254ac09a84f304d951b9922 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 30 Mar 2023 12:26:39 +0200 Subject: [PATCH 064/125] (feat): add support for `raw` --- anndata/_core/anndata_base.py | 10 ++++++++++ anndata/experimental/read_remote/read_remote.py | 17 +++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/anndata/_core/anndata_base.py b/anndata/_core/anndata_base.py index 24d58ca6e..ddd79cccd 100644 --- a/anndata/_core/anndata_base.py +++ b/anndata/_core/anndata_base.py @@ -87,6 +87,16 @@ def varp(self): def varp(self, varp): pass + @property + @abstractmethod + def raw(self): + pass + + @raw.setter + @abstractmethod + def raw(self, raw): + pass + @property def n_obs(self) -> int: return len(self.obs_names) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index d5c1332cf..13e65f9b7 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -224,6 +224,13 @@ def __init__( self.layers = self.layers._view(self, (oidx, vidx)) self.uns = uns or OrderedDict() + if not raw: + self.raw = None + elif isinstance(raw, cabc.Mapping): + self.raw = Raw(self, **raw) + else: # is a Raw from another AnnData + self.raw = Raw(self, raw._X, raw.var, raw.varm) + def __getitem__(self, index: Index) -> "AnnData": """Returns a sliced view of the object.""" oidx, vidx = self._normalize_indices(index) @@ -340,6 +347,16 @@ def raw(self): def raw(self, raw): self._raw = raw + @property + def raw(self): + if hasattr(self, f"_raw"): + return self._raw + return None + + @raw.setter + def raw(self, raw): + self._raw = raw + @property def is_view(self) -> bool: """`True` if object is view of another AnnData object, `False` otherwise.""" From 95f2e3f7979f052e15f812ba00a1921505921763 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 30 Mar 2023 13:00:21 +0200 Subject: [PATCH 065/125] (fix): ensure `file` is always present --- anndata/experimental/read_remote/read_remote.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 13e65f9b7..acc22cd86 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -161,6 +161,8 @@ def __init__( self._vidx = vidx if self._is_view: adata_ref = X # seems to work + if file is None: + file = X.file X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( X._X, X.obs, @@ -198,10 +200,11 @@ def __init__( self._X = None # annotations - need names already for AxisArrays to work. 
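# Sketch of the index-column convention the lines below rely on: anndata's
# dataframe encoding records the name of the index column in the group's
# attrs, so obs/var names can be fetched without decoding every column
# (store path illustrative):
import zarr

f = zarr.open("adata.zarr", mode="r")
index_col = f["obs"].attrs["_index"]
obs_names = f["obs"][index_col][:]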
- self.obs_names = obs[file["obs"].attrs["_index"]] + self.file = file + self.obs_names = obs[self.file["obs"].attrs["_index"]] if oidx is not None: self.obs_names = self.obs_names[oidx] - self.var_names = var[file["var"].attrs["_index"]] + self.var_names = var[self.file["var"].attrs["_index"]] if vidx is not None: self.var_names = self.var_names[vidx] self.obs = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obs)) From 0e45fa69ddfa09812b838da249e1698bd520d9ab Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 30 Mar 2023 13:06:23 +0200 Subject: [PATCH 066/125] (feat): add some checks. --- anndata/experimental/read_remote/read_remote.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index acc22cd86..03cac605f 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -234,6 +234,17 @@ def __init__( else: # is a Raw from another AnnData self.raw = Raw(self, raw._X, raw.var, raw.varm) + self._run_checks() + + def _run_checks(self): + assert len(self.obs_names) == self.shape[0] + assert len(self.var_names) == self.shape[1] + assert len(self.obs_names) == self.X.shape[0] + assert len(self.var_names) == self.X.shape[1] + for layer in self.layers: + assert len(self.obs_names) == layer.shape[0] + assert len(self.var_names) == layer.shape[1] + def __getitem__(self, index: Index) -> "AnnData": """Returns a sliced view of the object.""" oidx, vidx = self._normalize_indices(index) From 27849c57388f34922e7b4a27fb54d6f963b9dcbc Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 30 Mar 2023 11:10:35 +0000 Subject: [PATCH 067/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/_core/sparse_dataset.py | 1 + .../experimental/read_remote/read_remote.py | 20 +++++++++---------- anndata/tests/test_backed_sparse.py | 8 +++----- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 112d154cc..e9b5e4824 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -392,6 +392,7 @@ class CSRDataset(BaseCompressedSparseDataset): class CSCDataset(BaseCompressedSparseDataset): format_str = "csc" + def sparse_dataset(group) -> BaseCompressedSparseDataset: # encoding_type = _read_attr(group, "encoding-type") encoding_type = _get_group_format(group) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 03cac605f..9f8431d2a 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -271,7 +271,7 @@ def var_names(self, names: Sequence[str]): @property def X(self): - if hasattr(self, f"_X"): + if hasattr(self, "_X"): if self.is_view: return self._X[self._oidx, self._vidx] return self._X @@ -283,7 +283,7 @@ def X(self, X): @property def obs(self): - if hasattr(self, f"_obs"): + if hasattr(self, "_obs"): return self._obs return None @@ -293,7 +293,7 @@ def obs(self, obs): @property def obsm(self): - if hasattr(self, f"_obsm"): + if hasattr(self, "_obsm"): return self._obsm return None @@ -303,7 +303,7 @@ def obsm(self, obsm): @property def obsp(self): - if hasattr(self, f"_obsp"): + if hasattr(self, "_obsp"): return self._obsp return None @@ -313,7 +313,7 @@ def obsp(self, obsp): @property def 
var(self): - if hasattr(self, f"_var"): + if hasattr(self, "_var"): return self._var return None @@ -323,7 +323,7 @@ def var(self, var): @property def uns(self): - if hasattr(self, f"_uns"): + if hasattr(self, "_uns"): return self._uns return None @@ -333,7 +333,7 @@ def uns(self, uns): @property def varm(self): - if hasattr(self, f"_varm"): + if hasattr(self, "_varm"): return self._varm return None @@ -343,7 +343,7 @@ def varm(self, varm): @property def varp(self): - if hasattr(self, f"_varp"): + if hasattr(self, "_varp"): return self._varp return None @@ -353,7 +353,7 @@ def varp(self, varp): @property def raw(self): - if hasattr(self, f"_raw"): + if hasattr(self, "_raw"): return self._raw return None @@ -363,7 +363,7 @@ def raw(self, raw): @property def raw(self): - if hasattr(self, f"_raw"): + if hasattr(self, "_raw"): return self._raw return None diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py index cc3ba2608..83863fcc4 100644 --- a/anndata/tests/test_backed_sparse.py +++ b/anndata/tests/test_backed_sparse.py @@ -41,6 +41,7 @@ def ondisk_equivalent_adata(tmp_path, diskfmt): csc_disk = ad.read_h5ad(csc_path, backed="r") dense_disk = ad.read_h5ad(dense_path, backed="r") else: + def read_zarr_backed(path): path = str(path) @@ -50,10 +51,7 @@ def read_zarr_backed(path): def callback(func, elem_name, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): return AnnData( - **{ - k: read_dispatched(v, callback) - for k, v in elem.items() - } + **{k: read_dispatched(v, callback) for k, v in elem.items()} ) if iospec.encoding_type in {"csc_matrix", "csr_matrix"}: return sparse_dataset(elem).to_backed() @@ -176,4 +174,4 @@ def test_wrong_formats(tmp_path): post_checks = disk_mtx.to_memory() # Check nothing changed - assert not np.any((pre_checks != post_checks).toarray()) \ No newline at end of file + assert not np.any((pre_checks != post_checks).toarray()) From cec72bc9f1099ece48511ebbcef34be80bb53bfb Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 30 Mar 2023 13:32:29 +0200 Subject: [PATCH 068/125] (feat): add `indptr` caching --- anndata/_core/sparse_dataset.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 112d154cc..57985fa30 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -46,6 +46,8 @@ class BackedSparseMatrix(_cs_matrix): since that calls copy on `.data`, `.indices`, and `.indptr`. 
""" + _cached_indptr = None + def copy(self) -> ss.spmatrix: if isinstance(self.data, h5py.Dataset): return sparse_dataset(self.data.parent).to_memory() @@ -116,6 +118,17 @@ def _offsets( ) return offsets + @property + def indptr(self): + if self._cached_indptr is None: + self._cached_indptr = self._indptr[:] + return self._cached_indptr + + @indptr.setter + def indptr(self, indptr): + self._indptr = indptr + self._cached_indptr = None + class backed_csr_matrix(BackedSparseMatrix, ss.csr_matrix): def _get_intXslice(self, row: int, col: slice) -> ss.csr_matrix: @@ -373,7 +386,7 @@ def to_backed(self) -> BackedSparseMatrix: mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"] mtx.indices = self.group["indices"] - mtx.indptr = self.group["indptr"][:] + mtx.indptr = self.group["indptr"] return mtx def to_memory(self) -> ss.spmatrix: @@ -392,6 +405,7 @@ class CSRDataset(BaseCompressedSparseDataset): class CSCDataset(BaseCompressedSparseDataset): format_str = "csc" + def sparse_dataset(group) -> BaseCompressedSparseDataset: # encoding_type = _read_attr(group, "encoding-type") encoding_type = _get_group_format(group) From ff20982d342c0beddc60dbe30c85392fd1c41bb9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 19 Apr 2023 12:21:29 +0200 Subject: [PATCH 069/125] (fix): don't use `.X` in `__init__` --- anndata/experimental/read_remote/read_remote.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 9f8431d2a..d5a9cb3c6 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -239,8 +239,6 @@ def __init__( def _run_checks(self): assert len(self.obs_names) == self.shape[0] assert len(self.var_names) == self.shape[1] - assert len(self.obs_names) == self.X.shape[0] - assert len(self.var_names) == self.X.shape[1] for layer in self.layers: assert len(self.obs_names) == layer.shape[0] assert len(self.var_names) == layer.shape[1] @@ -273,7 +271,7 @@ def var_names(self, names: Sequence[str]): def X(self): if hasattr(self, "_X"): if self.is_view: - return self._X[self._oidx, self._vidx] + return _subset(self._X, (self._oidx, self._vidx)) return self._X return None From 8d315be2483a18c45e44cff4be4d026ced36410a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 19 Apr 2023 12:21:36 +0200 Subject: [PATCH 070/125] (feat): use dask for raw arrays --- anndata/experimental/read_remote/read_remote.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index d5a9cb3c6..0191bd26e 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -30,6 +30,7 @@ import pandas as pd import numpy as np from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin +import dask.array as da from ..._core import AnnData, AxisArrays from .. 
import read_dispatched @@ -438,7 +439,7 @@ def callback(func, elem_name: str, elem, iospec): elif iospec.encoding_type == "categorical": return LazyCategoricalArray(elem) elif iospec.encoding_type in {"array", "string-array"}: - return elem + return da.from_zarr(elem) elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: return sparse_dataset(elem).to_backed() return func(elem) From 772b96810badea1ca3922385cc160e88cbb4ef10 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 19 Apr 2023 15:57:05 +0200 Subject: [PATCH 071/125] (fix): clean categories --- anndata/experimental/read_remote/read_remote.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 0191bd26e..5cf5f6966 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -76,7 +76,7 @@ def __getitem__(self, selection) -> pd.Categorical: codes=codes, categories=self.categories, ordered=self.ordered, - ) + ).remove_unused_categories() def __repr__(self) -> str: return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" From 53b28a8e0b903836234247952e4280c57a4de4ac Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 19 Apr 2023 15:57:46 +0200 Subject: [PATCH 072/125] (fix): 1d axis array view \`to_df\` --- .../experimental/read_remote/read_remote.py | 41 ++++++++++++++----- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 5cf5f6966..134076c1c 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -14,7 +14,12 @@ Tuple, ) from anndata._core.access import ElementRef -from anndata._core.aligned_mapping import AlignedMapping, Layers, PairwiseArrays +from anndata._core.aligned_mapping import ( + AlignedMapping, + Layers, + PairwiseArrays, + AxisArraysView, +) from anndata._core.anndata import StorageType, _check_2d_shape, _gen_dataframe from anndata._core.anndata_base import AbstractAnnData from anndata._core.file_backing import AnnDataFileManager @@ -105,14 +110,6 @@ def __getattr__(self, __name: str): return self.to_df().__getattribute__(__name) return object.__getattribute__(self, __name) - def to_df(self) -> pd.DataFrame: - """Convert to pandas dataframe.""" - df = pd.DataFrame(index=self.dim_names) - for key in self.keys(): - if "index" not in key: - df[key] = self[key][()] - return df - @property def iloc(self): class IlocDispatch: @@ -122,6 +119,28 @@ def __getitem__(self_iloc, idx): return IlocDispatch() +def to_df_1d_axis_arrays(axis_arrays): + """Convert to pandas dataframe.""" + df = pd.DataFrame(index=axis_arrays.dim_names) + for key in axis_arrays.keys(): + if "index" not in key: + df[key] = axis_arrays[key][()] + return df + + +class AxisArrays1dRemote(AxisArraysRemote): + def to_df(self) -> pd.DataFrame: + return to_df_1d_axis_arrays(self) + + +class AxisArraysRemoteView(AxisArraysView): + def to_df(self) -> pd.DataFrame: + return to_df_1d_axis_arrays(self) + + +AxisArrays1dRemote._view_class = AxisArraysRemoteView + + class AnnDataRemote(AbstractAnnData): def __init__( self, @@ -208,8 +227,8 @@ def __init__( self.var_names = var[self.file["var"].attrs["_index"]] if vidx is not None: self.var_names = self.var_names[vidx] - self.obs = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obs)) - self.var = AxisArraysRemote(adata_ref, 
1, vals=convert_to_dict(var)) + self.obs = AxisArrays1dRemote(adata_ref, 0, vals=convert_to_dict(obs)) + self.var = AxisArrays1dRemote(adata_ref, 1, vals=convert_to_dict(var)) self.obsm = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obsm)) self.varm = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(varm)) From cc76538fd225caee4b3e143377dcbd172c026b8d Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 19 Apr 2023 15:58:33 +0200 Subject: [PATCH 073/125] (chore): tests --- anndata/tests/test_read_remote.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/anndata/tests/test_read_remote.py b/anndata/tests/test_read_remote.py index 73d1fbd93..cfd5dedf7 100644 --- a/anndata/tests/test_read_remote.py +++ b/anndata/tests/test_read_remote.py @@ -99,6 +99,17 @@ def test_read_write_full(adata, tmp_path): assert (adata.var == remote.var.to_df()).all().all() +def test_read_write_view(adata, tmp_path): + base_pth = Path(tmp_path) + orig_pth = base_pth / "orig.zarr" + adata.write_zarr(orig_pth) + remote = read_remote(orig_pth) + subset = adata.obs["oanno1"] == "cat1" + assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X)) + assert (adata[subset, :].obs == remote[subset, :].obs.to_df()).all().all() + assert (adata[subset, :].var == remote[subset, :].var.to_df()).all().all() + + def test_lazy_categorical_array_properties(categorical_zarr_group): arr = LazyCategoricalArray(categorical_zarr_group) assert len(arr[0:3]) == 3 From e338ce1a45478e2ee77044e63aec86e9182c0219 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 4 May 2023 13:56:35 +0200 Subject: [PATCH 074/125] (feat): lazy subset mechanism for lazy cat array --- .../experimental/read_remote/read_remote.py | 45 ++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index 134076c1c..f9fd70009 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -1,5 +1,5 @@ from collections import OrderedDict, abc as cabc -from copy import copy +from copy import copy, deepcopy from functools import cached_property from pathlib import Path from typing import ( @@ -26,7 +26,7 @@ from anndata._core.index import Index, _normalize_indices, _subset from anndata._core.raw import Raw from anndata._core.sparse_dataset import sparse_dataset -from anndata._core.views import _resolve_idxs, as_view +from anndata._core.views import _resolve_idx, as_view, _resolve_idxs from anndata._io.specs.registry import read_elem from anndata.compat import _move_adj_mtx, _read_attr from anndata.utils import convert_to_dict @@ -42,7 +42,7 @@ class LazyCategoricalArray(ExplicitlyIndexedNDArrayMixin): - __slots__ = ("codes", "attrs", "_categories", "_categories_cache") + __slots__ = ("codes", "attrs", "_categories", "_categories_cache", "_subset_idx") def __init__(self, group, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group @@ -53,6 +53,7 @@ def __init__(self, group, *args, **kwargs): self.codes = group["codes"] self._categories = group["categories"] self._categories_cache = None + self._subset_idx = None self.attrs = dict(group.attrs) @property @@ -61,20 +62,45 @@ def categories(self): # __slots__ and cached_property are incompatible self._categories_cache = self._categories[...] 
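# Sketch of the manual caching idiom used in `categories` here:
# functools.cached_property needs an instance __dict__, which __slots__
# removes, so a None-sentinel slot stands in. Self-contained illustration:
import numpy as np


class SlottedCache:
    __slots__ = ("_src", "_cache")

    def __init__(self, src):
        self._src = src
        self._cache = None

    @property
    def values(self):
        if self._cache is None:  # pay the (possibly remote) read cost once
            self._cache = np.asarray(self._src)[...]
        return self._cache


arr = SlottedCache([1, 2, 3])
assert arr.values is arr.values  # second access hits the cache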
return self._categories_cache + @property + def subset_idx(self): + return self._subset_idx + + @subset_idx.setter + def subset_idx(self, new_idx): + idx = ( + new_idx + if self._subset_idx is None + else _resolve_idx(self._subset_idx, new_idx, self.shape[0]) + ) + self._subset_idx = idx + @property def dtype(self) -> pd.CategoricalDtype: return pd.CategoricalDtype(self.categories, self.ordered) @property def shape(self) -> Tuple[int, ...]: - return self.codes.shape + if self.subset_idx is None: + return self.codes.shape + if isinstance(self.subset_idx, slice): + if self.subset_idx == slice(None, None, None): + return self.codes.shape + return (slice.stop - slice.start,) + else: + return (len(self.subset_idx),) @property def ordered(self): return bool(self.attrs["ordered"]) def __getitem__(self, selection) -> pd.Categorical: - codes = self.codes.oindex[selection] + idx = ( + selection + if self.subset_idx is None + else _resolve_idx(self.subset_idx, selection, self.shape[0]) + ) + codes = self.codes.oindex[idx] if codes.shape == (): # handle 0d case codes = np.array([codes]) return pd.Categorical.from_codes( @@ -95,7 +121,9 @@ def __ne__(self, __o) -> np.ndarray: @_subset.register(LazyCategoricalArray) def _subset_lazy_cat(a: LazyCategoricalArray, subset_idx: Index): - return a[subset_idx] + a_copy = deepcopy(a) + a_copy.subset_idx = subset_idx[0] # this is a tuple? + return a_copy @as_view.register(pd.Categorical) @@ -103,6 +131,11 @@ def _subset_lazy_cat(a: pd.Categorical, view_args): return a +@as_view.register(LazyCategoricalArray) +def _subset_lazy_cat(a: LazyCategoricalArray, view_args): + return a + + class AxisArraysRemote(AxisArrays): def __getattr__(self, __name: str): # If we a method has been accessed that is not here, try the pandas implementation From 823349609aacf131c0aeb56ee83fb98ab11c35e9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 4 May 2023 13:56:49 +0200 Subject: [PATCH 075/125] (feat): finish all dtypes --- anndata/_core/sparse_dataset.py | 10 +- .../experimental/read_remote/read_remote.py | 152 +++++++++++++----- anndata/tests/test_read_remote.py | 124 ++++++++------ 3 files changed, 197 insertions(+), 89 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 57985fa30..79a46620c 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -13,15 +13,17 @@ from abc import ABC import collections.abc as cabc from itertools import accumulate, chain +from pathlib import Path from typing import Union, NamedTuple, Tuple, Sequence, Iterable, Type from warnings import warn import h5py +import zarr import numpy as np import scipy.sparse as ss from scipy.sparse import _sparsetools -from ..compat import _read_attr +from ..compat import _read_attr, ZarrArray try: # Not really important, just for IDEs to be more helpful @@ -51,6 +53,12 @@ class BackedSparseMatrix(_cs_matrix): def copy(self) -> ss.spmatrix: if isinstance(self.data, h5py.Dataset): return sparse_dataset(self.data.parent).to_memory() + if isinstance(self.data, ZarrArray): + return sparse_dataset( + zarr.open( + store=self.data.store, path=Path(self.data.path).parent, mode="r" + ) + ).to_memory() else: return super().copy() diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index f9fd70009..aa405cb21 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -1,5 +1,6 @@ from collections import OrderedDict, abc as cabc from 
copy import copy, deepcopy +from enum import Enum from functools import cached_property from pathlib import Path from typing import ( @@ -41,8 +42,42 @@ from .. import read_dispatched -class LazyCategoricalArray(ExplicitlyIndexedNDArrayMixin): - __slots__ = ("codes", "attrs", "_categories", "_categories_cache", "_subset_idx") +class MaskedArrayMixIn(ExplicitlyIndexedNDArrayMixin): + def _resolve_idx(self, new_idx): + return ( + new_idx + if self.subset_idx is None + else _resolve_idx(self.subset_idx, new_idx, self.shape[0]) + ) + + @property + def subset_idx(self): + return self._subset_idx + + @subset_idx.setter + def subset_idx(self, new_idx): + self._subset_idx = self._resolve_idx(new_idx) + + @property + def shape(self) -> Tuple[int, ...]: + if self.subset_idx is None: + return self.values.shape + if isinstance(self.subset_idx, slice): + if self.subset_idx == slice(None, None, None): + return self.values.shape + return (self.subset_idx.stop - self.subset_idx.start,) + else: + return (len(self.subset_idx),) + + def __eq__(self, __o) -> np.ndarray: + return self[()] == __o + + def __ne__(self, __o) -> np.ndarray: + return ~(self == __o) + + +class LazyCategoricalArray(MaskedArrayMixIn): + __slots__ = ("values", "attrs", "_categories", "_categories_cache", "_subset_idx") def __init__(self, group, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group @@ -50,7 +85,7 @@ def __init__(self, group, *args, **kwargs): Args: group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr """ - self.codes = group["codes"] + self.values = group["codes"] self._categories = group["categories"] self._categories_cache = None self._subset_idx = None @@ -62,45 +97,17 @@ def categories(self): # __slots__ and cached_property are incompatible self._categories_cache = self._categories[...] 
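# Sketch of the nullable encoding LazyMaskedArray targets below: a "values"
# array plus an optional boolean "mask" (True marks a missing entry), which
# map directly onto pandas' masked arrays. Values are illustrative:
import numpy as np
import pandas as pd

values = np.array([1, 2, 3], dtype="int64")
mask = np.array([False, True, False])
arr = pd.arrays.IntegerArray(values, mask=mask)
assert pd.isna(arr[1]) and arr[0] == 1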
return self._categories_cache - @property - def subset_idx(self): - return self._subset_idx - - @subset_idx.setter - def subset_idx(self, new_idx): - idx = ( - new_idx - if self._subset_idx is None - else _resolve_idx(self._subset_idx, new_idx, self.shape[0]) - ) - self._subset_idx = idx - @property def dtype(self) -> pd.CategoricalDtype: return pd.CategoricalDtype(self.categories, self.ordered) - @property - def shape(self) -> Tuple[int, ...]: - if self.subset_idx is None: - return self.codes.shape - if isinstance(self.subset_idx, slice): - if self.subset_idx == slice(None, None, None): - return self.codes.shape - return (slice.stop - slice.start,) - else: - return (len(self.subset_idx),) - @property def ordered(self): return bool(self.attrs["ordered"]) def __getitem__(self, selection) -> pd.Categorical: - idx = ( - selection - if self.subset_idx is None - else _resolve_idx(self.subset_idx, selection, self.shape[0]) - ) - codes = self.codes.oindex[idx] + idx = self._resolve_idx(selection) + codes = self.values.oindex[idx] if codes.shape == (): # handle 0d case codes = np.array([codes]) return pd.Categorical.from_codes( @@ -112,27 +119,78 @@ def __getitem__(self, selection) -> pd.Categorical: def __repr__(self) -> str: return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" - def __eq__(self, __o) -> np.ndarray: - return self[()] == __o - def __ne__(self, __o) -> np.ndarray: - return ~(self == __o) +class LazyMaskedArray(MaskedArrayMixIn): + __slots__ = ("mask", "values", "_subset_idx", "_dtype_str") + def __init__(self, group, dtype_str, *args, **kwargs): + """Class for lazily reading categorical data from formatted zarr group -@_subset.register(LazyCategoricalArray) -def _subset_lazy_cat(a: LazyCategoricalArray, subset_idx: Index): + Args: + group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr + dtype_str (Nullable): group containing "codes" and "categories" key as well as "ordered" attr + """ + self.values = group["values"] + self.mask = group["mask"] if "mask" in group else None + self._subset_idx = None + self._dtype_str = dtype_str + + @property + def dtype(self) -> pd.CategoricalDtype: + if self.mask is not None: + if self._dtype_str == "nullable-integer": + return pd.arrays.IntegerArray + elif self._dtype_str == "nullable-boolean": + return pd.arrays.BooleanArray + return pd.array + + def __getitem__(self, selection) -> pd.Categorical: + idx = self._resolve_idx(selection) + values = self.values[idx] + if self.mask is not None: + mask = self.mask[idx] + if self._dtype_str == "nullable-integer": + return pd.arrays.IntegerArray(values, mask=mask) + elif self._dtype_str == "nullable-boolean": + return pd.arrays.BooleanArray(values, mask=mask) + return pd.array(values) + + def __repr__(self) -> str: + if self._dtype_str == "nullable-integer": + return f"LazyNullableIntegerArray" + elif self._dtype_str == "nullable-boolean": + return f"LazyNullableBooleanArray" + + +@_subset.register(MaskedArrayMixIn) +def _subset_masked(a: MaskedArrayMixIn, subset_idx: Index): a_copy = deepcopy(a) a_copy.subset_idx = subset_idx[0] # this is a tuple? 
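        # Composition note: assigning through the subset_idx setter resolves
        # the incoming index against any index already stored (via
        # _resolve_idx), so repeated lazy subsets collapse into a single
        # index rather than nesting wrapper objects.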
return a_copy +@as_view.register(MaskedArrayMixIn) +def _view_masked(a: MaskedArrayMixIn, view_args): + return a + + @as_view.register(pd.Categorical) -def _subset_lazy_cat(a: pd.Categorical, view_args): +def _view_pd_categorical(a: pd.Categorical, view_args): return a -@as_view.register(LazyCategoricalArray) -def _subset_lazy_cat(a: LazyCategoricalArray, view_args): +@as_view.register(pd.api.extensions.ExtensionArray) +def _view_pd_array(a: pd.api.extensions.ExtensionArray, view_args): + return a + + +@as_view.register(pd.arrays.IntegerArray) +def _view_pd_integer_array(a: pd.arrays.IntegerArray, view_args): + return a + + +@as_view.register(pd.arrays.BooleanArray) +def _view_pd_boolean_array(a: pd.arrays.BooleanArray, view_args): return a @@ -151,6 +209,10 @@ def __getitem__(self_iloc, idx): return IlocDispatch() + @property + def dim_names(self) -> pd.Index: + return (self.parent.obs_names, self.parent.var_names)[self._axis].compute() + def to_df_1d_axis_arrays(axis_arrays): """Convert to pandas dataframe.""" @@ -490,10 +552,16 @@ def callback(func, elem_name: str, elem, iospec): return {k: read_dispatched(v, callback) for k, v in iter_object} elif iospec.encoding_type == "categorical": return LazyCategoricalArray(elem) + elif "nullable" in iospec.encoding_type: + return LazyMaskedArray(elem, iospec.encoding_type) elif iospec.encoding_type in {"array", "string-array"}: return da.from_zarr(elem) elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: return sparse_dataset(elem).to_backed() + elif iospec.encoding_type in {"awkward-array"}: + return read_dispatched(elem, None) + elif iospec.encoding_type in {"dataframe"}: + return read_dispatched(elem, None) return func(elem) adata = read_dispatched(f, callback=callback) diff --git a/anndata/tests/test_read_remote.py b/anndata/tests/test_read_remote.py index cfd5dedf7..345c22dda 100644 --- a/anndata/tests/test_read_remote.py +++ b/anndata/tests/test_read_remote.py @@ -6,48 +6,14 @@ from scipy import sparse import zarr -import anndata as ad from anndata.tests.helpers import ( as_dense_dask_array, + gen_adata, subset_func, ) from anndata.experimental.read_remote import read_remote, LazyCategoricalArray from anndata.utils import asarray -subset_func2 = subset_func -# ------------------------------------------------------------------------------- -# Some test data -# ------------------------------------------------------------------------------- - - -@pytest.fixture -def adata(): - X_list = [ - [1, 2, 3], - [4, 5, 6], - [7, 8, 9], - ] # data matrix of shape n_obs x n_vars - X = np.array(X_list) - obs_dict = dict( # annotation of observations / rows - row_names=["name1", "name2", "name3"], # row annotation - oanno1=["cat1", "cat2", "cat2"], # categorical annotation - oanno2=["o1", "o2", "o3"], # string annotation - oanno3=[2.1, 2.2, 2.3], # float annotation - ) - var_dict = dict(vanno1=[3.1, 3.2, 3.3]) # annotation of variables / columns - uns_dict = dict( # unstructured annotation - oanno1_colors=["#000000", "#FFFFFF"], uns2=["some annotation"] - ) - return ad.AnnData( - X, - obs=obs_dict, - var=var_dict, - uns=uns_dict, - obsm=dict(o1=np.zeros((X.shape[0], 10))), - varm=dict(v1=np.ones((X.shape[1], 20))), - layers=dict(float=X.astype(float), sparse=sparse.csr_matrix(X)), - ) - @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix, np.array, as_dense_dask_array], @@ -66,7 +32,7 @@ def sparse_format(request): def categorical_zarr_group(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, 
mode="w") - z["codes"] = [0, 1, 0, 1, 1, 2, 2] + z["codes"] = [0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2] z["categories"] = ["foo", "bar", "jazz"] z.attrs["ordered"] = False z = zarr.open(base_path) @@ -78,36 +44,87 @@ def test_read_write_X(tmp_path, mtx_format): orig_pth = base_pth / "orig.zarr" # remote_pth = base_pth / "backed.zarr" - orig = ad.AnnData(mtx_format(asarray(sparse.random(10, 10, format="csr")))) + orig = gen_adata((1000, 1000), mtx_format) orig.write_zarr(orig_pth) remote = read_remote(orig_pth) # remote.write_zarr(remote_pth) # need to implement writing! assert np.all(asarray(orig.X) == asarray(remote.X)) - assert (orig.obs == remote.obs.to_df()).all().all() - assert (orig.var == remote.var.to_df()).all().all() + assert (orig.obs == remote.obs.to_df()[orig.obs.columns]).all().all() + assert (orig.var == remote.var.to_df()[orig.var.columns]).all().all() + assert (orig.obsm["array"] == remote.obsm["array"].compute()).all() -def test_read_write_full(adata, tmp_path): +def test_read_write_full(tmp_path, mtx_format): + adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" adata.write_zarr(orig_pth) remote = read_remote(orig_pth) assert np.all(asarray(adata.X) == asarray(remote.X)) - assert (adata.obs == remote.obs.to_df()).all().all() - assert (adata.var == remote.var.to_df()).all().all() + assert (adata.obs == remote.obs.to_df()[adata.obs.columns]).all().all() + assert (adata.var == remote.var.to_df()[adata.var.columns]).all().all() + assert (adata.obsm["array"] == remote.obsm["array"].compute()).all() -def test_read_write_view(adata, tmp_path): +def test_read_write_view(tmp_path, mtx_format): + adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" adata.write_zarr(orig_pth) remote = read_remote(orig_pth) - subset = adata.obs["oanno1"] == "cat1" + subset = adata.obs["obs_cat"] == "a" assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X)) - assert (adata[subset, :].obs == remote[subset, :].obs.to_df()).all().all() - assert (adata[subset, :].var == remote[subset, :].var.to_df()).all().all() + assert ( + (adata[subset, :].obs == remote[subset, :].obs.to_df()[adata.obs.columns]) + .all() + .all() + ) + assert ( + (adata[subset, :].var == remote[subset, :].var.to_df()[adata.var.columns]) + .all() + .all() + ) + assert ( + adata[subset, :].obsm["array"] == remote[subset, :].obsm["array"].compute() + ).all() + + +def test_read_write_view_of_view(tmp_path, mtx_format): + adata = gen_adata((1000, 1000), mtx_format) + base_pth = Path(tmp_path) + orig_pth = base_pth / "orig.zarr" + adata.write_zarr(orig_pth) + remote = read_remote(orig_pth) + subset = (adata.obs["obs_cat"] == "a") | (adata.obs["obs_cat"] == "b") + subsetted_adata = adata[subset, :] + subset_subset = subsetted_adata.obs["obs_cat"] == "b" + subsetted_subsetted_adata = subsetted_adata[subset_subset, :] + assert np.all( + asarray(subsetted_subsetted_adata.X) + == asarray(remote[subset, :][subset_subset, :].X) + ) + assert ( + ( + subsetted_subsetted_adata.obs + == remote[subset, :][subset_subset, :].obs.to_df()[adata.obs.columns] + ) + .all() + .all() + ) + assert ( + ( + subsetted_subsetted_adata.var + == remote[subset, :][subset_subset, :].var.to_df()[adata.var.columns] + ) + .all() + .all() + ) + assert ( + subsetted_subsetted_adata.obsm["array"] + == remote[subset, :][subset_subset, :].obsm["array"].compute() + ).all() def test_lazy_categorical_array_properties(categorical_zarr_group): @@ -123,3 
+140,18 @@ def test_lazy_categorical_array_equality(categorical_zarr_group): assert (arr[0] == "foo").all() assert (arr[3:5] == "bar").all() assert (arr == "foo").any() + + +def test_lazy_categorical_array_subset_subset(categorical_zarr_group): + arr = LazyCategoricalArray(categorical_zarr_group) + subset_susbet = arr[0:10][5:10] + assert len(subset_susbet) == 5 + assert type(subset_susbet) == pd.Categorical + assert ( + subset_susbet[()] + == pd.Categorical.from_codes( + codes=[2, 2, 1, 2, 0], + categories=["foo", "bar", "jazz"], + ordered=False, + ).remove_unused_categories() + ).all() From 05423c3219929c897f3c6328569d10e5f885e224 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 4 May 2023 12:26:44 +0000 Subject: [PATCH 076/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/experimental/read_remote/read_remote.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index aa405cb21..36efb4b99 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -157,9 +157,9 @@ def __getitem__(self, selection) -> pd.Categorical: def __repr__(self) -> str: if self._dtype_str == "nullable-integer": - return f"LazyNullableIntegerArray" + return "LazyNullableIntegerArray" elif self._dtype_str == "nullable-boolean": - return f"LazyNullableBooleanArray" + return "LazyNullableBooleanArray" @_subset.register(MaskedArrayMixIn) From 1f592b193af013125ea5079eb224778d70e6d1cf Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 5 May 2023 11:52:22 +0200 Subject: [PATCH 077/125] (fix): use base compressed class directly --- anndata/_core/anndata.py | 2 ++ anndata/experimental/read_remote/read_remote.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 1e9259366..dc956e91b 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -54,6 +54,7 @@ DaskArray, _move_adj_mtx, ) +from .sparse_dataset import BaseCompressedSparseDataset class StorageType(Enum): @@ -63,6 +64,7 @@ class StorageType(Enum): ZarrArray = ZarrArray ZappyArray = ZappyArray DaskArray = DaskArray + BaseCompressedSparseDataset = BaseCompressedSparseDataset @classmethod def classes(cls): diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_remote/read_remote.py index aa405cb21..00e4a2813 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_remote/read_remote.py @@ -157,9 +157,9 @@ def __getitem__(self, selection) -> pd.Categorical: def __repr__(self) -> str: if self._dtype_str == "nullable-integer": - return f"LazyNullableIntegerArray" + return "LazyNullableIntegerArray" elif self._dtype_str == "nullable-boolean": - return f"LazyNullableBooleanArray" + return "LazyNullableBooleanArray" @_subset.register(MaskedArrayMixIn) @@ -557,7 +557,7 @@ def callback(func, elem_name: str, elem, iospec): elif iospec.encoding_type in {"array", "string-array"}: return da.from_zarr(elem) elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: - return sparse_dataset(elem).to_backed() + return sparse_dataset(elem) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) elif iospec.encoding_type in {"dataframe"}: From a22be078a5545e46d29d24ad5e6f52d3d7dbec51 Mon Sep 17 
00:00:00 2001 From: ilan-gold Date: Mon, 8 May 2023 14:53:38 +0200 Subject: [PATCH 078/125] (chore): `read_remote` -> `read_backed` --- anndata/experimental/read_backed/__init__.py | 1 + .../read_remote.py => read_backed/read_backed.py} | 2 +- anndata/experimental/read_remote/__init__.py | 1 - ...read_remote.py => test_read_backed_experimental.py} | 10 +++++----- 4 files changed, 7 insertions(+), 7 deletions(-) create mode 100644 anndata/experimental/read_backed/__init__.py rename anndata/experimental/{read_remote/read_remote.py => read_backed/read_backed.py} (99%) delete mode 100644 anndata/experimental/read_remote/__init__.py rename anndata/tests/{test_read_remote.py => test_read_backed_experimental.py} (95%) diff --git a/anndata/experimental/read_backed/__init__.py b/anndata/experimental/read_backed/__init__.py new file mode 100644 index 000000000..e4f1a6c90 --- /dev/null +++ b/anndata/experimental/read_backed/__init__.py @@ -0,0 +1 @@ +from .read_backed import read_backed, LazyCategoricalArray diff --git a/anndata/experimental/read_remote/read_remote.py b/anndata/experimental/read_backed/read_backed.py similarity index 99% rename from anndata/experimental/read_remote/read_remote.py rename to anndata/experimental/read_backed/read_backed.py index 00e4a2813..6d212a4e9 100644 --- a/anndata/experimental/read_remote/read_remote.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -517,7 +517,7 @@ def __repr__(self): return descr -def read_remote(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: +def read_backed(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: if isinstance(store, Path): store = str(store) diff --git a/anndata/experimental/read_remote/__init__.py b/anndata/experimental/read_remote/__init__.py deleted file mode 100644 index 54657205f..000000000 --- a/anndata/experimental/read_remote/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .read_remote import read_remote, LazyCategoricalArray diff --git a/anndata/tests/test_read_remote.py b/anndata/tests/test_read_backed_experimental.py similarity index 95% rename from anndata/tests/test_read_remote.py rename to anndata/tests/test_read_backed_experimental.py index 345c22dda..d4db47a62 100644 --- a/anndata/tests/test_read_remote.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -11,7 +11,7 @@ gen_adata, subset_func, ) -from anndata.experimental.read_remote import read_remote, LazyCategoricalArray +from anndata.experimental.read_backed import read_backed, LazyCategoricalArray from anndata.utils import asarray @@ -47,7 +47,7 @@ def test_read_write_X(tmp_path, mtx_format): orig = gen_adata((1000, 1000), mtx_format) orig.write_zarr(orig_pth) - remote = read_remote(orig_pth) + remote = read_backed(orig_pth) # remote.write_zarr(remote_pth) # need to implement writing! 
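    # asarray normalizes both sides to a dense ndarray, so the round-trip
    # comparison below holds whichever storage format the fixture produced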
assert np.all(asarray(orig.X) == asarray(remote.X)) @@ -61,7 +61,7 @@ def test_read_write_full(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" adata.write_zarr(orig_pth) - remote = read_remote(orig_pth) + remote = read_backed(orig_pth) assert np.all(asarray(adata.X) == asarray(remote.X)) assert (adata.obs == remote.obs.to_df()[adata.obs.columns]).all().all() assert (adata.var == remote.var.to_df()[adata.var.columns]).all().all() @@ -73,7 +73,7 @@ def test_read_write_view(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" adata.write_zarr(orig_pth) - remote = read_remote(orig_pth) + remote = read_backed(orig_pth) subset = adata.obs["obs_cat"] == "a" assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X)) assert ( @@ -96,7 +96,7 @@ def test_read_write_view_of_view(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" adata.write_zarr(orig_pth) - remote = read_remote(orig_pth) + remote = read_backed(orig_pth) subset = (adata.obs["obs_cat"] == "a") | (adata.obs["obs_cat"] == "b") subsetted_adata = adata[subset, :] subset_subset = subsetted_adata.obs["obs_cat"] == "b" From 70b4bfac33f149540847ca1eebe495b88f2ef3e7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 8 May 2023 15:24:43 +0200 Subject: [PATCH 079/125] (chore); add nullable bool/int tests --- anndata/experimental/read_backed/__init__.py | 2 +- .../experimental/read_backed/read_backed.py | 8 +- .../tests/test_read_backed_experimental.py | 135 +++++++++++++++++- 3 files changed, 138 insertions(+), 7 deletions(-) diff --git a/anndata/experimental/read_backed/__init__.py b/anndata/experimental/read_backed/__init__.py index e4f1a6c90..52e2e899f 100644 --- a/anndata/experimental/read_backed/__init__.py +++ b/anndata/experimental/read_backed/__init__.py @@ -1 +1 @@ -from .read_backed import read_backed, LazyCategoricalArray +from .read_backed import read_backed, LazyCategoricalArray, LazyMaskedArray diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 6d212a4e9..f186d7005 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -128,7 +128,7 @@ def __init__(self, group, dtype_str, *args, **kwargs): Args: group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr - dtype_str (Nullable): group containing "codes" and "categories" key as well as "ordered" attr + dtype_str (Nullable): one of `nullable-integer` or `nullable-boolean` """ self.values = group["values"] self.mask = group["mask"] if "mask" in group else None @@ -146,9 +146,11 @@ def dtype(self) -> pd.CategoricalDtype: def __getitem__(self, selection) -> pd.Categorical: idx = self._resolve_idx(selection) - values = self.values[idx] + if type(idx) == int: + idx = slice(idx, idx + 1) + values = np.array(self.values[idx]) if self.mask is not None: - mask = self.mask[idx] + mask = np.array(self.mask[idx]) if self._dtype_str == "nullable-integer": return pd.arrays.IntegerArray(values, mask=mask) elif self._dtype_str == "nullable-boolean": diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index d4db47a62..bd86347b9 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -11,7 +11,7 @@ gen_adata, subset_func, ) -from anndata.experimental.read_backed import read_backed, LazyCategoricalArray +from 
anndata.experimental.read_backed import read_backed, LazyCategoricalArray, LazyMaskedArray from anndata.utils import asarray @@ -32,12 +32,46 @@ def sparse_format(request): def categorical_zarr_group(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, mode="w") - z["codes"] = [0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2] - z["categories"] = ["foo", "bar", "jazz"] + z["codes"] =np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) + z["categories"] = np.array(["foo", "bar", "jazz"]) z.attrs["ordered"] = False z = zarr.open(base_path) return z +@pytest.fixture() +def nullable_boolean_zarr_group(tmp_path_factory): + base_path = tmp_path_factory.getbasetemp() + z = zarr.open_group(base_path, mode="w") + z["values"] =np.array([True, False, True, False, False, True, False, False, True, True, False, False, False, True, False, True]) + z["mask"] = np.array([True, True, True, True, True, False, False, True, False, True, True, True, True, False, True, False]) + z = zarr.open(base_path) + return z + +@pytest.fixture() +def nullable_boolean_zarr_group_no_mask(tmp_path_factory): + base_path = tmp_path_factory.getbasetemp() + z = zarr.open_group(base_path, mode="w") + z["values"] = np.array([True, False, True, False, False, True, False, False, True, True, False, False, False, True, False, True]) + z = zarr.open(base_path) + return z + +@pytest.fixture() +def nullable_integer_zarr_group(tmp_path_factory): + base_path = tmp_path_factory.getbasetemp() + z = zarr.open_group(base_path, mode="w") + z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) + z["mask"] = np.array([True, True, True, True, True, False, False, True, False, True, True, True, True, False, True, False]) + z = zarr.open(base_path) + return z + +@pytest.fixture() +def nullable_integer_zarr_group_no_mask(tmp_path_factory): + base_path = tmp_path_factory.getbasetemp() + z = zarr.open_group(base_path, mode="w") + z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) + z = zarr.open(base_path) + return z + def test_read_write_X(tmp_path, mtx_format): base_pth = Path(tmp_path) @@ -155,3 +189,98 @@ def test_lazy_categorical_array_subset_subset(categorical_zarr_group): ordered=False, ).remove_unused_categories() ).all() + + +def test_nullable_boolean_array_properties(nullable_boolean_zarr_group): + arr = LazyMaskedArray(nullable_boolean_zarr_group, 'nullable-boolean') + assert len(arr[0:3]) == 3 + assert type(arr[0:3]) == pd.arrays.BooleanArray + assert len(arr[()]) == len(arr) + assert type(arr[()]) == pd.arrays.BooleanArray + + +def test_nullable_boolean_array_equality(nullable_boolean_zarr_group): + arr = LazyMaskedArray(nullable_boolean_zarr_group, 'nullable-boolean') + assert (arr[0] == pd.NA).all() + assert (arr[3:5] == pd.NA).all() + assert (arr[5:7] == np.array([True, False])).all() + + +def test_nullable_boolean_array_subset_subset(nullable_boolean_zarr_group): + arr = LazyMaskedArray(nullable_boolean_zarr_group, 'nullable-boolean') + subset_susbet = arr[0:10][5:10] + assert len(subset_susbet) == 5 + assert type(subset_susbet) == pd.arrays.BooleanArray + assert ( + subset_susbet[()] + == pd.arrays.BooleanArray( + values=np.array([True, False, False, True, True]), + mask=np.array([False, False, True, False, True]), + ) + ).all() + +def test_nullable_boolean_array_no_mask_equality(nullable_boolean_zarr_group_no_mask): + arr = LazyMaskedArray(nullable_boolean_zarr_group_no_mask, 'nullable-boolean') + assert (arr[0] == True).all() + assert (arr[3:5] == 
False).all() + assert (arr[5:7] == np.array([True, False])).all() + + +def test_nullable_boolean_array_no_mask_subset_subset(nullable_boolean_zarr_group_no_mask): + arr = LazyMaskedArray(nullable_boolean_zarr_group_no_mask, 'nullable-boolean') + subset_susbet = arr[0:10][5:10] + assert len(subset_susbet) == 5 + assert type(subset_susbet) == pd.arrays.BooleanArray + assert ( + subset_susbet[()] + == pd.array(np.array([True, False, False, True, True]),) + ).all() + +def test_nullable_integer_array_properties(nullable_integer_zarr_group): + arr = LazyMaskedArray(nullable_integer_zarr_group, 'nullable-integer') + assert len(arr[0:3]) == 3 + assert type(arr[0:3]) == pd.arrays.IntegerArray + assert len(arr[()]) == len(arr) + assert type(arr[()]) == pd.arrays.IntegerArray + + +def test_nullable_integer_array_equality(nullable_integer_zarr_group): + arr = LazyMaskedArray(nullable_integer_zarr_group, 'nullable-integer') + assert (arr[0] == pd.NA).all() + assert (arr[3:5] == pd.NA).all() + assert (arr[5:7] == np.array([2, 2])).all() + + +def test_nullable_integer_array_subset_subset(nullable_integer_zarr_group): + arr = LazyMaskedArray(nullable_integer_zarr_group, 'nullable-integer') + subset_susbet = arr[0:10][5:10] + assert len(subset_susbet) == 5 + assert type(subset_susbet) == pd.arrays.IntegerArray + assert ( + subset_susbet[()] + == pd.arrays.IntegerArray( + values=np.array([2, 2, 1, 2, 0]), + mask=np.array([False, False, True, False, True]), + ) + ).all() + +def test_nullable_integer_array_no_mask_equality(nullable_integer_zarr_group_no_mask): + arr = LazyMaskedArray(nullable_integer_zarr_group_no_mask, 'nullable-integer') + assert (arr[0] == pd.NA).all() + assert (arr[3:5] == 1).all() + assert (arr[5:7] == np.array([2, 2])).all() + + +def test_nullable_integer_array_no_mask_subset_subset(nullable_integer_zarr_group_no_mask): + arr = LazyMaskedArray(nullable_integer_zarr_group_no_mask, 'nullable-integer') + subset_susbet = arr[0:10][5:10] + assert len(subset_susbet) == 5 + assert type(subset_susbet) == pd.arrays.IntegerArray + assert ( + subset_susbet[()] + == pd.array(np.array([2, 2, 1, 2, 0]),) + ).all() + + + + From 65a0bc2ce0081ff3d4072463109cbdfa170d160c Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 May 2023 11:51:54 +0200 Subject: [PATCH 080/125] (feat): reorganize dirs --- anndata/experimental/read_backed/__init__.py | 3 +- .../experimental/read_backed/lazy_arrays.py | 161 ++++++++++++ .../read_backed/lazy_axis_arrays.py | 54 ++++ .../experimental/read_backed/read_backed.py | 234 ++---------------- 4 files changed, 231 insertions(+), 221 deletions(-) create mode 100644 anndata/experimental/read_backed/lazy_arrays.py create mode 100644 anndata/experimental/read_backed/lazy_axis_arrays.py diff --git a/anndata/experimental/read_backed/__init__.py b/anndata/experimental/read_backed/__init__.py index 52e2e899f..564c711d5 100644 --- a/anndata/experimental/read_backed/__init__.py +++ b/anndata/experimental/read_backed/__init__.py @@ -1 +1,2 @@ -from .read_backed import read_backed, LazyCategoricalArray, LazyMaskedArray +from .read_backed import read_backed +from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray \ No newline at end of file diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py new file mode 100644 index 000000000..6692d467b --- /dev/null +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -0,0 +1,161 @@ +from copy import deepcopy +from typing import Tuple +from anndata._core.index import Index, 
_subset +from anndata._core.views import _resolve_idx, as_view + +import pandas as pd +import numpy as np +from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin + + +class MaskedArrayMixIn(ExplicitlyIndexedNDArrayMixin): + def _resolve_idx(self, new_idx): + return ( + new_idx + if self.subset_idx is None + else _resolve_idx(self.subset_idx, new_idx, self.shape[0]) + ) + + @property + def subset_idx(self): + return self._subset_idx + + @subset_idx.setter + def subset_idx(self, new_idx): + self._subset_idx = self._resolve_idx(new_idx) + + @property + def shape(self) -> Tuple[int, ...]: + if self.subset_idx is None: + return self.values.shape + if isinstance(self.subset_idx, slice): + if self.subset_idx == slice(None, None, None): + return self.values.shape + return (self.subset_idx.stop - self.subset_idx.start,) + else: + return (len(self.subset_idx),) + + def __eq__(self, __o) -> np.ndarray: + return self[()] == __o + + def __ne__(self, __o) -> np.ndarray: + return ~(self == __o) + + +class LazyCategoricalArray(MaskedArrayMixIn): + __slots__ = ("values", "attrs", "_categories", "_categories_cache", "_subset_idx") + + def __init__(self, group, *args, **kwargs): + """Class for lazily reading categorical data from formatted zarr group + + Args: + group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr + """ + self.values = group["codes"] + self._categories = group["categories"] + self._categories_cache = None + self._subset_idx = None + self.attrs = dict(group.attrs) + + @property + def categories(self): # __slots__ and cached_property are incompatible + if self._categories_cache is None: + self._categories_cache = self._categories[...] + return self._categories_cache + + @property + def dtype(self) -> pd.CategoricalDtype: + return pd.CategoricalDtype(self.categories, self.ordered) + + @property + def ordered(self): + return bool(self.attrs["ordered"]) + + def __getitem__(self, selection) -> pd.Categorical: + idx = self._resolve_idx(selection) + codes = self.values.oindex[idx] + if codes.shape == (): # handle 0d case + codes = np.array([codes]) + return pd.Categorical.from_codes( + codes=codes, + categories=self.categories, + ordered=self.ordered, + ).remove_unused_categories() + + def __repr__(self) -> str: + return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" + + +class LazyMaskedArray(MaskedArrayMixIn): + __slots__ = ("mask", "values", "_subset_idx", "_dtype_str") + + def __init__(self, group, dtype_str, *args, **kwargs): + """Class for lazily reading categorical data from formatted zarr group + + Args: + group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr + dtype_str (Nullable): one of `nullable-integer` or `nullable-boolean` + """ + self.values = group["values"] + self.mask = group["mask"] if "mask" in group else None + self._subset_idx = None + self._dtype_str = dtype_str + + @property + def dtype(self) -> pd.CategoricalDtype: + if self.mask is not None: + if self._dtype_str == "nullable-integer": + return pd.arrays.IntegerArray + elif self._dtype_str == "nullable-boolean": + return pd.arrays.BooleanArray + return pd.array + + def __getitem__(self, selection) -> pd.Categorical: + idx = self._resolve_idx(selection) + if type(idx) == int: + idx = slice(idx, idx + 1) + values = np.array(self.values[idx]) + if self.mask is not None: + mask = np.array(self.mask[idx]) + if self._dtype_str == "nullable-integer": + return pd.arrays.IntegerArray(values, mask=mask) + 
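+            # pd.arrays.IntegerArray pairs the raw values with a boolean mask
+            # in which True marks a missing entry, e.g.
+            # pd.arrays.IntegerArray(np.array([1, 2]), mask=np.array([False, True]))
+            # materializes as [1, <NA>]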
elif self._dtype_str == "nullable-boolean": + return pd.arrays.BooleanArray(values, mask=mask) + return pd.array(values) + + def __repr__(self) -> str: + if self._dtype_str == "nullable-integer": + return "LazyNullableIntegerArray" + elif self._dtype_str == "nullable-boolean": + return "LazyNullableBooleanArray" + +@_subset.register(MaskedArrayMixIn) +def _subset_masked(a: MaskedArrayMixIn, subset_idx: Index): + a_copy = deepcopy(a) + a_copy.subset_idx = subset_idx[0] # this is a tuple? + return a_copy + + +@as_view.register(MaskedArrayMixIn) +def _view_masked(a: MaskedArrayMixIn, view_args): + return a + + +@as_view.register(pd.Categorical) +def _view_pd_categorical(a: pd.Categorical, view_args): + return a + + +@as_view.register(pd.api.extensions.ExtensionArray) +def _view_pd_array(a: pd.api.extensions.ExtensionArray, view_args): + return a + + +@as_view.register(pd.arrays.IntegerArray) +def _view_pd_integer_array(a: pd.arrays.IntegerArray, view_args): + return a + + +@as_view.register(pd.arrays.BooleanArray) +def _view_pd_boolean_array(a: pd.arrays.BooleanArray, view_args): + return a \ No newline at end of file diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py new file mode 100644 index 000000000..3999df9bd --- /dev/null +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -0,0 +1,54 @@ +from anndata._core.aligned_mapping import AxisArraysView + +import pandas as pd +import numpy as np + +from ..._core import AxisArrays + + +class AxisArraysRemote(AxisArrays): + def __getattr__(self, __name: str): + # If we a method has been accessed that is not here, try the pandas implementation + if hasattr(pd.DataFrame, __name): + return self.to_df().__getattribute__(__name) + return object.__getattribute__(self, __name) + + @property + def iloc(self): + class IlocDispatch: + def __getitem__(self_iloc, idx): + return self._view(self.parent, (idx,)) + + return IlocDispatch() + + @property + def dim_names(self) -> pd.Index: + return (self.parent.obs_names, self.parent.var_names)[self._axis].compute() + + +def to_df_1d_axis_arrays(axis_arrays, idx=None): + """Convert to pandas dataframe.""" + df = pd.DataFrame(index=axis_arrays.dim_names[() if idx is None else idx]) + for key in axis_arrays.keys(): + if "index" not in key: + df[key] = axis_arrays[key][() if idx is None else idx] + return df + + +class AxisArrays1dRemote(AxisArraysRemote): + def to_df(self) -> pd.DataFrame: + return to_df_1d_axis_arrays(self) + + +class AxisArraysRemoteView(AxisArraysView): + def to_df(self) -> pd.DataFrame: + return to_df_1d_axis_arrays(self) + + def __getattr__(self, __name: str): + # If we a method has been accessed that is not here, try the pandas implementation + if hasattr(pd.DataFrame, __name): + return self.to_df().__getattribute__(__name) + return object.__getattribute__(self, __name) + + +AxisArrays1dRemote._view_class = AxisArraysRemoteView \ No newline at end of file diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index f186d7005..bf5eb35c3 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -1,244 +1,35 @@ from collections import OrderedDict, abc as cabc -from copy import copy, deepcopy -from enum import Enum -from functools import cached_property from pathlib import Path from typing import ( - Any, - Iterable, - Mapping, MutableMapping, Optional, Union, - List, Sequence, Tuple, ) -from 
anndata._core.access import ElementRef from anndata._core.aligned_mapping import ( - AlignedMapping, Layers, PairwiseArrays, - AxisArraysView, ) -from anndata._core.anndata import StorageType, _check_2d_shape, _gen_dataframe +from anndata._core.anndata import StorageType, _check_2d_shape from anndata._core.anndata_base import AbstractAnnData -from anndata._core.file_backing import AnnDataFileManager -from anndata._core.index import Index, _normalize_indices, _subset +from anndata._core.index import Index, _normalize_indices, _subset, get_vector from anndata._core.raw import Raw from anndata._core.sparse_dataset import sparse_dataset -from anndata._core.views import _resolve_idx, as_view, _resolve_idxs -from anndata._io.specs.registry import read_elem -from anndata.compat import _move_adj_mtx, _read_attr +from anndata._core.views import _resolve_idxs from anndata.utils import convert_to_dict import zarr import pandas as pd -import numpy as np -from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin import dask.array as da - -from ..._core import AnnData, AxisArrays +from xarray import DataArray +from ..._core import AnnData from .. import read_dispatched +from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray +from .lazy_axis_arrays import AxisArrays1dRemote, AxisArraysRemote -class MaskedArrayMixIn(ExplicitlyIndexedNDArrayMixin): - def _resolve_idx(self, new_idx): - return ( - new_idx - if self.subset_idx is None - else _resolve_idx(self.subset_idx, new_idx, self.shape[0]) - ) - - @property - def subset_idx(self): - return self._subset_idx - - @subset_idx.setter - def subset_idx(self, new_idx): - self._subset_idx = self._resolve_idx(new_idx) - - @property - def shape(self) -> Tuple[int, ...]: - if self.subset_idx is None: - return self.values.shape - if isinstance(self.subset_idx, slice): - if self.subset_idx == slice(None, None, None): - return self.values.shape - return (self.subset_idx.stop - self.subset_idx.start,) - else: - return (len(self.subset_idx),) - - def __eq__(self, __o) -> np.ndarray: - return self[()] == __o - - def __ne__(self, __o) -> np.ndarray: - return ~(self == __o) - - -class LazyCategoricalArray(MaskedArrayMixIn): - __slots__ = ("values", "attrs", "_categories", "_categories_cache", "_subset_idx") - - def __init__(self, group, *args, **kwargs): - """Class for lazily reading categorical data from formatted zarr group - - Args: - group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr - """ - self.values = group["codes"] - self._categories = group["categories"] - self._categories_cache = None - self._subset_idx = None - self.attrs = dict(group.attrs) - - @property - def categories(self): # __slots__ and cached_property are incompatible - if self._categories_cache is None: - self._categories_cache = self._categories[...] 
- return self._categories_cache - - @property - def dtype(self) -> pd.CategoricalDtype: - return pd.CategoricalDtype(self.categories, self.ordered) - - @property - def ordered(self): - return bool(self.attrs["ordered"]) - - def __getitem__(self, selection) -> pd.Categorical: - idx = self._resolve_idx(selection) - codes = self.values.oindex[idx] - if codes.shape == (): # handle 0d case - codes = np.array([codes]) - return pd.Categorical.from_codes( - codes=codes, - categories=self.categories, - ordered=self.ordered, - ).remove_unused_categories() - - def __repr__(self) -> str: - return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" - - -class LazyMaskedArray(MaskedArrayMixIn): - __slots__ = ("mask", "values", "_subset_idx", "_dtype_str") - - def __init__(self, group, dtype_str, *args, **kwargs): - """Class for lazily reading categorical data from formatted zarr group - - Args: - group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr - dtype_str (Nullable): one of `nullable-integer` or `nullable-boolean` - """ - self.values = group["values"] - self.mask = group["mask"] if "mask" in group else None - self._subset_idx = None - self._dtype_str = dtype_str - - @property - def dtype(self) -> pd.CategoricalDtype: - if self.mask is not None: - if self._dtype_str == "nullable-integer": - return pd.arrays.IntegerArray - elif self._dtype_str == "nullable-boolean": - return pd.arrays.BooleanArray - return pd.array - - def __getitem__(self, selection) -> pd.Categorical: - idx = self._resolve_idx(selection) - if type(idx) == int: - idx = slice(idx, idx + 1) - values = np.array(self.values[idx]) - if self.mask is not None: - mask = np.array(self.mask[idx]) - if self._dtype_str == "nullable-integer": - return pd.arrays.IntegerArray(values, mask=mask) - elif self._dtype_str == "nullable-boolean": - return pd.arrays.BooleanArray(values, mask=mask) - return pd.array(values) - - def __repr__(self) -> str: - if self._dtype_str == "nullable-integer": - return "LazyNullableIntegerArray" - elif self._dtype_str == "nullable-boolean": - return "LazyNullableBooleanArray" - - -@_subset.register(MaskedArrayMixIn) -def _subset_masked(a: MaskedArrayMixIn, subset_idx: Index): - a_copy = deepcopy(a) - a_copy.subset_idx = subset_idx[0] # this is a tuple? 
- return a_copy - - -@as_view.register(MaskedArrayMixIn) -def _view_masked(a: MaskedArrayMixIn, view_args): - return a - - -@as_view.register(pd.Categorical) -def _view_pd_categorical(a: pd.Categorical, view_args): - return a - - -@as_view.register(pd.api.extensions.ExtensionArray) -def _view_pd_array(a: pd.api.extensions.ExtensionArray, view_args): - return a - - -@as_view.register(pd.arrays.IntegerArray) -def _view_pd_integer_array(a: pd.arrays.IntegerArray, view_args): - return a - - -@as_view.register(pd.arrays.BooleanArray) -def _view_pd_boolean_array(a: pd.arrays.BooleanArray, view_args): - return a - - -class AxisArraysRemote(AxisArrays): - def __getattr__(self, __name: str): - # If we a method has been accessed that is not here, try the pandas implementation - if hasattr(pd.DataFrame, __name): - return self.to_df().__getattribute__(__name) - return object.__getattribute__(self, __name) - - @property - def iloc(self): - class IlocDispatch: - def __getitem__(self_iloc, idx): - return self._view(self.parent, (idx,)) - - return IlocDispatch() - - @property - def dim_names(self) -> pd.Index: - return (self.parent.obs_names, self.parent.var_names)[self._axis].compute() - - -def to_df_1d_axis_arrays(axis_arrays): - """Convert to pandas dataframe.""" - df = pd.DataFrame(index=axis_arrays.dim_names) - for key in axis_arrays.keys(): - if "index" not in key: - df[key] = axis_arrays[key][()] - return df - - -class AxisArrays1dRemote(AxisArraysRemote): - def to_df(self) -> pd.DataFrame: - return to_df_1d_axis_arrays(self) - - -class AxisArraysRemoteView(AxisArraysView): - def to_df(self) -> pd.DataFrame: - return to_df_1d_axis_arrays(self) - - -AxisArrays1dRemote._view_class = AxisArraysRemoteView - - -class AnnDataRemote(AbstractAnnData): +class AnnDataBacked(AbstractAnnData): def __init__( self, X=None, @@ -353,6 +144,9 @@ def __init__( self._run_checks() + def _sanitize(self): + pass + def _run_checks(self): assert len(self.obs_names) == self.shape[0] assert len(self.var_names) == self.shape[1] @@ -363,7 +157,7 @@ def _run_checks(self): def __getitem__(self, index: Index) -> "AnnData": """Returns a sliced view of the object.""" oidx, vidx = self._normalize_indices(index) - return AnnDataRemote(self, oidx=oidx, vidx=vidx) + return AnnDataBacked(self, oidx=oidx, vidx=vidx) def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]: return _normalize_indices(index, self.obs_names, self.var_names) @@ -502,7 +296,7 @@ def n_obs(self) -> int: return len(self.obs_names) def __repr__(self): - descr = f"AnnData object with n_obs × n_vars = {self.n_obs} × {self.n_vars}" + descr = f"AnnDataBacked object with n_obs × n_vars = {self.n_obs} × {self.n_vars}" for attr in [ "obs", "var", @@ -538,7 +332,7 @@ def callback(func, elem_name: str, elem, iospec): if is_consolidated else [(k, elem[k]) for k in cols if k in elem] ) - return AnnDataRemote( + return AnnDataBacked( **{k: read_dispatched(v, callback) for k, v in iter_object}, file=elem ) elif elem_name.startswith("/raw"): From 1c03335ddf9e472dd449ce92ac548f61c2e00004 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 May 2023 13:46:24 +0200 Subject: [PATCH 081/125] (chore): add access tracking test --- .../tests/test_read_backed_experimental.py | 61 +++++++++++++++---- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index bd86347b9..14ad24695 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ 
b/anndata/tests/test_read_backed_experimental.py @@ -5,15 +5,38 @@ import pandas as pd from scipy import sparse import zarr +from anndata._core.anndata import AnnData from anndata.tests.helpers import ( as_dense_dask_array, gen_adata, - subset_func, + gen_typed_df, ) from anndata.experimental.read_backed import read_backed, LazyCategoricalArray, LazyMaskedArray from anndata.utils import asarray +from zarr import DirectoryStore +import traceback +class AccessTrackingStore(DirectoryStore): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._access_count = {} + + def __getitem__(self, key): + print(key) + for tracked in self._access_count: + if tracked in key: + self._access_count[tracked] += 1 + return super().__getitem__(key) + + def get_access_count(self, key): + return self._access_count[key] + + def set_key_trackers(self, keys_to_track): + for k in keys_to_track: + self._access_count[k] = 0 + @pytest.fixture( params=[sparse.csr_matrix, sparse.csc_matrix, np.array, as_dense_dask_array], @@ -73,21 +96,33 @@ def nullable_integer_zarr_group_no_mask(tmp_path_factory): return z -def test_read_write_X(tmp_path, mtx_format): +def test_access_count_tracked(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" - # remote_pth = base_pth / "backed.zarr" - - orig = gen_adata((1000, 1000), mtx_format) + M = 1000000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access + N = 5 + obs_names = pd.Index(f"cell{i}" for i in range(M)) + var_names = pd.Index(f"gene{i}" for i in range(N)) + obs = gen_typed_df(M, obs_names) + var = gen_typed_df(N, var_names) + orig = AnnData(obs=obs, var=var, X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32))) orig.write_zarr(orig_pth) - - remote = read_backed(orig_pth) - # remote.write_zarr(remote_pth) # need to implement writing! 
- - assert np.all(asarray(orig.X) == asarray(remote.X)) - assert (orig.obs == remote.obs.to_df()[orig.obs.columns]).all().all() - assert (orig.var == remote.var.to_df()[orig.var.columns]).all().all() - assert (orig.obsm["array"] == remote.obsm["array"].compute()).all() + store = AccessTrackingStore(orig_pth) + store.set_key_trackers(["obs/int64", "var/int64"]) + remote = read_backed(store) + # a series of methods that should __not__ read in any data + remote.X + remote.shape + remote.var + remote.obs + remote.obs['int64'] + remote.var['int64'] + assert store.get_access_count("obs/int64") == 0 + assert store.get_access_count("var/int64") == 0 + remote[0:10, :].obs['int64'][0:10].compute() + assert store.get_access_count("obs/int64") == 1 # one for 0, .zmetadata handles .zarray + assert store.get_access_count("var/int64") == 0 # never accessed + def test_read_write_full(tmp_path, mtx_format): From dc107ad4604b963f66d1c8df4ce74c0bf8d410d2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 May 2023 13:57:58 +0200 Subject: [PATCH 082/125] (feat): add more array tests --- .../tests/test_read_backed_experimental.py | 28 +++++++++++++++++-- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 14ad24695..6b97c9b76 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -96,7 +96,7 @@ def nullable_integer_zarr_group_no_mask(tmp_path_factory): return z -def test_access_count_tracked(tmp_path, mtx_format): +def test_access_count_obs_var(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" M = 1000000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access @@ -108,7 +108,7 @@ def test_access_count_tracked(tmp_path, mtx_format): orig = AnnData(obs=obs, var=var, X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32))) orig.write_zarr(orig_pth) store = AccessTrackingStore(orig_pth) - store.set_key_trackers(["obs/int64", "var/int64"]) + store.set_key_trackers(["obs/int64", "var/int64", "obs/cat/codes"]) remote = read_backed(store) # a series of methods that should __not__ read in any data remote.X @@ -117,12 +117,34 @@ def test_access_count_tracked(tmp_path, mtx_format): remote.obs remote.obs['int64'] remote.var['int64'] + # only the `cat` should be read in + subset = remote[remote.obs['cat'] == 'a', :] + subset.obs['int64'] assert store.get_access_count("obs/int64") == 0 assert store.get_access_count("var/int64") == 0 + assert store.get_access_count("obs/cat/codes") == 4 # entire thing needs to be read in for subset remote[0:10, :].obs['int64'][0:10].compute() assert store.get_access_count("obs/int64") == 1 # one for 0, .zmetadata handles .zarray assert store.get_access_count("var/int64") == 0 # never accessed - + +def test_access_count_obsp_varp_layers(tmp_path): + base_pth = Path(tmp_path) + orig_pth = base_pth / "orig.zarr" + M = 1000 + N = 1000 + orig = gen_adata((M, N), mtx_format) + orig.write_zarr(orig_pth) + store = AccessTrackingStore(orig_pth) + store.set_key_trackers(["obsp", "varp", "layers"]) + remote = read_backed(store) + # these operations should not read in any data + subset = remote[0:10, 500:600] + subset.obsp['array'] + subset.varp['array'] + subset.layers['array'] + assert store.get_access_count("obsp") == 0 + assert store.get_access_count("varp") == 0 + assert store.get_access_count("layers") == 0 def 
test_read_write_full(tmp_path, mtx_format): From 7aae638439c0c5d97f3a383abc2a22a51dfd6ead Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 May 2023 14:38:05 +0200 Subject: [PATCH 083/125] (fix): repr in jupyter setting + refactor --- anndata/_core/aligned_mapping.py | 2 +- .../read_backed/lazy_axis_arrays.py | 46 +++++++++++-------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 8e0777b01..b5271d2c3 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -43,7 +43,7 @@ class AlignedMapping(cabc.MutableMapping, ABC): """The actual class (which has it’s own data) for this aligned mapping.""" def __repr__(self): - return f"{type(self).__name__} with keys: {', '.join(self.keys())}" + return f"{type(self).__name__} with keys: {', '.join([k + f'[{str(self[k].dtype)}]' for k in self.keys()])}" def _ipython_key_completions_(self) -> List[str]: return list(self.keys()) diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index 3999df9bd..52e036aff 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -1,4 +1,6 @@ -from anndata._core.aligned_mapping import AxisArraysView +from typing import Mapping, Union +from anndata._core import anndata, raw +from anndata._core.aligned_mapping import AxisArraysBase, AxisArraysView import pandas as pd import numpy as np @@ -7,25 +9,13 @@ class AxisArraysRemote(AxisArrays): - def __getattr__(self, __name: str): - # If we a method has been accessed that is not here, try the pandas implementation - if hasattr(pd.DataFrame, __name): - return self.to_df().__getattribute__(__name) - return object.__getattribute__(self, __name) - - @property - def iloc(self): - class IlocDispatch: - def __getitem__(self_iloc, idx): - return self._view(self.parent, (idx,)) - - return IlocDispatch() @property def dim_names(self) -> pd.Index: return (self.parent.obs_names, self.parent.var_names)[self._axis].compute() + def to_df_1d_axis_arrays(axis_arrays, idx=None): """Convert to pandas dataframe.""" df = pd.DataFrame(index=axis_arrays.dim_names[() if idx is None else idx]) @@ -34,21 +24,39 @@ def to_df_1d_axis_arrays(axis_arrays, idx=None): df[key] = axis_arrays[key][() if idx is None else idx] return df +class AxisArraysRemote1dMixin(): -class AxisArrays1dRemote(AxisArraysRemote): def to_df(self) -> pd.DataFrame: return to_df_1d_axis_arrays(self) + @property + def iloc(self): + class IlocDispatch: + def __getitem__(self_iloc, idx): + return self._view(self.parent, (idx,)) -class AxisArraysRemoteView(AxisArraysView): - def to_df(self) -> pd.DataFrame: - return to_df_1d_axis_arrays(self) + return IlocDispatch() def __getattr__(self, __name: str): # If we a method has been accessed that is not here, try the pandas implementation if hasattr(pd.DataFrame, __name): return self.to_df().__getattribute__(__name) return object.__getattribute__(self, __name) + + def _repr_html_(self): + return self.__repr__() + + def _repr_latex_(self): + return self.__repr__() + + +class AxisArrays1dRemote(AxisArraysRemote1dMixin, AxisArraysRemote): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + +class AxisArrays1dRemoteView(AxisArraysRemote1dMixin, AxisArraysView): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) -AxisArrays1dRemote._view_class = AxisArraysRemoteView \ No newline at end of 
file +AxisArrays1dRemote._view_class = AxisArrays1dRemoteView \ No newline at end of file From dfa6f52d574cee1b0bd3e211cb3179bc252ff900 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 May 2023 14:45:04 +0200 Subject: [PATCH 084/125] (fix): fix layers support --- anndata/experimental/read_backed/read_backed.py | 6 +----- anndata/tests/test_read_backed_experimental.py | 7 ++----- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index bf5eb35c3..a4535975b 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -123,8 +123,7 @@ def __init__( self.obsp = PairwiseArrays(adata_ref, 0, vals=convert_to_dict(obsp)) self.varp = PairwiseArrays(adata_ref, 1, vals=convert_to_dict(varp)) - - self.layers = Layers(layers) + self.layers = Layers(adata_ref, layers) if self.is_view: self.obs = self.obs._view(self, (oidx,)) self.var = self.var._view(self, (vidx,)) @@ -150,9 +149,6 @@ def _sanitize(self): def _run_checks(self): assert len(self.obs_names) == self.shape[0] assert len(self.var_names) == self.shape[1] - for layer in self.layers: - assert len(self.obs_names) == layer.shape[0] - assert len(self.var_names) == layer.shape[1] def __getitem__(self, index: Index) -> "AnnData": """Returns a sliced view of the object.""" diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 6b97c9b76..ce0abc7e3 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -24,7 +24,6 @@ def __init__(self, *args, **kwargs): self._access_count = {} def __getitem__(self, key): - print(key) for tracked in self._access_count: if tracked in key: self._access_count[tracked] += 1 @@ -127,7 +126,7 @@ def test_access_count_obs_var(tmp_path, mtx_format): assert store.get_access_count("obs/int64") == 1 # one for 0, .zmetadata handles .zarray assert store.get_access_count("var/int64") == 0 # never accessed -def test_access_count_obsp_varp_layers(tmp_path): +def test_access_count_obsp_varp(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" M = 1000 @@ -135,16 +134,14 @@ def test_access_count_obsp_varp_layers(tmp_path): orig = gen_adata((M, N), mtx_format) orig.write_zarr(orig_pth) store = AccessTrackingStore(orig_pth) - store.set_key_trackers(["obsp", "varp", "layers"]) + store.set_key_trackers(["obsp", "varp"]) remote = read_backed(store) # these operations should not read in any data subset = remote[0:10, 500:600] subset.obsp['array'] subset.varp['array'] - subset.layers['array'] assert store.get_access_count("obsp") == 0 assert store.get_access_count("varp") == 0 - assert store.get_access_count("layers") == 0 def test_read_write_full(tmp_path, mtx_format): From d856f9b9da34975d2f60eea852051fc14b6c35db Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 9 May 2023 15:28:56 +0200 Subject: [PATCH 085/125] (fix): add columns attr + cleanup --- anndata/experimental/read_backed/lazy_axis_arrays.py | 6 +++++- anndata/experimental/read_backed/read_backed.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index 52e036aff..6fc5a5c7a 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -1,4 +1,4 @@ -from typing import 
Mapping, Union +from typing import Mapping, Union, List from anndata._core import anndata, raw from anndata._core.aligned_mapping import AxisArraysBase, AxisArraysView @@ -13,6 +13,10 @@ class AxisArraysRemote(AxisArrays): @property def dim_names(self) -> pd.Index: return (self.parent.obs_names, self.parent.var_names)[self._axis].compute() + + @property + def columns(self) -> List: + return list(self.keys()) diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index a4535975b..322b12677 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -7,13 +7,14 @@ Sequence, Tuple, ) + from anndata._core.aligned_mapping import ( Layers, PairwiseArrays, ) from anndata._core.anndata import StorageType, _check_2d_shape from anndata._core.anndata_base import AbstractAnnData -from anndata._core.index import Index, _normalize_indices, _subset, get_vector +from anndata._core.index import Index, _normalize_indices, _subset from anndata._core.raw import Raw from anndata._core.sparse_dataset import sparse_dataset from anndata._core.views import _resolve_idxs @@ -22,7 +23,6 @@ import zarr import pandas as pd import dask.array as da -from xarray import DataArray from ..._core import AnnData from .. import read_dispatched from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray From 57b87a2727a0820e4f29464cd441abddbdfb5be7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 9 May 2023 13:41:13 +0000 Subject: [PATCH 086/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/experimental/read_backed/__init__.py | 2 +- .../experimental/read_backed/lazy_arrays.py | 3 +- .../read_backed/lazy_axis_arrays.py | 15 +- .../experimental/read_backed/read_backed.py | 4 +- .../tests/test_read_backed_experimental.py | 179 ++++++++++++++---- 5 files changed, 153 insertions(+), 50 deletions(-) diff --git a/anndata/experimental/read_backed/__init__.py b/anndata/experimental/read_backed/__init__.py index 564c711d5..563c67aa9 100644 --- a/anndata/experimental/read_backed/__init__.py +++ b/anndata/experimental/read_backed/__init__.py @@ -1,2 +1,2 @@ from .read_backed import read_backed -from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray \ No newline at end of file +from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 6692d467b..88124c022 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -129,6 +129,7 @@ def __repr__(self) -> str: elif self._dtype_str == "nullable-boolean": return "LazyNullableBooleanArray" + @_subset.register(MaskedArrayMixIn) def _subset_masked(a: MaskedArrayMixIn, subset_idx: Index): a_copy = deepcopy(a) @@ -158,4 +159,4 @@ def _view_pd_integer_array(a: pd.arrays.IntegerArray, view_args): @as_view.register(pd.arrays.BooleanArray) def _view_pd_boolean_array(a: pd.arrays.BooleanArray, view_args): - return a \ No newline at end of file + return a diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index 6fc5a5c7a..9494b523e 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -9,17 +9,15 @@ class 
AxisArraysRemote(AxisArrays): - @property def dim_names(self) -> pd.Index: return (self.parent.obs_names, self.parent.var_names)[self._axis].compute() - + @property def columns(self) -> List: return list(self.keys()) - def to_df_1d_axis_arrays(axis_arrays, idx=None): """Convert to pandas dataframe.""" df = pd.DataFrame(index=axis_arrays.dim_names[() if idx is None else idx]) @@ -28,8 +26,8 @@ def to_df_1d_axis_arrays(axis_arrays, idx=None): df[key] = axis_arrays[key][() if idx is None else idx] return df -class AxisArraysRemote1dMixin(): +class AxisArraysRemote1dMixin: def to_df(self) -> pd.DataFrame: return to_df_1d_axis_arrays(self) @@ -40,16 +38,16 @@ def __getitem__(self_iloc, idx): return self._view(self.parent, (idx,)) return IlocDispatch() - + def __getattr__(self, __name: str): # If we a method has been accessed that is not here, try the pandas implementation if hasattr(pd.DataFrame, __name): return self.to_df().__getattribute__(__name) return object.__getattribute__(self, __name) - + def _repr_html_(self): return self.__repr__() - + def _repr_latex_(self): return self.__repr__() @@ -63,4 +61,5 @@ class AxisArrays1dRemoteView(AxisArraysRemote1dMixin, AxisArraysView): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) -AxisArrays1dRemote._view_class = AxisArrays1dRemoteView \ No newline at end of file + +AxisArrays1dRemote._view_class = AxisArrays1dRemoteView diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 322b12677..a78dfe989 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -292,7 +292,9 @@ def n_obs(self) -> int: return len(self.obs_names) def __repr__(self): - descr = f"AnnDataBacked object with n_obs × n_vars = {self.n_obs} × {self.n_vars}" + descr = ( + f"AnnDataBacked object with n_obs × n_vars = {self.n_obs} × {self.n_vars}" + ) for attr in [ "obs", "var", diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index ce0abc7e3..8ad8914d9 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -12,13 +12,18 @@ gen_adata, gen_typed_df, ) -from anndata.experimental.read_backed import read_backed, LazyCategoricalArray, LazyMaskedArray +from anndata.experimental.read_backed import ( + read_backed, + LazyCategoricalArray, + LazyMaskedArray, +) from anndata.utils import asarray from zarr import DirectoryStore import traceback -class AccessTrackingStore(DirectoryStore): + +class AccessTrackingStore(DirectoryStore): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._access_count = {} @@ -54,38 +59,118 @@ def sparse_format(request): def categorical_zarr_group(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, mode="w") - z["codes"] =np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) + z["codes"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) z["categories"] = np.array(["foo", "bar", "jazz"]) z.attrs["ordered"] = False z = zarr.open(base_path) return z + @pytest.fixture() def nullable_boolean_zarr_group(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, mode="w") - z["values"] =np.array([True, False, True, False, False, True, False, False, True, True, False, False, False, True, False, True]) - z["mask"] = np.array([True, True, True, True, True, False, False, True, False, True, True, True, 
True, False, True, False]) + z["values"] = np.array( + [ + True, + False, + True, + False, + False, + True, + False, + False, + True, + True, + False, + False, + False, + True, + False, + True, + ] + ) + z["mask"] = np.array( + [ + True, + True, + True, + True, + True, + False, + False, + True, + False, + True, + True, + True, + True, + False, + True, + False, + ] + ) z = zarr.open(base_path) return z + @pytest.fixture() def nullable_boolean_zarr_group_no_mask(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, mode="w") - z["values"] = np.array([True, False, True, False, False, True, False, False, True, True, False, False, False, True, False, True]) + z["values"] = np.array( + [ + True, + False, + True, + False, + False, + True, + False, + False, + True, + True, + False, + False, + False, + True, + False, + True, + ] + ) z = zarr.open(base_path) return z + @pytest.fixture() def nullable_integer_zarr_group(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, mode="w") z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) - z["mask"] = np.array([True, True, True, True, True, False, False, True, False, True, True, True, True, False, True, False]) + z["mask"] = np.array( + [ + True, + True, + True, + True, + True, + False, + False, + True, + False, + True, + True, + True, + True, + False, + True, + False, + ] + ) z = zarr.open(base_path) return z + @pytest.fixture() def nullable_integer_zarr_group_no_mask(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() @@ -98,13 +183,17 @@ def nullable_integer_zarr_group_no_mask(tmp_path_factory): def test_access_count_obs_var(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" - M = 1000000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access + M = 1000000 # forces zarr to chunk `obs` columns multiple ways - that way 1 access to `int64` below is actually only one access N = 5 obs_names = pd.Index(f"cell{i}" for i in range(M)) var_names = pd.Index(f"gene{i}" for i in range(N)) obs = gen_typed_df(M, obs_names) var = gen_typed_df(N, var_names) - orig = AnnData(obs=obs, var=var, X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32))) + orig = AnnData( + obs=obs, + var=var, + X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)), + ) orig.write_zarr(orig_pth) store = AccessTrackingStore(orig_pth) store.set_key_trackers(["obs/int64", "var/int64", "obs/cat/codes"]) @@ -114,17 +203,22 @@ def test_access_count_obs_var(tmp_path, mtx_format): remote.shape remote.var remote.obs - remote.obs['int64'] - remote.var['int64'] + remote.obs["int64"] + remote.var["int64"] # only the `cat` should be read in - subset = remote[remote.obs['cat'] == 'a', :] - subset.obs['int64'] + subset = remote[remote.obs["cat"] == "a", :] + subset.obs["int64"] assert store.get_access_count("obs/int64") == 0 assert store.get_access_count("var/int64") == 0 - assert store.get_access_count("obs/cat/codes") == 4 # entire thing needs to be read in for subset - remote[0:10, :].obs['int64'][0:10].compute() - assert store.get_access_count("obs/int64") == 1 # one for 0, .zmetadata handles .zarray - assert store.get_access_count("var/int64") == 0 # never accessed + assert ( + store.get_access_count("obs/cat/codes") == 4 + ) # entire thing needs to be read in for subset + remote[0:10, :].obs["int64"][0:10].compute() + assert ( + store.get_access_count("obs/int64") == 1 
+ ) # one for 0, .zmetadata handles .zarray + assert store.get_access_count("var/int64") == 0 # never accessed + def test_access_count_obsp_varp(tmp_path, mtx_format): base_pth = Path(tmp_path) @@ -138,8 +232,8 @@ def test_access_count_obsp_varp(tmp_path, mtx_format): remote = read_backed(store) # these operations should not read in any data subset = remote[0:10, 500:600] - subset.obsp['array'] - subset.varp['array'] + subset.obsp["array"] + subset.varp["array"] assert store.get_access_count("obsp") == 0 assert store.get_access_count("varp") == 0 @@ -246,7 +340,7 @@ def test_lazy_categorical_array_subset_subset(categorical_zarr_group): def test_nullable_boolean_array_properties(nullable_boolean_zarr_group): - arr = LazyMaskedArray(nullable_boolean_zarr_group, 'nullable-boolean') + arr = LazyMaskedArray(nullable_boolean_zarr_group, "nullable-boolean") assert len(arr[0:3]) == 3 assert type(arr[0:3]) == pd.arrays.BooleanArray assert len(arr[()]) == len(arr) @@ -254,14 +348,14 @@ def test_nullable_boolean_array_properties(nullable_boolean_zarr_group): def test_nullable_boolean_array_equality(nullable_boolean_zarr_group): - arr = LazyMaskedArray(nullable_boolean_zarr_group, 'nullable-boolean') + arr = LazyMaskedArray(nullable_boolean_zarr_group, "nullable-boolean") assert (arr[0] == pd.NA).all() assert (arr[3:5] == pd.NA).all() assert (arr[5:7] == np.array([True, False])).all() def test_nullable_boolean_array_subset_subset(nullable_boolean_zarr_group): - arr = LazyMaskedArray(nullable_boolean_zarr_group, 'nullable-boolean') + arr = LazyMaskedArray(nullable_boolean_zarr_group, "nullable-boolean") subset_susbet = arr[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.BooleanArray @@ -273,25 +367,31 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_zarr_group): ) ).all() + def test_nullable_boolean_array_no_mask_equality(nullable_boolean_zarr_group_no_mask): - arr = LazyMaskedArray(nullable_boolean_zarr_group_no_mask, 'nullable-boolean') - assert (arr[0] == True).all() - assert (arr[3:5] == False).all() + arr = LazyMaskedArray(nullable_boolean_zarr_group_no_mask, "nullable-boolean") + assert (arr[0] is True).all() + assert (arr[3:5] is False).all() assert (arr[5:7] == np.array([True, False])).all() -def test_nullable_boolean_array_no_mask_subset_subset(nullable_boolean_zarr_group_no_mask): - arr = LazyMaskedArray(nullable_boolean_zarr_group_no_mask, 'nullable-boolean') +def test_nullable_boolean_array_no_mask_subset_subset( + nullable_boolean_zarr_group_no_mask, +): + arr = LazyMaskedArray(nullable_boolean_zarr_group_no_mask, "nullable-boolean") subset_susbet = arr[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.BooleanArray assert ( subset_susbet[()] - == pd.array(np.array([True, False, False, True, True]),) + == pd.array( + np.array([True, False, False, True, True]), + ) ).all() + def test_nullable_integer_array_properties(nullable_integer_zarr_group): - arr = LazyMaskedArray(nullable_integer_zarr_group, 'nullable-integer') + arr = LazyMaskedArray(nullable_integer_zarr_group, "nullable-integer") assert len(arr[0:3]) == 3 assert type(arr[0:3]) == pd.arrays.IntegerArray assert len(arr[()]) == len(arr) @@ -299,14 +399,14 @@ def test_nullable_integer_array_properties(nullable_integer_zarr_group): def test_nullable_integer_array_equality(nullable_integer_zarr_group): - arr = LazyMaskedArray(nullable_integer_zarr_group, 'nullable-integer') + arr = LazyMaskedArray(nullable_integer_zarr_group, "nullable-integer") assert 
(arr[0] == pd.NA).all() assert (arr[3:5] == pd.NA).all() assert (arr[5:7] == np.array([2, 2])).all() def test_nullable_integer_array_subset_subset(nullable_integer_zarr_group): - arr = LazyMaskedArray(nullable_integer_zarr_group, 'nullable-integer') + arr = LazyMaskedArray(nullable_integer_zarr_group, "nullable-integer") subset_susbet = arr[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.IntegerArray @@ -318,23 +418,24 @@ def test_nullable_integer_array_subset_subset(nullable_integer_zarr_group): ) ).all() + def test_nullable_integer_array_no_mask_equality(nullable_integer_zarr_group_no_mask): - arr = LazyMaskedArray(nullable_integer_zarr_group_no_mask, 'nullable-integer') + arr = LazyMaskedArray(nullable_integer_zarr_group_no_mask, "nullable-integer") assert (arr[0] == pd.NA).all() assert (arr[3:5] == 1).all() assert (arr[5:7] == np.array([2, 2])).all() -def test_nullable_integer_array_no_mask_subset_subset(nullable_integer_zarr_group_no_mask): - arr = LazyMaskedArray(nullable_integer_zarr_group_no_mask, 'nullable-integer') +def test_nullable_integer_array_no_mask_subset_subset( + nullable_integer_zarr_group_no_mask, +): + arr = LazyMaskedArray(nullable_integer_zarr_group_no_mask, "nullable-integer") subset_susbet = arr[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.IntegerArray assert ( subset_susbet[()] - == pd.array(np.array([2, 2, 1, 2, 0]),) + == pd.array( + np.array([2, 2, 1, 2, 0]), + ) ).all() - - - - From 6ca000330e881ea950902fed66b1805a77b2e3d4 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 May 2023 12:15:01 +0200 Subject: [PATCH 087/125] (fix): dont `deepcopy` the lazy array --- .../experimental/read_backed/lazy_arrays.py | 29 +++-- .../experimental/read_backed/read_backed.py | 4 +- .../tests/test_read_backed_experimental.py | 123 ++++++++---------- 3 files changed, 76 insertions(+), 80 deletions(-) diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 6692d467b..a7464cfc7 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -1,4 +1,3 @@ -from copy import deepcopy from typing import Tuple from anndata._core.index import Index, _subset from anndata._core.views import _resolve_idx, as_view @@ -43,19 +42,19 @@ def __ne__(self, __o) -> np.ndarray: class LazyCategoricalArray(MaskedArrayMixIn): - __slots__ = ("values", "attrs", "_categories", "_categories_cache", "_subset_idx") + __slots__ = ("values", "attrs", "_categories", "_categories_cache", "_subset_idx", "group") - def __init__(self, group, *args, **kwargs): + def __init__(self, codes, categories, attrs, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group Args: group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr """ - self.values = group["codes"] - self._categories = group["categories"] + self.values = codes + self._categories = categories self._categories_cache = None self._subset_idx = None - self.attrs = dict(group.attrs) + self.attrs = dict(attrs) @property def categories(self): # __slots__ and cached_property are incompatible @@ -84,20 +83,25 @@ def __getitem__(self, selection) -> pd.Categorical: def __repr__(self) -> str: return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" + + def copy(self): + arr = LazyCategoricalArray(self.values, self.categories, self.attrs) + arr.subset_idx = self.subset_idx + 
return arr class LazyMaskedArray(MaskedArrayMixIn): __slots__ = ("mask", "values", "_subset_idx", "_dtype_str") - def __init__(self, group, dtype_str, *args, **kwargs): + def __init__(self, values, mask, dtype_str, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group Args: group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr dtype_str (Nullable): one of `nullable-integer` or `nullable-boolean` """ - self.values = group["values"] - self.mask = group["mask"] if "mask" in group else None + self.values = values + self.mask = mask self._subset_idx = None self._dtype_str = dtype_str @@ -128,10 +132,15 @@ def __repr__(self) -> str: return "LazyNullableIntegerArray" elif self._dtype_str == "nullable-boolean": return "LazyNullableBooleanArray" + + def copy(self): + arr = LazyMaskedArray(self.values, self.mask, self._dtype_str) + arr.subset_idx = self.subset_idx + return arr @_subset.register(MaskedArrayMixIn) def _subset_masked(a: MaskedArrayMixIn, subset_idx: Index): - a_copy = deepcopy(a) + a_copy = a.copy() a_copy.subset_idx = subset_idx[0] # this is a tuple? return a_copy diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 322b12677..4ec6b914f 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -343,9 +343,9 @@ def callback(func, elem_name: str, elem, iospec): ) return {k: read_dispatched(v, callback) for k, v in iter_object} elif iospec.encoding_type == "categorical": - return LazyCategoricalArray(elem) + return LazyCategoricalArray(elem['codes'], elem['categories'], elem.attrs) elif "nullable" in iospec.encoding_type: - return LazyMaskedArray(elem, iospec.encoding_type) + return LazyMaskedArray(elem['values'], elem['mask'] if 'mask' in elem else None, iospec.encoding_type) elif iospec.encoding_type in {"array", "string-array"}: return da.from_zarr(elem) elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index ce0abc7e3..96a9e8884 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -51,48 +51,48 @@ def sparse_format(request): @pytest.fixture() -def categorical_zarr_group(tmp_path_factory): +def categorical_lazy_arr(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, mode="w") z["codes"] =np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) z["categories"] = np.array(["foo", "bar", "jazz"]) z.attrs["ordered"] = False z = zarr.open(base_path) - return z + return LazyCategoricalArray(z['codes'], z['categories'], z.attrs) @pytest.fixture() -def nullable_boolean_zarr_group(tmp_path_factory): +def nullable_boolean_lazy_arr(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, mode="w") z["values"] =np.array([True, False, True, False, False, True, False, False, True, True, False, False, False, True, False, True]) z["mask"] = np.array([True, True, True, True, True, False, False, True, False, True, True, True, True, False, True, False]) z = zarr.open(base_path) - return z + return LazyMaskedArray(z['values'], z['mask'], 'nullable-boolean') @pytest.fixture() -def nullable_boolean_zarr_group_no_mask(tmp_path_factory): +def nullable_boolean_lazy_arr_no_mask(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = 
zarr.open_group(base_path, mode="w") z["values"] = np.array([True, False, True, False, False, True, False, False, True, True, False, False, False, True, False, True]) z = zarr.open(base_path) - return z + return LazyMaskedArray(z['values'], None, 'nullable-boolean') @pytest.fixture() -def nullable_integer_zarr_group(tmp_path_factory): +def nullable_integer_lazy_arr(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, mode="w") z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) z["mask"] = np.array([True, True, True, True, True, False, False, True, False, True, True, True, True, False, True, False]) z = zarr.open(base_path) - return z + return LazyMaskedArray(z['values'], z['mask'], 'nullable-integer') @pytest.fixture() -def nullable_integer_zarr_group_no_mask(tmp_path_factory): +def nullable_integer_lazy_arr_no_mask(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() z = zarr.open_group(base_path, mode="w") z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) z = zarr.open(base_path) - return z + return LazyMaskedArray(z['values'], None, 'nullable-integer') def test_access_count_obs_var(tmp_path, mtx_format): @@ -215,24 +215,21 @@ def test_read_write_view_of_view(tmp_path, mtx_format): ).all() -def test_lazy_categorical_array_properties(categorical_zarr_group): - arr = LazyCategoricalArray(categorical_zarr_group) - assert len(arr[0:3]) == 3 - assert type(arr[0:3]) == pd.Categorical - assert len(arr[()]) == len(arr) - assert type(arr[()]) == pd.Categorical +def test_lazy_categorical_array_properties(categorical_lazy_arr): + assert len(categorical_lazy_arr[0:3]) == 3 + assert type(categorical_lazy_arr[0:3]) == pd.Categorical + assert len(categorical_lazy_arr[()]) == len(categorical_lazy_arr) + assert type(categorical_lazy_arr[()]) == pd.Categorical -def test_lazy_categorical_array_equality(categorical_zarr_group): - arr = LazyCategoricalArray(categorical_zarr_group) - assert (arr[0] == "foo").all() - assert (arr[3:5] == "bar").all() - assert (arr == "foo").any() +def test_lazy_categorical_array_equality(categorical_lazy_arr): + assert (categorical_lazy_arr[0] == "foo").all() + assert (categorical_lazy_arr[3:5] == "bar").all() + assert (categorical_lazy_arr == "foo").any() -def test_lazy_categorical_array_subset_subset(categorical_zarr_group): - arr = LazyCategoricalArray(categorical_zarr_group) - subset_susbet = arr[0:10][5:10] +def test_lazy_categorical_array_subset_subset(categorical_lazy_arr): + subset_susbet = categorical_lazy_arr[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.Categorical assert ( @@ -245,24 +242,21 @@ def test_lazy_categorical_array_subset_subset(categorical_zarr_group): ).all() -def test_nullable_boolean_array_properties(nullable_boolean_zarr_group): - arr = LazyMaskedArray(nullable_boolean_zarr_group, 'nullable-boolean') - assert len(arr[0:3]) == 3 - assert type(arr[0:3]) == pd.arrays.BooleanArray - assert len(arr[()]) == len(arr) - assert type(arr[()]) == pd.arrays.BooleanArray +def test_nullable_boolean_array_properties(nullable_boolean_lazy_arr): + assert len(nullable_boolean_lazy_arr[0:3]) == 3 + assert type(nullable_boolean_lazy_arr[0:3]) == pd.arrays.BooleanArray + assert len(nullable_boolean_lazy_arr[()]) == len(nullable_boolean_lazy_arr) + assert type(nullable_boolean_lazy_arr[()]) == pd.arrays.BooleanArray -def test_nullable_boolean_array_equality(nullable_boolean_zarr_group): - arr = LazyMaskedArray(nullable_boolean_zarr_group, 
'nullable-boolean') - assert (arr[0] == pd.NA).all() - assert (arr[3:5] == pd.NA).all() - assert (arr[5:7] == np.array([True, False])).all() +def test_nullable_boolean_array_equality(nullable_boolean_lazy_arr): + assert (nullable_boolean_lazy_arr[0] == pd.NA).all() + assert (nullable_boolean_lazy_arr[3:5] == pd.NA).all() + assert (nullable_boolean_lazy_arr[5:7] == np.array([True, False])).all() -def test_nullable_boolean_array_subset_subset(nullable_boolean_zarr_group): - arr = LazyMaskedArray(nullable_boolean_zarr_group, 'nullable-boolean') - subset_susbet = arr[0:10][5:10] +def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): + subset_susbet = nullable_boolean_lazy_arr[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.BooleanArray assert ( @@ -273,16 +267,14 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_zarr_group): ) ).all() -def test_nullable_boolean_array_no_mask_equality(nullable_boolean_zarr_group_no_mask): - arr = LazyMaskedArray(nullable_boolean_zarr_group_no_mask, 'nullable-boolean') - assert (arr[0] == True).all() - assert (arr[3:5] == False).all() - assert (arr[5:7] == np.array([True, False])).all() +def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): + assert (nullable_boolean_lazy_arr_no_mask[0] == True).all() + assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() + assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() -def test_nullable_boolean_array_no_mask_subset_subset(nullable_boolean_zarr_group_no_mask): - arr = LazyMaskedArray(nullable_boolean_zarr_group_no_mask, 'nullable-boolean') - subset_susbet = arr[0:10][5:10] +def test_nullable_boolean_array_no_mask_subset_subset(nullable_boolean_lazy_arr_no_mask): + subset_susbet = nullable_boolean_lazy_arr_no_mask[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.BooleanArray assert ( @@ -290,24 +282,21 @@ def test_nullable_boolean_array_no_mask_subset_subset(nullable_boolean_zarr_grou == pd.array(np.array([True, False, False, True, True]),) ).all() -def test_nullable_integer_array_properties(nullable_integer_zarr_group): - arr = LazyMaskedArray(nullable_integer_zarr_group, 'nullable-integer') - assert len(arr[0:3]) == 3 - assert type(arr[0:3]) == pd.arrays.IntegerArray - assert len(arr[()]) == len(arr) - assert type(arr[()]) == pd.arrays.IntegerArray +def test_nullable_integer_array_properties(nullable_integer_lazy_arr): + assert len(nullable_integer_lazy_arr[0:3]) == 3 + assert type(nullable_integer_lazy_arr[0:3]) == pd.arrays.IntegerArray + assert len(nullable_integer_lazy_arr[()]) == len(nullable_integer_lazy_arr) + assert type(nullable_integer_lazy_arr[()]) == pd.arrays.IntegerArray -def test_nullable_integer_array_equality(nullable_integer_zarr_group): - arr = LazyMaskedArray(nullable_integer_zarr_group, 'nullable-integer') - assert (arr[0] == pd.NA).all() - assert (arr[3:5] == pd.NA).all() - assert (arr[5:7] == np.array([2, 2])).all() +def test_nullable_integer_array_equality(nullable_integer_lazy_arr): + assert (nullable_integer_lazy_arr[0] == pd.NA).all() + assert (nullable_integer_lazy_arr[3:5] == pd.NA).all() + assert (nullable_integer_lazy_arr[5:7] == np.array([2, 2])).all() -def test_nullable_integer_array_subset_subset(nullable_integer_zarr_group): - arr = LazyMaskedArray(nullable_integer_zarr_group, 'nullable-integer') - subset_susbet = arr[0:10][5:10] +def test_nullable_integer_array_subset_subset(nullable_integer_lazy_arr): + 
subset_susbet = nullable_integer_lazy_arr[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.IntegerArray assert ( @@ -318,16 +307,14 @@ def test_nullable_integer_array_subset_subset(nullable_integer_zarr_group): ) ).all() -def test_nullable_integer_array_no_mask_equality(nullable_integer_zarr_group_no_mask): - arr = LazyMaskedArray(nullable_integer_zarr_group_no_mask, 'nullable-integer') - assert (arr[0] == pd.NA).all() - assert (arr[3:5] == 1).all() - assert (arr[5:7] == np.array([2, 2])).all() +def test_nullable_integer_array_no_mask_equality(nullable_integer_lazy_arr_no_mask): + assert (nullable_integer_lazy_arr_no_mask[0] == pd.NA).all() + assert (nullable_integer_lazy_arr_no_mask[3:5] == 1).all() + assert (nullable_integer_lazy_arr_no_mask[5:7] == np.array([2, 2])).all() -def test_nullable_integer_array_no_mask_subset_subset(nullable_integer_zarr_group_no_mask): - arr = LazyMaskedArray(nullable_integer_zarr_group_no_mask, 'nullable-integer') - subset_susbet = arr[0:10][5:10] +def test_nullable_integer_array_no_mask_subset_subset(nullable_integer_lazy_arr_no_mask): + subset_susbet = nullable_integer_lazy_arr_no_mask[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.IntegerArray assert ( From 0b2ae47b3a854ac2ab4662f9fbcbfba87144d0d6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 May 2023 12:15:14 +0200 Subject: [PATCH 088/125] (feat): `to_memory` for `AnnData` object --- .../experimental/read_backed/read_backed.py | 45 ++++++++++++++++++- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 4ec6b914f..575783465 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -18,6 +18,7 @@ from anndata._core.raw import Raw from anndata._core.sparse_dataset import sparse_dataset from anndata._core.views import _resolve_idxs +from anndata.compat import DaskArray from anndata.utils import convert_to_dict import zarr @@ -156,7 +157,45 @@ def __getitem__(self, index: Index) -> "AnnData": return AnnDataBacked(self, oidx=oidx, vidx=vidx) def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]: - return _normalize_indices(index, self.obs_names, self.var_names) + return _normalize_indices( + index, + pd.Index(self.obs_names.compute()), + pd.Index(self.var_names.compute()), + ) + + def to_memory(self, exclude_X=False): + def backed_dict_to_memory(d): + res = {} + for k, v in d.items(): + if isinstance(v, DaskArray): + res[k] = v[...].compute() + else: + res[k] = v[...] + return res + + obs_dict = backed_dict_to_memory(dict(self.obs)) + obs = pd.DataFrame(obs_dict, index=obs_dict[self.file["obs"].attrs["_index"]]) + var_dict = backed_dict_to_memory(dict(self.var)) + var = pd.DataFrame(var_dict, index=var_dict[self.file["var"].attrs["_index"]]) + obsm = backed_dict_to_memory(dict(self.obsm)) + varm = backed_dict_to_memory(dict(self.varm)) + varp = backed_dict_to_memory(dict(self.varp)) + obsp = backed_dict_to_memory(dict(self.obsp)) + layers = backed_dict_to_memory(dict(self.layers)) + X = None + if not exclude_X: + X = self.X[...].compute() if isinstance(self.X, DaskArray) else self.X[...] 
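+        # everything gathered above is now in memory, so a plain in-memory
+        # AnnData can be assembled from these components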
+ return AnnData( + X=X, + obs=obs, + var=var, + obsm=obsm, + varm=varm, + obsp=obsp, + varp=varp, + layers=layers, + uns=self.uns, + ) @property def obs_names(self) -> pd.Index: @@ -292,7 +331,9 @@ def n_obs(self) -> int: return len(self.obs_names) def __repr__(self): - descr = f"AnnDataBacked object with n_obs × n_vars = {self.n_obs} × {self.n_vars}" + descr = ( + f"AnnDataBacked object with n_obs × n_vars = {self.n_obs} × {self.n_vars}" + ) for attr in [ "obs", "var", From 05337489b09b483b16310442be13128b571b38a3 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 10 May 2023 13:41:03 +0200 Subject: [PATCH 089/125] (feat): add `to_memory` test + corresponding fixes --- .../read_backed/lazy_axis_arrays.py | 6 ++--- .../experimental/read_backed/read_backed.py | 25 ++++++++----------- .../tests/test_read_backed_experimental.py | 18 ++++++++++--- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index 6fc5a5c7a..1b38115c8 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -20,12 +20,12 @@ def columns(self) -> List: -def to_df_1d_axis_arrays(axis_arrays, idx=None): +def to_df_1d_axis_arrays(axis_arrays): """Convert to pandas dataframe.""" - df = pd.DataFrame(index=axis_arrays.dim_names[() if idx is None else idx]) + df = pd.DataFrame(index=axis_arrays.dim_names[()]) for key in axis_arrays.keys(): if "index" not in key: - df[key] = axis_arrays[key][() if idx is None else idx] + df[key] = axis_arrays[key][()] return df class AxisArraysRemote1dMixin(): diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 575783465..4537ad760 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -16,10 +16,10 @@ from anndata._core.anndata_base import AbstractAnnData from anndata._core.index import Index, _normalize_indices, _subset from anndata._core.raw import Raw -from anndata._core.sparse_dataset import sparse_dataset +from anndata._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset from anndata._core.views import _resolve_idxs from anndata.compat import DaskArray -from anndata.utils import convert_to_dict +from anndata.utils import asarray, convert_to_dict import zarr import pandas as pd @@ -167,16 +167,16 @@ def to_memory(self, exclude_X=False): def backed_dict_to_memory(d): res = {} for k, v in d.items(): - if isinstance(v, DaskArray): - res[k] = v[...].compute() - else: + if isinstance(v, DaskArray) or isinstance(v, BaseCompressedSparseDataset): + res[k] = asarray(v) + elif isinstance(v, LazyCategoricalArray) or isinstance(v, LazyMaskedArray): res[k] = v[...] + else: + res[k] = v return res - obs_dict = backed_dict_to_memory(dict(self.obs)) - obs = pd.DataFrame(obs_dict, index=obs_dict[self.file["obs"].attrs["_index"]]) - var_dict = backed_dict_to_memory(dict(self.var)) - var = pd.DataFrame(var_dict, index=var_dict[self.file["var"].attrs["_index"]]) + obs = self.obs.to_df() + var = self.var.to_df() obsm = backed_dict_to_memory(dict(self.obsm)) varm = backed_dict_to_memory(dict(self.varm)) varp = backed_dict_to_memory(dict(self.varp)) @@ -184,7 +184,7 @@ def backed_dict_to_memory(d): layers = backed_dict_to_memory(dict(self.layers)) X = None if not exclude_X: - X = self.X[...].compute() if isinstance(self.X, DaskArray) else self.X[...] 
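+            # asarray densifies X in one call instead of special-casing dask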
+ X = asarray(self.X) return AnnData( X=X, obs=obs, @@ -375,11 +375,8 @@ def callback(func, elem_name: str, elem, iospec): elif elem_name.startswith("/raw"): return None elif elem_name in {"/obs", "/var"}: - # override to only return AxisArray that will be accessed specially via our special AnnData object iter_object = ( - elem.items() - if is_consolidated - else [(k, elem[k]) for k in elem.attrs["column-order"]] + [(k, elem[k]) for k in elem.attrs["column-order"]] + [(elem.attrs["_index"], elem[elem.attrs["_index"]])] ) return {k: read_dispatched(v, callback) for k, v in iter_object} diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 96a9e8884..c64cbf18d 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -11,12 +11,13 @@ as_dense_dask_array, gen_adata, gen_typed_df, + assert_equal ) from anndata.experimental.read_backed import read_backed, LazyCategoricalArray, LazyMaskedArray from anndata.utils import asarray from zarr import DirectoryStore -import traceback + class AccessTrackingStore(DirectoryStore): def __init__(self, *args, **kwargs): @@ -144,7 +145,7 @@ def test_access_count_obsp_varp(tmp_path, mtx_format): assert store.get_access_count("varp") == 0 -def test_read_write_full(tmp_path, mtx_format): +def test_read_full(tmp_path, mtx_format): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" @@ -155,8 +156,17 @@ def test_read_write_full(tmp_path, mtx_format): assert (adata.var == remote.var.to_df()[adata.var.columns]).all().all() assert (adata.obsm["array"] == remote.obsm["array"].compute()).all() +def test_to_memory(tmp_path, mtx_format): + adata = gen_adata((1000, 1000), mtx_format) + base_pth = Path(tmp_path) + orig_pth = base_pth / "orig.zarr" + adata.write_zarr(orig_pth) + remote = read_backed(orig_pth) + remote_to_memory = remote.to_memory() + assert_equal(remote_to_memory, adata) + -def test_read_write_view(tmp_path, mtx_format): +def test_read_view(tmp_path, mtx_format): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" @@ -179,7 +189,7 @@ def test_read_write_view(tmp_path, mtx_format): ).all() -def test_read_write_view_of_view(tmp_path, mtx_format): +def test_read_view_of_view(tmp_path, mtx_format): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" From d0cdcd62f45a9f4a6b6d7f042472935204fd9274 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 10 May 2023 11:42:50 +0000 Subject: [PATCH 090/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../experimental/read_backed/lazy_arrays.py | 13 +++++++-- .../read_backed/lazy_axis_arrays.py | 1 - .../experimental/read_backed/read_backed.py | 23 ++++++++++----- .../tests/test_read_backed_experimental.py | 29 ++++++++++++------- 4 files changed, 44 insertions(+), 22 deletions(-) diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 792ac056f..74a508531 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -42,7 +42,14 @@ def __ne__(self, __o) -> np.ndarray: class LazyCategoricalArray(MaskedArrayMixIn): - __slots__ = ("values", "attrs", "_categories", "_categories_cache", "_subset_idx", "group") + 
__slots__ = ( + "values", + "attrs", + "_categories", + "_categories_cache", + "_subset_idx", + "group", + ) def __init__(self, codes, categories, attrs, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group @@ -83,7 +90,7 @@ def __getitem__(self, selection) -> pd.Categorical: def __repr__(self) -> str: return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" - + def copy(self): arr = LazyCategoricalArray(self.values, self.categories, self.attrs) arr.subset_idx = self.subset_idx @@ -132,7 +139,7 @@ def __repr__(self) -> str: return "LazyNullableIntegerArray" elif self._dtype_str == "nullable-boolean": return "LazyNullableBooleanArray" - + def copy(self): arr = LazyMaskedArray(self.values, self.mask, self._dtype_str) arr.subset_idx = self.subset_idx diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index 38e430860..1ee1f589e 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -18,7 +18,6 @@ def columns(self) -> List: return list(self.keys()) - def to_df_1d_axis_arrays(axis_arrays): """Convert to pandas dataframe.""" df = pd.DataFrame(index=axis_arrays.dim_names[()]) diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 4537ad760..79989382b 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -167,9 +167,13 @@ def to_memory(self, exclude_X=False): def backed_dict_to_memory(d): res = {} for k, v in d.items(): - if isinstance(v, DaskArray) or isinstance(v, BaseCompressedSparseDataset): + if isinstance(v, DaskArray) or isinstance( + v, BaseCompressedSparseDataset + ): res[k] = asarray(v) - elif isinstance(v, LazyCategoricalArray) or isinstance(v, LazyMaskedArray): + elif isinstance(v, LazyCategoricalArray) or isinstance( + v, LazyMaskedArray + ): res[k] = v[...] 
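+                # `[...]` materializes the lazy categorical/masked array eagerly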
else: res[k] = v @@ -375,15 +379,18 @@ def callback(func, elem_name: str, elem, iospec): elif elem_name.startswith("/raw"): return None elif elem_name in {"/obs", "/var"}: - iter_object = ( - [(k, elem[k]) for k in elem.attrs["column-order"]] - + [(elem.attrs["_index"], elem[elem.attrs["_index"]])] - ) + iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [ + (elem.attrs["_index"], elem[elem.attrs["_index"]]) + ] return {k: read_dispatched(v, callback) for k, v in iter_object} elif iospec.encoding_type == "categorical": - return LazyCategoricalArray(elem['codes'], elem['categories'], elem.attrs) + return LazyCategoricalArray(elem["codes"], elem["categories"], elem.attrs) elif "nullable" in iospec.encoding_type: - return LazyMaskedArray(elem['values'], elem['mask'] if 'mask' in elem else None, iospec.encoding_type) + return LazyMaskedArray( + elem["values"], + elem["mask"] if "mask" in elem else None, + iospec.encoding_type, + ) elif iospec.encoding_type in {"array", "string-array"}: return da.from_zarr(elem) elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 6b8c0adcf..cc2577bca 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -11,7 +11,7 @@ as_dense_dask_array, gen_adata, gen_typed_df, - assert_equal + assert_equal, ) from anndata.experimental.read_backed import ( read_backed, @@ -22,6 +22,7 @@ from zarr import DirectoryStore + class AccessTrackingStore(DirectoryStore): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -62,7 +63,7 @@ def categorical_lazy_arr(tmp_path_factory): z["categories"] = np.array(["foo", "bar", "jazz"]) z.attrs["ordered"] = False z = zarr.open(base_path) - return LazyCategoricalArray(z['codes'], z['categories'], z.attrs) + return LazyCategoricalArray(z["codes"], z["categories"], z.attrs) @pytest.fixture() @@ -110,7 +111,7 @@ def nullable_boolean_lazy_arr(tmp_path_factory): ] ) z = zarr.open(base_path) - return LazyMaskedArray(z['values'], z['mask'], 'nullable-boolean') + return LazyMaskedArray(z["values"], z["mask"], "nullable-boolean") @pytest.fixture() @@ -138,7 +139,7 @@ def nullable_boolean_lazy_arr_no_mask(tmp_path_factory): ] ) z = zarr.open(base_path) - return LazyMaskedArray(z['values'], None, 'nullable-boolean') + return LazyMaskedArray(z["values"], None, "nullable-boolean") @pytest.fixture() @@ -167,7 +168,7 @@ def nullable_integer_lazy_arr(tmp_path_factory): ] ) z = zarr.open(base_path) - return LazyMaskedArray(z['values'], z['mask'], 'nullable-integer') + return LazyMaskedArray(z["values"], z["mask"], "nullable-integer") @pytest.fixture() @@ -176,7 +177,7 @@ def nullable_integer_lazy_arr_no_mask(tmp_path_factory): z = zarr.open_group(base_path, mode="w") z["values"] = np.array([0, 1, 0, 1, 1, 2, 2, 1, 2, 0, 1, 1, 1, 2, 1, 2]) z = zarr.open(base_path) - return LazyMaskedArray(z['values'], None, 'nullable-integer') + return LazyMaskedArray(z["values"], None, "nullable-integer") def test_access_count_obs_var(tmp_path, mtx_format): @@ -248,6 +249,7 @@ def test_read_full(tmp_path, mtx_format): assert (adata.var == remote.var.to_df()[adata.var.columns]).all().all() assert (adata.obsm["array"] == remote.obsm["array"].compute()).all() + def test_to_memory(tmp_path, mtx_format): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) @@ -369,13 +371,16 @@ def 
test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): ) ).all() + def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert (nullable_boolean_lazy_arr_no_mask[0] == True).all() - assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() + assert (nullable_boolean_lazy_arr_no_mask[0] is True).all() + assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() -def test_nullable_boolean_array_no_mask_subset_subset(nullable_boolean_lazy_arr_no_mask): +def test_nullable_boolean_array_no_mask_subset_subset( + nullable_boolean_lazy_arr_no_mask, +): subset_susbet = nullable_boolean_lazy_arr_no_mask[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.BooleanArray @@ -386,6 +391,7 @@ def test_nullable_boolean_array_no_mask_subset_subset(nullable_boolean_lazy_arr_ ) ).all() + def test_nullable_integer_array_properties(nullable_integer_lazy_arr): assert len(nullable_integer_lazy_arr[0:3]) == 3 assert type(nullable_integer_lazy_arr[0:3]) == pd.arrays.IntegerArray @@ -411,13 +417,16 @@ def test_nullable_integer_array_subset_subset(nullable_integer_lazy_arr): ) ).all() + def test_nullable_integer_array_no_mask_equality(nullable_integer_lazy_arr_no_mask): assert (nullable_integer_lazy_arr_no_mask[0] == pd.NA).all() assert (nullable_integer_lazy_arr_no_mask[3:5] == 1).all() assert (nullable_integer_lazy_arr_no_mask[5:7] == np.array([2, 2])).all() -def test_nullable_integer_array_no_mask_subset_subset(nullable_integer_lazy_arr_no_mask): +def test_nullable_integer_array_no_mask_subset_subset( + nullable_integer_lazy_arr_no_mask, +): subset_susbet = nullable_integer_lazy_arr_no_mask[0:10][5:10] assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.IntegerArray From f212e307c3d8572ed1237dddd7a3ef9c4298eed2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 19 May 2023 10:02:02 +0200 Subject: [PATCH 091/125] (fix): resolve tuple ambiguity --- anndata/_core/aligned_mapping.py | 2 +- anndata/_core/index.py | 4 +++ .../experimental/read_backed/lazy_arrays.py | 4 +-- .../read_backed/lazy_axis_arrays.py | 8 +++--- .../experimental/read_backed/read_backed.py | 8 +++--- .../tests/test_read_backed_experimental.py | 26 +++++++++---------- 6 files changed, 29 insertions(+), 23 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index b5271d2c3..80e2b1b5a 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -119,7 +119,7 @@ def copy(self): def _view(self, parent: "anndata.AnnData", subset_idx: I): """Returns a subset copy-on-write view of the object.""" - if parent.is_view and subset_idx is not None: # and or or? + if parent.is_view or subset_idx is not None: # and or or? 
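+            # with `or`, a subset_idx alone (the backed case) is enough to
+            # produce a view, even when the parent itself is not a view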
return self._view_class(self, parent, subset_idx) return self diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 112e4a1a5..92c4eb07d 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -128,6 +128,8 @@ def _subset(a: Union[np.ndarray, pd.DataFrame], subset_idx: Index): @_subset.register(DaskArray) def _subset_dask(a: DaskArray, subset_idx: Index): + if isinstance(subset_idx, slice): + return a[subset_idx] if all(isinstance(x, cabc.Iterable) for x in subset_idx): subset_idx = np.ix_(*subset_idx) return a.vindex[subset_idx] @@ -149,6 +151,8 @@ def _subset_df(df: pd.DataFrame, subset_idx: Index): @_subset.register(AwkArray) def _subset_awkarray(a: AwkArray, subset_idx: Index): + if isinstance(subset_idx, slice): + return a[subset_idx] if all(isinstance(x, cabc.Iterable) for x in subset_idx): subset_idx = np.ix_(*subset_idx) return a[subset_idx] diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 74a508531..e9e0f9c20 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -35,7 +35,7 @@ def shape(self) -> Tuple[int, ...]: return (len(self.subset_idx),) def __eq__(self, __o) -> np.ndarray: - return self[()] == __o + return self[...] == __o def __ne__(self, __o) -> np.ndarray: return ~(self == __o) @@ -149,7 +149,7 @@ def copy(self): @_subset.register(MaskedArrayMixIn) def _subset_masked(a: MaskedArrayMixIn, subset_idx: Index): a_copy = a.copy() - a_copy.subset_idx = subset_idx[0] # this is a tuple? + a_copy.subset_idx = subset_idx return a_copy diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index 1ee1f589e..f2fec7fd7 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -20,10 +20,10 @@ def columns(self) -> List: def to_df_1d_axis_arrays(axis_arrays): """Convert to pandas dataframe.""" - df = pd.DataFrame(index=axis_arrays.dim_names[()]) + df = pd.DataFrame(index=axis_arrays.dim_names[...]) for key in axis_arrays.keys(): if "index" not in key: - df[key] = axis_arrays[key][()] + df[key] = axis_arrays[key][...] 
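+    # `[...]` reads each lazy column into memory while building the DataFrame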
return df @@ -35,7 +35,9 @@ def to_df(self) -> pd.DataFrame: def iloc(self): class IlocDispatch: def __getitem__(self_iloc, idx): - return self._view(self.parent, (idx,)) + if type(idx) == list: + return self._view(self.parent, np.array(idx)) + return self._view(self.parent, idx) return IlocDispatch() diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 79989382b..46262636c 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -126,10 +126,10 @@ def __init__( self.varp = PairwiseArrays(adata_ref, 1, vals=convert_to_dict(varp)) self.layers = Layers(adata_ref, layers) if self.is_view: - self.obs = self.obs._view(self, (oidx,)) - self.var = self.var._view(self, (vidx,)) - self.obsm = self.obsm._view(self, (oidx,)) - self.varm = self.varm._view(self, (vidx,)) + self.obs = self.obs._view(self, oidx) + self.var = self.var._view(self, vidx) + self.obsm = self.obsm._view(self, oidx) + self.varm = self.varm._view(self, vidx) self.obsp = self.obsp._view(self, oidx) self.varp = self.varp._view(self, vidx) self.layers = self.layers._view(self, (oidx, vidx)) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index cc2577bca..2595785ee 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -322,8 +322,8 @@ def test_read_view_of_view(tmp_path, mtx_format): def test_lazy_categorical_array_properties(categorical_lazy_arr): assert len(categorical_lazy_arr[0:3]) == 3 assert type(categorical_lazy_arr[0:3]) == pd.Categorical - assert len(categorical_lazy_arr[()]) == len(categorical_lazy_arr) - assert type(categorical_lazy_arr[()]) == pd.Categorical + assert len(categorical_lazy_arr[...]) == len(categorical_lazy_arr) + assert type(categorical_lazy_arr[...]) == pd.Categorical def test_lazy_categorical_array_equality(categorical_lazy_arr): @@ -337,7 +337,7 @@ def test_lazy_categorical_array_subset_subset(categorical_lazy_arr): assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.Categorical assert ( - subset_susbet[()] + subset_susbet[...] == pd.Categorical.from_codes( codes=[2, 2, 1, 2, 0], categories=["foo", "bar", "jazz"], @@ -349,8 +349,8 @@ def test_lazy_categorical_array_subset_subset(categorical_lazy_arr): def test_nullable_boolean_array_properties(nullable_boolean_lazy_arr): assert len(nullable_boolean_lazy_arr[0:3]) == 3 assert type(nullable_boolean_lazy_arr[0:3]) == pd.arrays.BooleanArray - assert len(nullable_boolean_lazy_arr[()]) == len(nullable_boolean_lazy_arr) - assert type(nullable_boolean_lazy_arr[()]) == pd.arrays.BooleanArray + assert len(nullable_boolean_lazy_arr[...]) == len(nullable_boolean_lazy_arr) + assert type(nullable_boolean_lazy_arr[...]) == pd.arrays.BooleanArray def test_nullable_boolean_array_equality(nullable_boolean_lazy_arr): @@ -364,7 +364,7 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.BooleanArray assert ( - subset_susbet[()] + subset_susbet[...] 
== pd.arrays.BooleanArray( values=np.array([True, False, False, True, True]), mask=np.array([False, False, True, False, True]), @@ -373,8 +373,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert (nullable_boolean_lazy_arr_no_mask[0] is True).all() - assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() + assert (nullable_boolean_lazy_arr_no_mask[0] == True).all() + assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() @@ -385,7 +385,7 @@ def test_nullable_boolean_array_no_mask_subset_subset( assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.BooleanArray assert ( - subset_susbet[()] + subset_susbet[...] == pd.array( np.array([True, False, False, True, True]), ) @@ -395,8 +395,8 @@ def test_nullable_boolean_array_no_mask_subset_subset( def test_nullable_integer_array_properties(nullable_integer_lazy_arr): assert len(nullable_integer_lazy_arr[0:3]) == 3 assert type(nullable_integer_lazy_arr[0:3]) == pd.arrays.IntegerArray - assert len(nullable_integer_lazy_arr[()]) == len(nullable_integer_lazy_arr) - assert type(nullable_integer_lazy_arr[()]) == pd.arrays.IntegerArray + assert len(nullable_integer_lazy_arr[...]) == len(nullable_integer_lazy_arr) + assert type(nullable_integer_lazy_arr[...]) == pd.arrays.IntegerArray def test_nullable_integer_array_equality(nullable_integer_lazy_arr): @@ -410,7 +410,7 @@ def test_nullable_integer_array_subset_subset(nullable_integer_lazy_arr): assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.IntegerArray assert ( - subset_susbet[()] + subset_susbet[...] == pd.arrays.IntegerArray( values=np.array([2, 2, 1, 2, 0]), mask=np.array([False, False, True, False, True]), @@ -431,7 +431,7 @@ def test_nullable_integer_array_no_mask_subset_subset( assert len(subset_susbet) == 5 assert type(subset_susbet) == pd.arrays.IntegerArray assert ( - subset_susbet[()] + subset_susbet[...] 
== pd.array( np.array([2, 2, 1, 2, 0]), ) From a52d4940b263657198f2befe74b9ecbc0c1151d7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 19 May 2023 12:27:58 +0200 Subject: [PATCH 092/125] (fix): copy using backing `categories` array, not values --- anndata/experimental/read_backed/lazy_arrays.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index e9e0f9c20..228d9970c 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -92,7 +92,7 @@ def __repr__(self) -> str: return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" def copy(self): - arr = LazyCategoricalArray(self.values, self.categories, self.attrs) + arr = LazyCategoricalArray(self.values, self._categories, self.attrs) # self.categories reads in data arr.subset_idx = self.subset_idx return arr From e3dcec8470922d89c216a4987d557d3957ea0ba1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 19 May 2023 12:28:16 +0200 Subject: [PATCH 093/125] (feat): sparse arrays as dask --- anndata/experimental/read_backed/read_backed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 46262636c..dc168a864 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -24,6 +24,7 @@ import zarr import pandas as pd import dask.array as da +from scipy import sparse from ..._core import AnnData from .. import read_dispatched from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray @@ -394,7 +395,8 @@ def callback(func, elem_name: str, elem, iospec): elif iospec.encoding_type in {"array", "string-array"}: return da.from_zarr(elem) elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: - return sparse_dataset(elem) + meta = sparse.random(1, 1, format=iospec.encoding_type[0:3], density=0.05) + return da.from_array(sparse_dataset(elem).to_backed(), meta=meta, name=elem_name) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) elif iospec.encoding_type in {"dataframe"}: From a418313d1428978adf7351c28793a2f205ce72e1 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 19 May 2023 13:07:31 +0200 Subject: [PATCH 094/125] (chore): add access checks on `X`, `layers --- .../tests/test_read_backed_experimental.py | 38 ++++++++++++++++++- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 2595785ee..616ec01ec 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -192,11 +192,11 @@ def test_access_count_obs_var(tmp_path, mtx_format): orig = AnnData( obs=obs, var=var, - X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)), + X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)) ) orig.write_zarr(orig_pth) store = AccessTrackingStore(orig_pth) - store.set_key_trackers(["obs/int64", "var/int64", "obs/cat/codes"]) + store.set_key_trackers(["obs/int64", "var/int64", "obs/cat/codes", "X"]) remote = read_backed(store) # a series of methods that should __not__ read in any data remote.X @@ -208,6 +208,10 @@ def test_access_count_obs_var(tmp_path, mtx_format): # only the `cat` should be read in subset = remote[remote.obs["cat"] == "a", 
:] subset.obs["int64"] + sub_subset = subset[0:10, :] + sub_subset.obs["int64"] + sub_subset.X # getting a repr/accessing the element should not read in data for X (or layers) + assert store.get_access_count("X") == 0 assert store.get_access_count("obs/int64") == 0 assert store.get_access_count("var/int64") == 0 assert ( @@ -219,6 +223,33 @@ def test_access_count_obs_var(tmp_path, mtx_format): ) # one for 0, .zmetadata handles .zarray assert store.get_access_count("var/int64") == 0 # never accessed +def test_access_count_X_layers(tmp_path, mtx_format): + base_pth = Path(tmp_path) + orig_pth = base_pth / "orig.zarr" + M = 1000 + N = 1000 + orig = gen_adata((M, N), mtx_format) + orig.write_zarr(orig_pth) + store = AccessTrackingStore(orig_pth) + store.set_key_trackers(["layers", "X"]) + remote = read_backed(store) + # a series of methods that should __not__ read in any data + remote.X + remote.X.shape + remote.shape + remote.layers + subset = remote[0:500, 200:400] + subset.X + subset.X.shape + subset.shape + subset.layers + sub_subset = subset[10:20, 50:60] + sub_subset.X + sub_subset.X.shape + sub_subset.shape + sub_subset.layers # getting a repr/accessing the element should not read in data for X or layers + assert store.get_access_count("X") == 0 + assert store.get_access_count("layers") == 0 def test_access_count_obsp_varp(tmp_path, mtx_format): base_pth = Path(tmp_path) @@ -234,6 +265,9 @@ def test_access_count_obsp_varp(tmp_path, mtx_format): subset = remote[0:10, 500:600] subset.obsp["array"] subset.varp["array"] + sub_subset = subset[0:5, 0:50] + sub_subset.obsp["array"] + sub_subset.varp["array"] assert store.get_access_count("obsp") == 0 assert store.get_access_count("varp") == 0 From 800f688f267a2269896ba4b9bc98f8bc99b1cc86 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 24 May 2023 16:38:51 +0200 Subject: [PATCH 095/125] (fix): ensure return type of dask array --- .../experimental/read_backed/read_backed.py | 21 ++++++++++++------- .../tests/test_read_backed_experimental.py | 12 ++++++----- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index dc168a864..d2f17a531 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -16,7 +16,7 @@ from anndata._core.anndata_base import AbstractAnnData from anndata._core.index import Index, _normalize_indices, _subset from anndata._core.raw import Raw -from anndata._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset +from anndata._core.sparse_dataset import get_backed_class from anndata._core.views import _resolve_idxs from anndata.compat import DaskArray from anndata.utils import asarray, convert_to_dict @@ -168,10 +168,8 @@ def to_memory(self, exclude_X=False): def backed_dict_to_memory(d): res = {} for k, v in d.items(): - if isinstance(v, DaskArray) or isinstance( - v, BaseCompressedSparseDataset - ): - res[k] = asarray(v) + if isinstance(v, DaskArray): + res[k] = v.compute() elif isinstance(v, LazyCategoricalArray) or isinstance( v, LazyMaskedArray ): @@ -189,7 +187,7 @@ def backed_dict_to_memory(d): layers = backed_dict_to_memory(dict(self.layers)) X = None if not exclude_X: - X = asarray(self.X) + X = asarray(self.X.compute()) return AnnData( X=X, obs=obs, @@ -395,8 +393,15 @@ def callback(func, elem_name: str, elem, iospec): elif iospec.encoding_type in {"array", "string-array"}: return da.from_zarr(elem) elif iospec.encoding_type in 
{"csr_matrix", "csc_matrix"}: - meta = sparse.random(1, 1, format=iospec.encoding_type[0:3], density=0.05) - return da.from_array(sparse_dataset(elem).to_backed(), meta=meta, name=elem_name) + format_str = iospec.encoding_type[0:3] + format_class = get_backed_class(format_str) + backed = format_class(tuple(elem.attrs['shape']), dtype=elem["data"].dtype) + backed.data = da.from_zarr(elem["data"]) + backed.indices = da.from_zarr(elem["indices"]) + backed.indptr = da.from_zarr(elem["indptr"]) + meta = format_class((1, 1), dtype=elem["data"].dtype) + arr = da.from_array(backed, meta=meta, name=False, asarray=False) + return arr elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) elif iospec.encoding_type in {"dataframe"}: diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 616ec01ec..86b627e81 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -278,7 +278,9 @@ def test_read_full(tmp_path, mtx_format): orig_pth = base_pth / "orig.zarr" adata.write_zarr(orig_pth) remote = read_backed(orig_pth) - assert np.all(asarray(adata.X) == asarray(remote.X)) + base_x = asarray(adata.X) + remote_x = asarray(remote.X.compute()) + assert np.all(remote_x == base_x) assert (adata.obs == remote.obs.to_df()[adata.obs.columns]).all().all() assert (adata.var == remote.var.to_df()[adata.var.columns]).all().all() assert (adata.obsm["array"] == remote.obsm["array"].compute()).all() @@ -301,7 +303,7 @@ def test_read_view(tmp_path, mtx_format): adata.write_zarr(orig_pth) remote = read_backed(orig_pth) subset = adata.obs["obs_cat"] == "a" - assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X)) + assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X.compute())) assert ( (adata[subset, :].obs == remote[subset, :].obs.to_df()[adata.obs.columns]) .all() @@ -329,7 +331,7 @@ def test_read_view_of_view(tmp_path, mtx_format): subsetted_subsetted_adata = subsetted_adata[subset_subset, :] assert np.all( asarray(subsetted_subsetted_adata.X) - == asarray(remote[subset, :][subset_subset, :].X) + == asarray(remote[subset, :][subset_subset, :].X.compute()) ) assert ( ( @@ -407,8 +409,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert (nullable_boolean_lazy_arr_no_mask[0] == True).all() - assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() + assert (nullable_boolean_lazy_arr_no_mask[0] is True).all() + assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From 8ef6aeace1a95902c3edc112fa198e7afffcdc29 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 24 May 2023 14:39:22 +0000 Subject: [PATCH 096/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/experimental/read_backed/lazy_arrays.py | 4 +++- anndata/experimental/read_backed/read_backed.py | 2 +- anndata/tests/test_read_backed_experimental.py | 8 +++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 228d9970c..3565ac10d 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ 
b/anndata/experimental/read_backed/lazy_arrays.py @@ -92,7 +92,9 @@ def __repr__(self) -> str: return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" def copy(self): - arr = LazyCategoricalArray(self.values, self._categories, self.attrs) # self.categories reads in data + arr = LazyCategoricalArray( + self.values, self._categories, self.attrs + ) # self.categories reads in data arr.subset_idx = self.subset_idx return arr diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index d2f17a531..777ec36e2 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -395,7 +395,7 @@ def callback(func, elem_name: str, elem, iospec): elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: format_str = iospec.encoding_type[0:3] format_class = get_backed_class(format_str) - backed = format_class(tuple(elem.attrs['shape']), dtype=elem["data"].dtype) + backed = format_class(tuple(elem.attrs["shape"]), dtype=elem["data"].dtype) backed.data = da.from_zarr(elem["data"]) backed.indices = da.from_zarr(elem["indices"]) backed.indptr = da.from_zarr(elem["indptr"]) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 86b627e81..f08c98459 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -192,7 +192,7 @@ def test_access_count_obs_var(tmp_path, mtx_format): orig = AnnData( obs=obs, var=var, - X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)) + X=mtx_format(np.random.binomial(100, 0.005, (M, N)).astype(np.float32)), ) orig.write_zarr(orig_pth) store = AccessTrackingStore(orig_pth) @@ -210,7 +210,7 @@ def test_access_count_obs_var(tmp_path, mtx_format): subset.obs["int64"] sub_subset = subset[0:10, :] sub_subset.obs["int64"] - sub_subset.X # getting a repr/accessing the element should not read in data for X (or layers) + sub_subset.X # getting a repr/accessing the element should not read in data for X (or layers) assert store.get_access_count("X") == 0 assert store.get_access_count("obs/int64") == 0 assert store.get_access_count("var/int64") == 0 @@ -223,6 +223,7 @@ def test_access_count_obs_var(tmp_path, mtx_format): ) # one for 0, .zmetadata handles .zarray assert store.get_access_count("var/int64") == 0 # never accessed + def test_access_count_X_layers(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" @@ -247,10 +248,11 @@ def test_access_count_X_layers(tmp_path, mtx_format): sub_subset.X sub_subset.X.shape sub_subset.shape - sub_subset.layers # getting a repr/accessing the element should not read in data for X or layers + sub_subset.layers # getting a repr/accessing the element should not read in data for X or layers assert store.get_access_count("X") == 0 assert store.get_access_count("layers") == 0 + def test_access_count_obsp_varp(tmp_path, mtx_format): base_pth = Path(tmp_path) orig_pth = base_pth / "orig.zarr" From 944503e90b845ba56967e8b2b5d755256c537b53 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 25 May 2023 17:47:38 +0200 Subject: [PATCH 097/125] (feat): subset_idx on backed class --- anndata/_core/sparse_dataset.py | 96 +++++++++++++++---- .../experimental/read_backed/read_backed.py | 20 ++-- .../tests/test_read_backed_experimental.py | 36 +++++-- 3 files changed, 111 insertions(+), 41 deletions(-) diff --git a/anndata/_core/sparse_dataset.py 
b/anndata/_core/sparse_dataset.py index 79a46620c..fe91e48ab 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -23,6 +23,8 @@ import scipy.sparse as ss from scipy.sparse import _sparsetools +from anndata._core.views import _resolve_idx, as_view + from ..compat import _read_attr, ZarrArray try: @@ -264,9 +266,35 @@ class BaseCompressedSparseDataset(ABC): Analogous to :class:`h5py.Dataset ` or `zarr.Array`, but for sparse matrices. """ - def __init__(self, group: h5py.Group): + def __init__(self, group: Union[h5py.Group, zarr.Group]): type(self)._check_group_format(group) self.group = group + self._row_subset_idx = slice(None, None) + self._col_subset_idx = slice(None, None) + + @property + def row_subset_idx(self): + return self._row_subset_idx + + @row_subset_idx.setter + def row_subset_idx(self, new_idx): + self._row_subset_idx = ( + new_idx + if self.row_subset_idx is None + else _resolve_idx(self.row_subset_idx, new_idx, self.get_backing_shape()[0]) + ) + + @property + def col_subset_idx(self): + return self._col_subset_idx + + @col_subset_idx.setter + def col_subset_idx(self, new_idx): + self._col_subset_idx = ( + new_idx + if self.col_subset_idx is None + else _resolve_idx(self.col_subset_idx, new_idx, self.get_backing_shape()[1]) + ) @property def dtype(self) -> np.dtype: @@ -280,15 +308,38 @@ def _check_group_format(cls, group): @property def name(self) -> str: return self.group.name - - @property - def shape(self) -> Tuple[int, int]: + + def get_backing_shape(self) -> Tuple[int, int]: shape = _read_attr(self.group.attrs, "shape", None) if shape is None: # TODO warn shape = self.group.attrs.get("h5sparse_shape") return tuple(shape) + @property + def shape(self) -> Tuple[int, int]: + shape = self.get_backing_shape() + if isinstance(self.col_subset_idx, slice) and isinstance(self.row_subset_idx, slice): + if self.col_subset_idx == slice(None, None, None) and self.row_subset_idx == slice(None, None, None): + return tuple(shape) + row_length = 0 + col_length = 0 + if isinstance(self.row_subset_idx, slice): + if self.row_subset_idx == slice(None, None, None): + row_length = shape[0] + else: + row_length = self.row_subset_idx.stop - self.row_subset_idx.start + else: + row_length = len(self.row_subset_idx.flatten()) # can we assume a flatten method? + if isinstance(self.col_subset_idx, slice): + if self.col_subset_idx == slice(None, None, None): + col_length = shape[1] + else: + col_length = self.col_subset_idx.stop - self.col_subset_idx.start + else: + col_length = len(self.col_subset_idx.flatten()) # can we assume a flatten method? 
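# The subset bookkeeping above lets chained views stay lazy: each new index
# is resolved against the one already stored, so indexing with the resolved
# index into the on-disk axis matches indexing the views one after another.
# A minimal numpy sketch of that invariant (illustrative only, not anndata's
# `_resolve_idx` implementation):
import numpy as np

base = np.arange(10)        # stand-in for an on-disk axis of length 10
old = np.arange(2, 8)       # subset already held by the view
new = np.array([0, 3, 5])   # further subset requested on the view
resolved = old[new]         # the composition, expressed in base coordinates
assert np.array_equal(base[old][new], base[resolved])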
+ return (row_length, col_length) + @property def value(self) -> ss.spmatrix: return self.to_memory() @@ -302,14 +353,12 @@ def __repr__(self) -> str: def __getitem__(self, index: Union[Index, Tuple[()]]) -> Union[float, ss.spmatrix]: row, col = self._normalize_index(index) - mtx = self.to_backed() - sub = mtx[row, col] - # If indexing is array x array it returns a backed_sparse_matrix - # Not sure what the performance is on that operation - if isinstance(sub, BackedSparseMatrix): - return get_memory_class(self.format_str)(sub) - else: - return sub + new_mtx = sparse_dataset(self.group) + new_mtx.row_subset_idx = self.row_subset_idx + new_mtx.row_subset_idx = row + new_mtx.col_subset_idx = self.col_subset_idx + new_mtx.col_subset_idx = col + return new_mtx def _normalize_index( self, index: Union[Index, Tuple[()]] @@ -391,19 +440,24 @@ def append(self, sparse_matrix: ss.spmatrix): def to_backed(self) -> BackedSparseMatrix: format_class = get_backed_class(self.format_str) - mtx = format_class(self.shape, dtype=self.dtype) + mtx = format_class(self.get_backing_shape(), dtype=self.dtype) mtx.data = self.group["data"] mtx.indices = self.group["indices"] mtx.indptr = self.group["indptr"] return mtx def to_memory(self) -> ss.spmatrix: - format_class = get_memory_class(self.format_str) - mtx = format_class(self.shape, dtype=self.dtype) - mtx.data = self.group["data"][...] - mtx.indices = self.group["indices"][...] - mtx.indptr = self.group["indptr"][...] - return mtx + if isinstance(self.col_subset_idx, slice) and isinstance(self.row_subset_idx, slice): + if self.col_subset_idx == slice(None, None, None) and self.row_subset_idx == slice(None, None, None): + format_class = get_memory_class(self.format_str) + mtx = format_class(self.shape, dtype=self.dtype) + mtx.data = self.group["data"][...] + mtx.indices = self.group["indices"][...] + mtx.indptr = self.group["indptr"][...] + return mtx + mtx = self.to_backed() + mat = mtx[self.row_subset_idx, self.col_subset_idx] + return mat class CSRDataset(BaseCompressedSparseDataset): @@ -426,3 +480,7 @@ def sparse_dataset(group) -> BaseCompressedSparseDataset: @_subset.register(BaseCompressedSparseDataset) def subset_sparsedataset(d, subset_idx): return d[subset_idx] + +@as_view.register(BaseCompressedSparseDataset) +def _view_masked(a: BaseCompressedSparseDataset, view_args): + return a \ No newline at end of file diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index d2f17a531..08d768a0d 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -16,7 +16,7 @@ from anndata._core.anndata_base import AbstractAnnData from anndata._core.index import Index, _normalize_indices, _subset from anndata._core.raw import Raw -from anndata._core.sparse_dataset import get_backed_class +from anndata._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset from anndata._core.views import _resolve_idxs from anndata.compat import DaskArray from anndata.utils import asarray, convert_to_dict @@ -122,7 +122,6 @@ def __init__( self.obsm = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obsm)) self.varm = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(varm)) - self.obsp = PairwiseArrays(adata_ref, 0, vals=convert_to_dict(obsp)) self.varp = PairwiseArrays(adata_ref, 1, vals=convert_to_dict(varp)) self.layers = Layers(adata_ref, layers) @@ -174,6 +173,8 @@ def backed_dict_to_memory(d): v, LazyMaskedArray ): res[k] = v[...] 
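# `_subset` and `as_view`, extended earlier in this commit via `.register(...)`,
# are functools.singledispatch functions: the handler is chosen by the type of
# the first argument, which is how backed containers get lazy subsetting while
# plain arrays stay eager. A self-contained sketch of the pattern
# (`LazyThing` is an invented stand-in, not an anndata class):
from functools import singledispatch


class LazyThing:
    def __init__(self, idx=None):
        self.idx = idx  # record the selection instead of applying it


@singledispatch
def subset(a, idx):
    return a[idx]  # default handler: index eagerly


@subset.register(LazyThing)
def _(a: LazyThing, idx):
    return LazyThing(idx)  # lazy handler: defer the read


assert subset([10, 20, 30], 1) == 20
assert subset(LazyThing(), 1).idx == 1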
+ elif issubclass(BaseCompressedSparseDataset, type(v)): + X = v.to_memory() else: res[k] = v return res @@ -187,7 +188,10 @@ def backed_dict_to_memory(d): layers = backed_dict_to_memory(dict(self.layers)) X = None if not exclude_X: - X = asarray(self.X.compute()) + if isinstance(self.X, BaseCompressedSparseDataset): + X = self.X.to_memory() + else: + X = self.X.compute() return AnnData( X=X, obs=obs, @@ -393,15 +397,7 @@ def callback(func, elem_name: str, elem, iospec): elif iospec.encoding_type in {"array", "string-array"}: return da.from_zarr(elem) elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: - format_str = iospec.encoding_type[0:3] - format_class = get_backed_class(format_str) - backed = format_class(tuple(elem.attrs['shape']), dtype=elem["data"].dtype) - backed.data = da.from_zarr(elem["data"]) - backed.indices = da.from_zarr(elem["indices"]) - backed.indptr = da.from_zarr(elem["indptr"]) - meta = format_class((1, 1), dtype=elem["data"].dtype) - arr = da.from_array(backed, meta=meta, name=False, asarray=False) - return arr + return sparse_dataset(elem) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) elif iospec.encoding_type in {"dataframe"}: diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 86b627e81..f0703d4a3 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -278,9 +278,16 @@ def test_read_full(tmp_path, mtx_format): orig_pth = base_pth / "orig.zarr" adata.write_zarr(orig_pth) remote = read_backed(orig_pth) - base_x = asarray(adata.X) - remote_x = asarray(remote.X.compute()) - assert np.all(remote_x == base_x) + if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: + assert np.all( + asarray(adata.X) + == asarray(remote.X.to_memory()) + ) + else: + assert np.all( + asarray(adata.X) + == asarray(remote.X.compute()) + ) assert (adata.obs == remote.obs.to_df()[adata.obs.columns]).all().all() assert (adata.var == remote.var.to_df()[adata.var.columns]).all().all() assert (adata.obsm["array"] == remote.obsm["array"].compute()).all() @@ -303,7 +310,10 @@ def test_read_view(tmp_path, mtx_format): adata.write_zarr(orig_pth) remote = read_backed(orig_pth) subset = adata.obs["obs_cat"] == "a" - assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X.compute())) + if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: + assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X.to_memory())) + else: + assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X.compute())) assert ( (adata[subset, :].obs == remote[subset, :].obs.to_df()[adata.obs.columns]) .all() @@ -329,10 +339,16 @@ def test_read_view_of_view(tmp_path, mtx_format): subsetted_adata = adata[subset, :] subset_subset = subsetted_adata.obs["obs_cat"] == "b" subsetted_subsetted_adata = subsetted_adata[subset_subset, :] - assert np.all( - asarray(subsetted_subsetted_adata.X) - == asarray(remote[subset, :][subset_subset, :].X.compute()) - ) + if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: + assert np.all( + asarray(subsetted_subsetted_adata.X) + == asarray(remote[subset, :][subset_subset, :].X.to_memory()) + ) + else: + assert np.all( + asarray(subsetted_subsetted_adata.X) + == asarray(remote[subset, :][subset_subset, :].X.compute()) + ) assert ( ( subsetted_subsetted_adata.obs @@ -409,8 +425,8 @@ def 
test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert (nullable_boolean_lazy_arr_no_mask[0] is True).all() - assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() + assert (nullable_boolean_lazy_arr_no_mask[0] == True).all() + assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From 15bbf785fc5b7fd4ac04186caf9b3f43823d2e29 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 25 May 2023 15:49:40 +0000 Subject: [PATCH 098/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/_core/sparse_dataset.py | 29 ++++++++++++++----- .../experimental/read_backed/read_backed.py | 2 +- .../tests/test_read_backed_experimental.py | 22 +++++++------- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index fe91e48ab..806f950fd 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -308,7 +308,7 @@ def _check_group_format(cls, group): @property def name(self) -> str: return self.group.name - + def get_backing_shape(self) -> Tuple[int, int]: shape = _read_attr(self.group.attrs, "shape", None) if shape is None: @@ -319,8 +319,12 @@ def get_backing_shape(self) -> Tuple[int, int]: @property def shape(self) -> Tuple[int, int]: shape = self.get_backing_shape() - if isinstance(self.col_subset_idx, slice) and isinstance(self.row_subset_idx, slice): - if self.col_subset_idx == slice(None, None, None) and self.row_subset_idx == slice(None, None, None): + if isinstance(self.col_subset_idx, slice) and isinstance( + self.row_subset_idx, slice + ): + if self.col_subset_idx == slice( + None, None, None + ) and self.row_subset_idx == slice(None, None, None): return tuple(shape) row_length = 0 col_length = 0 @@ -330,14 +334,18 @@ def shape(self) -> Tuple[int, int]: else: row_length = self.row_subset_idx.stop - self.row_subset_idx.start else: - row_length = len(self.row_subset_idx.flatten()) # can we assume a flatten method? + row_length = len( + self.row_subset_idx.flatten() + ) # can we assume a flatten method? if isinstance(self.col_subset_idx, slice): if self.col_subset_idx == slice(None, None, None): col_length = shape[1] else: col_length = self.col_subset_idx.stop - self.col_subset_idx.start else: - col_length = len(self.col_subset_idx.flatten()) # can we assume a flatten method? + col_length = len( + self.col_subset_idx.flatten() + ) # can we assume a flatten method? return (row_length, col_length) @property @@ -447,8 +455,12 @@ def to_backed(self) -> BackedSparseMatrix: return mtx def to_memory(self) -> ss.spmatrix: - if isinstance(self.col_subset_idx, slice) and isinstance(self.row_subset_idx, slice): - if self.col_subset_idx == slice(None, None, None) and self.row_subset_idx == slice(None, None, None): + if isinstance(self.col_subset_idx, slice) and isinstance( + self.row_subset_idx, slice + ): + if self.col_subset_idx == slice( + None, None, None + ) and self.row_subset_idx == slice(None, None, None): format_class = get_memory_class(self.format_str) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"][...] 
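# The `== True` / `is True` assertions keep flipping in the test diffs of
# these pre-commit rounds: a lint autofix for literal-boolean comparisons
# collides with numpy semantics, where only the elementwise comparison is
# meaningful. Illustration (not part of the patch):
import numpy as np

arr = np.array([True, True])
assert (arr == True).all()     # elementwise comparison, what the tests intend
assert (arr is True) is False  # identity against the True singleton
# `(arr is True)` is a plain bool, so calling `.all()` on it raises
# AttributeError, which is why the autofixed form cannot pass and the
# author keeps reverting it by hand.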
@@ -481,6 +493,7 @@ def sparse_dataset(group) -> BaseCompressedSparseDataset: def subset_sparsedataset(d, subset_idx): return d[subset_idx] + @as_view.register(BaseCompressedSparseDataset) def _view_masked(a: BaseCompressedSparseDataset, view_args): - return a \ No newline at end of file + return a diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 08d768a0d..9571b8dc6 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -174,7 +174,7 @@ def backed_dict_to_memory(d): ): res[k] = v[...] elif issubclass(BaseCompressedSparseDataset, type(v)): - X = v.to_memory() + v.to_memory() else: res[k] = v return res diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 68afdfa8f..b3b9d05d8 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -281,15 +281,9 @@ def test_read_full(tmp_path, mtx_format): adata.write_zarr(orig_pth) remote = read_backed(orig_pth) if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: - assert np.all( - asarray(adata.X) - == asarray(remote.X.to_memory()) - ) + assert np.all(asarray(adata.X) == asarray(remote.X.to_memory())) else: - assert np.all( - asarray(adata.X) - == asarray(remote.X.compute()) - ) + assert np.all(asarray(adata.X) == asarray(remote.X.compute())) assert (adata.obs == remote.obs.to_df()[adata.obs.columns]).all().all() assert (adata.var == remote.var.to_df()[adata.var.columns]).all().all() assert (adata.obsm["array"] == remote.obsm["array"].compute()).all() @@ -313,9 +307,13 @@ def test_read_view(tmp_path, mtx_format): remote = read_backed(orig_pth) subset = adata.obs["obs_cat"] == "a" if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: - assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X.to_memory())) + assert np.all( + asarray(adata[subset, :].X) == asarray(remote[subset, :].X.to_memory()) + ) else: - assert np.all(asarray(adata[subset, :].X) == asarray(remote[subset, :].X.compute())) + assert np.all( + asarray(adata[subset, :].X) == asarray(remote[subset, :].X.compute()) + ) assert ( (adata[subset, :].obs == remote[subset, :].obs.to_df()[adata.obs.columns]) .all() @@ -427,8 +425,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert (nullable_boolean_lazy_arr_no_mask[0] == True).all() - assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() + assert (nullable_boolean_lazy_arr_no_mask[0] is True).all() + assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From db2a3ef43c3dfd88c601cebee803dbf72f8cb503 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Fri, 26 May 2023 13:11:08 +0200 Subject: [PATCH 099/125] (fix): ensure old tests pass --- anndata/_core/sparse_dataset.py | 34 ++++++++++++++++++++------------- anndata/_io/specs/methods.py | 4 ++-- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index fe91e48ab..21fee9859 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -269,12 +269,19 @@ class BaseCompressedSparseDataset(ABC): def __init__(self, group: Union[h5py.Group, zarr.Group]): type(self)._check_group_format(group) 
self.group = group - self._row_subset_idx = slice(None, None) - self._col_subset_idx = slice(None, None) + self._row_subset_idx = slice(None, None, None) + self._col_subset_idx = slice(None, None, None) @property def row_subset_idx(self): return self._row_subset_idx + + @property + def has_subset_idx(self): + if isinstance(self.col_subset_idx, slice) and isinstance(self.row_subset_idx, slice): + if self.col_subset_idx == slice(None, None, None) and self.row_subset_idx == slice(None, None, None): + return True + return False @row_subset_idx.setter def row_subset_idx(self, new_idx): @@ -319,9 +326,8 @@ def get_backing_shape(self) -> Tuple[int, int]: @property def shape(self) -> Tuple[int, int]: shape = self.get_backing_shape() - if isinstance(self.col_subset_idx, slice) and isinstance(self.row_subset_idx, slice): - if self.col_subset_idx == slice(None, None, None) and self.row_subset_idx == slice(None, None, None): - return tuple(shape) + if self.has_subset_idx: + return tuple(shape) row_length = 0 col_length = 0 if isinstance(self.row_subset_idx, slice): @@ -447,17 +453,19 @@ def to_backed(self) -> BackedSparseMatrix: return mtx def to_memory(self) -> ss.spmatrix: - if isinstance(self.col_subset_idx, slice) and isinstance(self.row_subset_idx, slice): - if self.col_subset_idx == slice(None, None, None) and self.row_subset_idx == slice(None, None, None): - format_class = get_memory_class(self.format_str) - mtx = format_class(self.shape, dtype=self.dtype) - mtx.data = self.group["data"][...] - mtx.indices = self.group["indices"][...] - mtx.indptr = self.group["indptr"][...] - return mtx + if self.has_subset_idx: + format_class = get_memory_class(self.format_str) + mtx = format_class(self.shape, dtype=self.dtype) + mtx.data = self.group["data"][...] + mtx.indices = self.group["indices"][...] + mtx.indptr = self.group["indptr"][...] 
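# The branch above is the no-subset fast path: the matrix is rebuilt wholesale
# from the three buffers that encode a compressed sparse matrix on disk (the
# guarding property is renamed to `has_no_subset_idx` in the next commit).
# The same construction with toy buffers:
import numpy as np
import scipy.sparse as ss

data = np.array([1.0, 2.0, 3.0])  # nonzero values
indices = np.array([0, 2, 1])     # column of each value (CSR layout)
indptr = np.array([0, 2, 3, 3])   # row i owns data[indptr[i]:indptr[i + 1]]
mtx = ss.csr_matrix((3, 3), dtype=data.dtype)
mtx.data, mtx.indices, mtx.indptr = data, indices, indptr
# equivalently: ss.csr_matrix((data, indices, indptr), shape=(3, 3))
assert mtx.toarray()[0, 2] == 2.0 and mtx.toarray()[1, 1] == 3.0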
+ return mtx mtx = self.to_backed() mat = mtx[self.row_subset_idx, self.col_subset_idx] return mat + + def toarray(self) -> np.ndarray: + return self.to_memory().toarray() class CSRDataset(BaseCompressedSparseDataset): diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 9e8f71e38..a4f172716 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -484,7 +484,7 @@ def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({} write_sparse_compressed( f, k, - elem.to_backed(), + (elem.to_backed() if elem.has_subset_idx else elem.to_memory()), # if there is a subset on the elem, to_memory lazily reads in __only__ the subset _writer, fmt=elem.format_str, dataset_kwargs=dataset_kwargs, @@ -507,7 +507,7 @@ def read_sparse(elem, _reader): @_REGISTRY.register_read_partial(ZarrGroup, IOSpec("csc_matrix", "0.1.0")) @_REGISTRY.register_read_partial(ZarrGroup, IOSpec("csr_matrix", "0.1.0")) def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))): - return sparse_dataset(elem)[indices] + return sparse_dataset(elem)[indices].to_memory() ################# From e93306c431c2caec51573846f2cf40a55d81a2f9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 31 May 2023 16:36:20 +0200 Subject: [PATCH 100/125] (fix): name variable accurately --- anndata/_core/sparse_dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index cceed6308..eba403923 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -277,7 +277,7 @@ def row_subset_idx(self): return self._row_subset_idx @property - def has_subset_idx(self): + def has_no_subset_idx(self): if isinstance(self.col_subset_idx, slice) and isinstance(self.row_subset_idx, slice): if self.col_subset_idx == slice(None, None, None) and self.row_subset_idx == slice(None, None, None): return True @@ -326,7 +326,7 @@ def get_backing_shape(self) -> Tuple[int, int]: @property def shape(self) -> Tuple[int, int]: shape = self.get_backing_shape() - if self.has_subset_idx: + if self.has_no_subset_idx: return tuple(shape) row_length = 0 col_length = 0 @@ -457,7 +457,7 @@ def to_backed(self) -> BackedSparseMatrix: return mtx def to_memory(self) -> ss.spmatrix: - if self.has_subset_idx: + if self.has_no_subset_idx: format_class = get_memory_class(self.format_str) mtx = format_class(self.shape, dtype=self.dtype) mtx.data = self.group["data"][...] From c9de21d3f408cdcd3d30f9a5a3c183b58449bd4a Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 1 Jun 2023 11:58:33 +0200 Subject: [PATCH 101/125] (fix): use correct access pattern for `to_memory` --- anndata/_core/sparse_dataset.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index eba403923..e7f5af267 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -465,8 +465,11 @@ def to_memory(self) -> ss.spmatrix: mtx.indptr = self.group["indptr"][...] 
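# The `to_memory` change just below orders the two selections by format:
# rows first for CSR, columns first for CSC, so the first selection runs
# along each format's efficient axis. On plain scipy matrices either order
# gives the same contents; a throwaway equivalence check:
import numpy as np
import scipy.sparse as ss

x = ss.random(60, 40, density=0.1, format="csr", random_state=0)
rows, cols = np.arange(5, 30), np.arange(0, 10)
a = x[rows, :][:, cols]   # rows first, cheap for CSR
b = x[:, cols][rows, :]   # columns first, cheap for CSC
assert (a != b).nnz == 0  # identical results either way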
return mtx mtx = self.to_backed() - mat = mtx[self.row_subset_idx, self.col_subset_idx] - return mat + if self.format_str == 'csr': + mtx = mtx[self.row_subset_idx, :] + return mtx[:, self.col_subset_idx] + mtx = mtx[:, self.col_subset_idx] + return mtx[self.row_subset_idx, :] def toarray(self) -> np.ndarray: return self.to_memory().toarray() From 7c911a851bb83e363d5e4a702062545ee08fcc21 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 1 Jun 2023 11:58:50 +0200 Subject: [PATCH 102/125] (feat): add `exclude` feature for `to_memory` --- .../read_backed/lazy_axis_arrays.py | 13 +++++++--- .../experimental/read_backed/read_backed.py | 26 ++++++++++--------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index f2fec7fd7..2581a6604 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -18,18 +18,19 @@ def columns(self) -> List: return list(self.keys()) -def to_df_1d_axis_arrays(axis_arrays): +def to_df_1d_axis_arrays(axis_arrays: AxisArrays, exclude=[]): """Convert to pandas dataframe.""" df = pd.DataFrame(index=axis_arrays.dim_names[...]) for key in axis_arrays.keys(): - if "index" not in key: + full_key = axis_arrays.attrname + '/' + key + if "index" not in key and all([full_key != exclude_key for exclude_key in exclude]): df[key] = axis_arrays[key][...] return df class AxisArraysRemote1dMixin: - def to_df(self) -> pd.DataFrame: - return to_df_1d_axis_arrays(self) + def to_df(self, exclude=[]) -> pd.DataFrame: + return to_df_1d_axis_arrays(self, exclude) @property def iloc(self): @@ -52,6 +53,10 @@ def _repr_html_(self): def _repr_latex_(self): return self.__repr__() + + @property + def attrname(self) -> str: + return self.dim class AxisArrays1dRemote(AxisArraysRemote1dMixin, AxisArraysRemote): diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 24ef2713f..80ae4513d 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -163,31 +163,33 @@ def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]: pd.Index(self.var_names.compute()), ) - def to_memory(self, exclude_X=False): - def backed_dict_to_memory(d): + def to_memory(self, exclude=[]): + def backed_dict_to_memory(d, prefix): res = {} for k, v in d.items(): + full_key = prefix + '/' + k + if any([full_key == exclude_key for exclude_key in exclude]): + continue if isinstance(v, DaskArray): res[k] = v.compute() elif isinstance(v, LazyCategoricalArray) or isinstance( v, LazyMaskedArray ): res[k] = v[...] 
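# The `exclude` parameter threaded through `to_memory` above filters elements
# by their full path, e.g. "obsm/array" or plain "X". A tiny self-contained
# sketch of that matching ("array" is an obsm key from the tests;
# "other_key" is invented):
exclude = ["X", "obsm/array"]


def keep(prefix: str, key: str) -> bool:
    return f"{prefix}/{key}" not in exclude


assert not keep("obsm", "array")
assert keep("obsm", "other_key")
# e.g. remote.to_memory(exclude=["X", "obsm/array"]) would materialize
# everything else while leaving X and obsm["array"] on disk.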
- elif issubclass(BaseCompressedSparseDataset, type(v)): + elif isinstance(v, BaseCompressedSparseDataset): res[k] = v.to_memory() else: res[k] = v return res - - obs = self.obs.to_df() - var = self.var.to_df() - obsm = backed_dict_to_memory(dict(self.obsm)) - varm = backed_dict_to_memory(dict(self.varm)) - varp = backed_dict_to_memory(dict(self.varp)) - obsp = backed_dict_to_memory(dict(self.obsp)) - layers = backed_dict_to_memory(dict(self.layers)) + obs = self.obs.to_df(exclude) + var = self.var.to_df(exclude) + obsm = backed_dict_to_memory(dict(self.obsm), 'obsm') + varm = backed_dict_to_memory(dict(self.varm), 'varm') + varp = backed_dict_to_memory(dict(self.varp), 'varp') + obsp = backed_dict_to_memory(dict(self.obsp), 'obsp') + layers = backed_dict_to_memory(dict(self.layers), 'layers') X = None - if not exclude_X: + if 'X' not in exclude: if isinstance(self.X, BaseCompressedSparseDataset): X = self.X.to_memory() else: From 41189283cba1331cbced7ac482e8bfe9aa03a2ca Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 1 Jun 2023 16:08:01 +0200 Subject: [PATCH 103/125] (fix): efficient reading in of matrices by splitting up reading over different axes --- anndata/_core/sparse_dataset.py | 50 +++++++---- .../tests/test_read_backed_experimental.py | 88 +++++++++++++++---- 2 files changed, 106 insertions(+), 32 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index e7f5af267..0428088f8 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -274,25 +274,40 @@ def __init__(self, group: Union[h5py.Group, zarr.Group]): @property def row_subset_idx(self): + if isinstance(self._row_subset_idx, np.ndarray): + return self._row_subset_idx.flatten() # why???? return self._row_subset_idx @property def has_no_subset_idx(self): - if isinstance(self.col_subset_idx, slice) and isinstance(self.row_subset_idx, slice): - if self.col_subset_idx == slice(None, None, None) and self.row_subset_idx == slice(None, None, None): + return self.has_no_col_subset_idx and self.has_no_row_subset_idx + + @property + def has_no_col_subset_idx(self): + if isinstance(self.col_subset_idx, slice): + if self.col_subset_idx == slice(None, None, None) or self.col_subset_idx == slice(0, self.get_backing_shape()[1], 1): return True - return False + return False + + @property + def has_no_row_subset_idx(self): + if isinstance(self.row_subset_idx, slice): + if self.row_subset_idx == slice(None, None, None) or self.row_subset_idx == slice(0, self.get_backing_shape()[0], 1): + return True + return False @row_subset_idx.setter def row_subset_idx(self, new_idx): self._row_subset_idx = ( new_idx if self.row_subset_idx is None - else _resolve_idx(self.row_subset_idx, new_idx, self.get_backing_shape()[0]) + else _resolve_idx(self.row_subset_idx, new_idx, self.shape[0]) ) @property def col_subset_idx(self): + if isinstance(self._col_subset_idx, np.ndarray): + return self._col_subset_idx.flatten() return self._col_subset_idx @col_subset_idx.setter @@ -300,7 +315,7 @@ def col_subset_idx(self, new_idx): self._col_subset_idx = ( new_idx if self.col_subset_idx is None - else _resolve_idx(self.col_subset_idx, new_idx, self.get_backing_shape()[1]) + else _resolve_idx(self.col_subset_idx, new_idx, self.shape[1]) ) @property @@ -335,18 +350,18 @@ def shape(self) -> Tuple[int, int]: row_length = shape[0] else: row_length = self.row_subset_idx.stop - self.row_subset_idx.start - else: + else: row_length = len( - self.row_subset_idx.flatten() + self.row_subset_idx ) # can we 
assume a flatten method? if isinstance(self.col_subset_idx, slice): if self.col_subset_idx == slice(None, None, None): col_length = shape[1] else: col_length = self.col_subset_idx.stop - self.col_subset_idx.start - else: + else: col_length = len( - self.col_subset_idx.flatten() + self.col_subset_idx ) # can we assume a flatten method? return (row_length, col_length) @@ -457,19 +472,20 @@ def to_backed(self) -> BackedSparseMatrix: return mtx def to_memory(self) -> ss.spmatrix: - if self.has_no_subset_idx: + # Could not get row idx with csc and vice versa working without reading into memory but shouldn't matter + if (self.format_str == 'csr' and self.has_no_row_subset_idx) or (self.format_str == 'csc' and self.has_no_col_subset_idx): format_class = get_memory_class(self.format_str) - mtx = format_class(self.shape, dtype=self.dtype) + mtx = format_class(self.get_backing_shape(), dtype=self.dtype) mtx.data = self.group["data"][...] mtx.indices = self.group["indices"][...] mtx.indptr = self.group["indptr"][...] - return mtx - mtx = self.to_backed() + if self.has_no_subset_idx: + return mtx + else: + mtx = self.to_backed() if self.format_str == 'csr': - mtx = mtx[self.row_subset_idx, :] - return mtx[:, self.col_subset_idx] - mtx = mtx[:, self.col_subset_idx] - return mtx[self.row_subset_idx, :] + return mtx[self.row_subset_idx, :][:, self.col_subset_idx] + return mtx[:, self.col_subset_idx][self.row_subset_idx, :] def toarray(self) -> np.ndarray: return self.to_memory().toarray() diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index a246f2583..989ae427e 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -305,27 +305,50 @@ def test_read_view(tmp_path, mtx_format): orig_pth = base_pth / "orig.zarr" adata.write_zarr(orig_pth) remote = read_backed(orig_pth) - subset = adata.obs["obs_cat"] == "a" + subset_obs = adata.obs["obs_cat"] == "a" if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: assert np.all( - asarray(adata[subset, :].X) == asarray(remote[subset, :].X.to_memory()) + asarray(adata[subset_obs, :].X) == asarray(remote[subset_obs, :].X.to_memory()) ) else: assert np.all( - asarray(adata[subset, :].X) == asarray(remote[subset, :].X.compute()) + asarray(adata[subset_obs, :].X) == asarray(remote[subset_obs, :].X.compute()) ) assert ( - (adata[subset, :].obs == remote[subset, :].obs.to_df()[adata.obs.columns]) + (adata[subset_obs, :].obs == remote[subset_obs, :].obs.to_df()[adata.obs.columns]) .all() .all() ) assert ( - (adata[subset, :].var == remote[subset, :].var.to_df()[adata.var.columns]) + (adata[subset_obs, :].var == remote[subset_obs, :].var.to_df()[adata.var.columns]) .all() .all() ) assert ( - adata[subset, :].obsm["array"] == remote[subset, :].obsm["array"].compute() + adata[subset_obs, :].obsm["array"] == remote[subset_obs, :].obsm["array"].compute() + ).all() + + subset_var = adata.var["var_cat"] == "a" + if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: + assert np.all( + asarray(adata[:, subset_var].X) == asarray(remote[:, subset_var].X.to_memory()) + ) + else: + assert np.all( + asarray(adata[:, subset_var].X) == asarray(remote[:, subset_var].X.compute()) + ) + assert ( + (adata[:, subset_var].obs == remote[:, subset_var].obs.to_df()[adata.obs.columns]) + .all() + .all() + ) + assert ( + (adata[:, subset_var].var == remote[:, subset_var].var.to_df()[adata.var.columns]) + .all() + .all() + ) + assert ( + adata[:, 
subset_var].obsm["array"] == remote[:, subset_var].obsm["array"].compute() ).all() @@ -335,24 +358,59 @@ def test_read_view_of_view(tmp_path, mtx_format): orig_pth = base_pth / "orig.zarr" adata.write_zarr(orig_pth) remote = read_backed(orig_pth) - subset = (adata.obs["obs_cat"] == "a") | (adata.obs["obs_cat"] == "b") - subsetted_adata = adata[subset, :] - subset_subset = subsetted_adata.obs["obs_cat"] == "b" - subsetted_subsetted_adata = subsetted_adata[subset_subset, :] + subset_obs = (adata.obs["obs_cat"] == "a") | (adata.obs["obs_cat"] == "b") + subsetted_adata = adata[subset_obs, :] + subset_subset_obs = subsetted_adata.obs["obs_cat"] == "b" + subsetted_subsetted_adata = subsetted_adata[subset_subset_obs, :] + if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: + assert np.all( + asarray(subsetted_subsetted_adata.X) + == asarray(remote[subset_obs, :][subset_subset_obs, :].X.to_memory()) + ) + else: + assert np.all( + asarray(subsetted_subsetted_adata.X) + == asarray(remote[subset_obs, :][subset_subset_obs, :].X.compute()) + ) + assert ( + ( + subsetted_subsetted_adata.obs + == remote[subset_obs, :][subset_subset_obs, :].obs.to_df()[adata.obs.columns] + ) + .all() + .all() + ) + assert ( + ( + subsetted_subsetted_adata.var + == remote[subset_obs, :][subset_subset_obs, :].var.to_df()[adata.var.columns] + ) + .all() + .all() + ) + assert ( + subsetted_subsetted_adata.obsm["array"] + == remote[subset_obs, :][subset_subset_obs, :].obsm["array"].compute() + ).all() + + subset_var = (adata.var["var_cat"] == "a") | (adata.var["var_cat"] == "b") + subsetted_adata = adata[:, subset_var] + subset_subset_var = subsetted_adata.var["var_cat"] == "b" + subsetted_subsetted_adata = subsetted_adata[:, subset_subset_var] if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: assert np.all( asarray(subsetted_subsetted_adata.X) - == asarray(remote[subset, :][subset_subset, :].X.to_memory()) + == asarray(remote[:, subset_var][:, subset_subset_var].X.to_memory()) ) else: assert np.all( asarray(subsetted_subsetted_adata.X) - == asarray(remote[subset, :][subset_subset, :].X.compute()) + == asarray(remote[:, subset_var][:, subset_subset_var].X.compute()) ) assert ( ( subsetted_subsetted_adata.obs - == remote[subset, :][subset_subset, :].obs.to_df()[adata.obs.columns] + == remote[:, subset_var][:, subset_subset_var].obs.to_df()[adata.obs.columns] ) .all() .all() @@ -360,14 +418,14 @@ def test_read_view_of_view(tmp_path, mtx_format): assert ( ( subsetted_subsetted_adata.var - == remote[subset, :][subset_subset, :].var.to_df()[adata.var.columns] + == remote[:, subset_var][:, subset_subset_var].var.to_df()[adata.var.columns] ) .all() .all() ) assert ( subsetted_subsetted_adata.obsm["array"] - == remote[subset, :][subset_subset, :].obsm["array"].compute() + == remote[:, subset_var][:, subset_subset_var].obsm["array"].compute() ).all() From 5cedb9ac4a74bde15e2f2af7007bd1961ecedf83 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Thu, 1 Jun 2023 16:21:17 +0200 Subject: [PATCH 104/125] (fix): legacy backed mode --- anndata/_core/anndata.py | 2 ++ anndata/_io/specs/methods.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index dc956e91b..59073e10c 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -602,6 +602,8 @@ def X(self) -> Optional[Union[np.ndarray, sparse.spmatrix, ArrayView]]: # indices that aren’t strictly increasing if self.is_view: X = _subset(X, (self._oidx, self._vidx)) + if 
isinstance(X, BaseCompressedSparseDataset): + X = X.to_memory() elif self.is_view and self._adata_ref.X is None: X = None elif self.is_view: diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index a4f172716..cf4c2622a 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -484,7 +484,7 @@ def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({} write_sparse_compressed( f, k, - (elem.to_backed() if elem.has_subset_idx else elem.to_memory()), # if there is a subset on the elem, to_memory lazily reads in __only__ the subset + elem.to_memory(), # if there is a subset on the elem, to_memory lazily reads in __only__ the subset _writer, fmt=elem.format_str, dataset_kwargs=dataset_kwargs, From 627be81f3c54377ae966af79a2a9e5dde4413c8b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 1 Jun 2023 14:29:56 +0000 Subject: [PATCH 105/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/_core/sparse_dataset.py | 38 ++++++------ anndata/_io/specs/methods.py | 2 +- .../read_backed/lazy_axis_arrays.py | 8 ++- .../experimental/read_backed/read_backed.py | 15 ++--- .../tests/test_read_backed_experimental.py | 58 ++++++++++++++----- 5 files changed, 76 insertions(+), 45 deletions(-) diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 0428088f8..7291ecbe1 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -275,24 +275,28 @@ def __init__(self, group: Union[h5py.Group, zarr.Group]): @property def row_subset_idx(self): if isinstance(self._row_subset_idx, np.ndarray): - return self._row_subset_idx.flatten() # why???? + return self._row_subset_idx.flatten() # why???? return self._row_subset_idx - + @property def has_no_subset_idx(self): return self.has_no_col_subset_idx and self.has_no_row_subset_idx - + @property def has_no_col_subset_idx(self): if isinstance(self.col_subset_idx, slice): - if self.col_subset_idx == slice(None, None, None) or self.col_subset_idx == slice(0, self.get_backing_shape()[1], 1): + if self.col_subset_idx == slice( + None, None, None + ) or self.col_subset_idx == slice(0, self.get_backing_shape()[1], 1): return True return False - + @property def has_no_row_subset_idx(self): if isinstance(self.row_subset_idx, slice): - if self.row_subset_idx == slice(None, None, None) or self.row_subset_idx == slice(0, self.get_backing_shape()[0], 1): + if self.row_subset_idx == slice( + None, None, None + ) or self.row_subset_idx == slice(0, self.get_backing_shape()[0], 1): return True return False @@ -307,7 +311,7 @@ def row_subset_idx(self, new_idx): @property def col_subset_idx(self): if isinstance(self._col_subset_idx, np.ndarray): - return self._col_subset_idx.flatten() + return self._col_subset_idx.flatten() return self._col_subset_idx @col_subset_idx.setter @@ -350,19 +354,15 @@ def shape(self) -> Tuple[int, int]: row_length = shape[0] else: row_length = self.row_subset_idx.stop - self.row_subset_idx.start - else: - row_length = len( - self.row_subset_idx - ) # can we assume a flatten method? + else: + row_length = len(self.row_subset_idx) # can we assume a flatten method? 
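# The slice arithmetic above (`stop - start`) assumes a fully bounded,
# unit-step slice. A general slice length, handling None bounds and steps,
# falls out of slice.indices(); a sketch of the robust alternative:
def slice_len(s: slice, n: int) -> int:
    """Length of `s` applied to an axis of size `n`."""
    return len(range(*s.indices(n)))


assert slice_len(slice(2, 8), 10) == 6
assert slice_len(slice(None, None, 2), 10) == 5
assert slice_len(slice(None), 10) == 10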
if isinstance(self.col_subset_idx, slice): if self.col_subset_idx == slice(None, None, None): col_length = shape[1] else: col_length = self.col_subset_idx.stop - self.col_subset_idx.start - else: - col_length = len( - self.col_subset_idx - ) # can we assume a flatten method? + else: + col_length = len(self.col_subset_idx) # can we assume a flatten method? return (row_length, col_length) @property @@ -473,7 +473,9 @@ def to_backed(self) -> BackedSparseMatrix: def to_memory(self) -> ss.spmatrix: # Could not get row idx with csc and vice versa working without reading into memory but shouldn't matter - if (self.format_str == 'csr' and self.has_no_row_subset_idx) or (self.format_str == 'csc' and self.has_no_col_subset_idx): + if (self.format_str == "csr" and self.has_no_row_subset_idx) or ( + self.format_str == "csc" and self.has_no_col_subset_idx + ): format_class = get_memory_class(self.format_str) mtx = format_class(self.get_backing_shape(), dtype=self.dtype) mtx.data = self.group["data"][...] @@ -483,10 +485,10 @@ def to_memory(self) -> ss.spmatrix: return mtx else: mtx = self.to_backed() - if self.format_str == 'csr': + if self.format_str == "csr": return mtx[self.row_subset_idx, :][:, self.col_subset_idx] return mtx[:, self.col_subset_idx][self.row_subset_idx, :] - + def toarray(self) -> np.ndarray: return self.to_memory().toarray() diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index cf4c2622a..406871e32 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -484,7 +484,7 @@ def write_sparse_dataset(f, k, elem, _writer, dataset_kwargs=MappingProxyType({} write_sparse_compressed( f, k, - elem.to_memory(), # if there is a subset on the elem, to_memory lazily reads in __only__ the subset + elem.to_memory(), # if there is a subset on the elem, to_memory lazily reads in __only__ the subset _writer, fmt=elem.format_str, dataset_kwargs=dataset_kwargs, diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index 2581a6604..132af59f7 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -22,8 +22,10 @@ def to_df_1d_axis_arrays(axis_arrays: AxisArrays, exclude=[]): """Convert to pandas dataframe.""" df = pd.DataFrame(index=axis_arrays.dim_names[...]) for key in axis_arrays.keys(): - full_key = axis_arrays.attrname + '/' + key - if "index" not in key and all([full_key != exclude_key for exclude_key in exclude]): + full_key = axis_arrays.attrname + "/" + key + if "index" not in key and all( + [full_key != exclude_key for exclude_key in exclude] + ): df[key] = axis_arrays[key][...] 
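# `axis_arrays[key][...]` above is the Ellipsis spelling used throughout this
# branch to pull a whole lazy column into memory; zarr and h5py datasets read
# fully under `[...]`, and for numpy it is simply the whole array:
import numpy as np

arr = np.arange(4)
assert np.array_equal(arr[...], arr)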
return df @@ -53,7 +55,7 @@ def _repr_html_(self): def _repr_latex_(self): return self.__repr__() - + @property def attrname(self) -> str: return self.dim diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 80ae4513d..e78c0dfd3 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -167,7 +167,7 @@ def to_memory(self, exclude=[]): def backed_dict_to_memory(d, prefix): res = {} for k, v in d.items(): - full_key = prefix + '/' + k + full_key = prefix + "/" + k if any([full_key == exclude_key for exclude_key in exclude]): continue if isinstance(v, DaskArray): @@ -181,15 +181,16 @@ def backed_dict_to_memory(d, prefix): else: res[k] = v return res + obs = self.obs.to_df(exclude) var = self.var.to_df(exclude) - obsm = backed_dict_to_memory(dict(self.obsm), 'obsm') - varm = backed_dict_to_memory(dict(self.varm), 'varm') - varp = backed_dict_to_memory(dict(self.varp), 'varp') - obsp = backed_dict_to_memory(dict(self.obsp), 'obsp') - layers = backed_dict_to_memory(dict(self.layers), 'layers') + obsm = backed_dict_to_memory(dict(self.obsm), "obsm") + varm = backed_dict_to_memory(dict(self.varm), "varm") + varp = backed_dict_to_memory(dict(self.varp), "varp") + obsp = backed_dict_to_memory(dict(self.obsp), "obsp") + layers = backed_dict_to_memory(dict(self.layers), "layers") X = None - if 'X' not in exclude: + if "X" not in exclude: if isinstance(self.X, BaseCompressedSparseDataset): X = self.X.to_memory() else: diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 989ae427e..fabd09d15 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -308,47 +308,65 @@ def test_read_view(tmp_path, mtx_format): subset_obs = adata.obs["obs_cat"] == "a" if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: assert np.all( - asarray(adata[subset_obs, :].X) == asarray(remote[subset_obs, :].X.to_memory()) + asarray(adata[subset_obs, :].X) + == asarray(remote[subset_obs, :].X.to_memory()) ) else: assert np.all( - asarray(adata[subset_obs, :].X) == asarray(remote[subset_obs, :].X.compute()) + asarray(adata[subset_obs, :].X) + == asarray(remote[subset_obs, :].X.compute()) ) assert ( - (adata[subset_obs, :].obs == remote[subset_obs, :].obs.to_df()[adata.obs.columns]) + ( + adata[subset_obs, :].obs + == remote[subset_obs, :].obs.to_df()[adata.obs.columns] + ) .all() .all() ) assert ( - (adata[subset_obs, :].var == remote[subset_obs, :].var.to_df()[adata.var.columns]) + ( + adata[subset_obs, :].var + == remote[subset_obs, :].var.to_df()[adata.var.columns] + ) .all() .all() ) assert ( - adata[subset_obs, :].obsm["array"] == remote[subset_obs, :].obsm["array"].compute() + adata[subset_obs, :].obsm["array"] + == remote[subset_obs, :].obsm["array"].compute() ).all() subset_var = adata.var["var_cat"] == "a" if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: assert np.all( - asarray(adata[:, subset_var].X) == asarray(remote[:, subset_var].X.to_memory()) + asarray(adata[:, subset_var].X) + == asarray(remote[:, subset_var].X.to_memory()) ) else: assert np.all( - asarray(adata[:, subset_var].X) == asarray(remote[:, subset_var].X.compute()) + asarray(adata[:, subset_var].X) + == asarray(remote[:, subset_var].X.compute()) ) assert ( - (adata[:, subset_var].obs == remote[:, subset_var].obs.to_df()[adata.obs.columns]) + ( + adata[:, subset_var].obs + == 
remote[:, subset_var].obs.to_df()[adata.obs.columns] + ) .all() .all() ) assert ( - (adata[:, subset_var].var == remote[:, subset_var].var.to_df()[adata.var.columns]) + ( + adata[:, subset_var].var + == remote[:, subset_var].var.to_df()[adata.var.columns] + ) .all() .all() ) assert ( - adata[:, subset_var].obsm["array"] == remote[:, subset_var].obsm["array"].compute() + adata[:, subset_var].obsm["array"] + == remote[:, subset_var].obsm["array"].compute() ).all() @@ -375,7 +393,9 @@ def test_read_view_of_view(tmp_path, mtx_format): assert ( ( subsetted_subsetted_adata.obs - == remote[subset_obs, :][subset_subset_obs, :].obs.to_df()[adata.obs.columns] + == remote[subset_obs, :][subset_subset_obs, :].obs.to_df()[ + adata.obs.columns + ] ) .all() .all() @@ -383,7 +403,9 @@ def test_read_view_of_view(tmp_path, mtx_format): assert ( ( subsetted_subsetted_adata.var - == remote[subset_obs, :][subset_subset_obs, :].var.to_df()[adata.var.columns] + == remote[subset_obs, :][subset_subset_obs, :].var.to_df()[ + adata.var.columns + ] ) .all() .all() @@ -410,7 +432,9 @@ def test_read_view_of_view(tmp_path, mtx_format): assert ( ( subsetted_subsetted_adata.obs - == remote[:, subset_var][:, subset_subset_var].obs.to_df()[adata.obs.columns] + == remote[:, subset_var][:, subset_subset_var].obs.to_df()[ + adata.obs.columns + ] ) .all() .all() @@ -418,7 +442,9 @@ def test_read_view_of_view(tmp_path, mtx_format): assert ( ( subsetted_subsetted_adata.var - == remote[:, subset_var][:, subset_subset_var].var.to_df()[adata.var.columns] + == remote[:, subset_var][:, subset_subset_var].var.to_df()[ + adata.var.columns + ] ) .all() .all() @@ -483,8 +509,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert (nullable_boolean_lazy_arr_no_mask[0] == True).all() - assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() + assert (nullable_boolean_lazy_arr_no_mask[0] is True).all() + assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From b330443d98198eabfc496c69b63b08725eedae59 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 6 Jun 2023 14:31:33 +0200 Subject: [PATCH 106/125] (fix): remove `all` --- anndata/tests/test_read_backed_experimental.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 989ae427e..74c56c0fe 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -483,7 +483,7 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert (nullable_boolean_lazy_arr_no_mask[0] == True).all() + assert nullable_boolean_lazy_arr_no_mask[0] == True assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From 64f178ddabef475e70508ad7db299faa163daea5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 14 Jun 2023 11:49:26 +0200 Subject: [PATCH 107/125] (chore): add docstrings --- .../experimental/read_backed/lazy_arrays.py | 43 ++++++++++++++++--- .../read_backed/lazy_axis_arrays.py | 20 ++++++++- .../experimental/read_backed/read_backed.py | 10 +++++ 3 files changed, 65 insertions(+), 8 deletions(-) diff --git 
a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 3565ac10d..70fb10719 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -8,7 +8,15 @@ class MaskedArrayMixIn(ExplicitlyIndexedNDArrayMixin): - def _resolve_idx(self, new_idx): + def _resolve_idx(self, new_idx: Index) -> Index: + """Wrapper for resolving the idx against (potentially) already existing `self.subset_idx` + + Args: + new_idx (Index): new indices + + Returns: + Index: The resolved idx, i.e. `new_idx` applied within any existing `self.subset_idx` + """ return ( new_idx if self.subset_idx is None @@ -16,7 +24,12 @@ def _resolve_idx(self, new_idx): ) @property - def subset_idx(self): + def subset_idx(self) -> Index: + """The subset index currently applied to this array, if any + + Returns: + Index: The indices for this array + """ return self._subset_idx @subset_idx.setter @@ -25,6 +38,11 @@ def subset_idx(self, new_idx): @property def shape(self) -> Tuple[int, ...]: + """Shape of this array + + Returns: + Tuple[int, ...]: A shape that looks like a 1-d shape i.e., (#, ) + """ if self.subset_idx is None: return self.values.shape if isinstance(self.subset_idx, slice): @@ -55,7 +73,9 @@ def __init__(self, codes, categories, attrs, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group Args: - group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr + codes (zarr.Array): values (integers) of the array, one for each element + categories (zarr.Array): mappings from values to strings + attrs (zarr.Array): attrs containing boolean "ordered" """ self.values = codes self._categories = categories @@ -91,7 +111,12 @@ def __getitem__(self, selection) -> pd.Categorical: def __repr__(self) -> str: return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" - def copy(self): + def copy(self) -> "LazyCategoricalArray": + """Returns a copy of this array which can then be safely edited + + Returns: + LazyCategoricalArray: copied LazyCategoricalArray + """ arr = LazyCategoricalArray( self.values, self._categories, self.attrs ) # self.categories reads in data @@ -106,7 +131,8 @@ def __init__(self, values, mask, dtype_str, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group Args: - group (zarr.Group): group containing "codes" and "categories" key as well as "ordered" attr + values (zarr.Array): Integer/Boolean array of values + mask (zarr.Array): mask indicating which values are non-null dtype_str (Nullable): one of `nullable-integer` or `nullable-boolean` """ self.values = values @@ -142,7 +168,12 @@ def __repr__(self) -> str: elif self._dtype_str == "nullable-boolean": return "LazyNullableBooleanArray" - def copy(self): + def copy(self) -> "LazyMaskedArray": + """Returns a copy of this array which can then be safely edited + + Returns: + LazyMaskedArray: copied LazyMaskedArray + """ arr = LazyMaskedArray(self.values, self.mask, self._dtype_str) arr.subset_idx = self.subset_idx return arr diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index 132af59f7..52d1cf7e7 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -18,8 +18,16 @@ def columns(self) -> List: return list(self.keys()) -def to_df_1d_axis_arrays(axis_arrays: AxisArrays, exclude=[]): - """Convert to pandas dataframe.""" +def 
to_df_1d_axis_arrays(axis_arrays: AxisArrays, exclude=[]) -> pd.DataFrame: + """Convert an axis array to a dataframe in memory + + Args: + axis_arrays (AxisArrays): AxisArrays to be converted + exclude (list, optional): Keys to exclude from being loaded into the DataFrame/Memory. Defaults to []. + + Returns: + pd.DataFrame: Potential subset of `axis_arrays` in memory + """ df = pd.DataFrame(index=axis_arrays.dim_names[...]) for key in axis_arrays.keys(): full_key = axis_arrays.attrname + "/" + key @@ -32,6 +40,14 @@ def to_df_1d_axis_arrays(axis_arrays: AxisArrays, exclude=[]): class AxisArraysRemote1dMixin: def to_df(self, exclude=[]) -> pd.DataFrame: + """Convert to a DataFrame + + Args: + exclude (list, optional): Keys to exclude from being loaded into the DataFrame/Memory. Defaults to []. + + Returns: + pd.DataFrame: Potential subset of `axis_arrays` in memory + """ return to_df_1d_axis_arrays(self, exclude) @property diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index e78c0dfd3..824ace6bc 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -361,6 +361,16 @@ def __repr__(self): def read_backed(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: + """Lazily read in on-disk/in-cloud AnnData stores. A new, but familiar, AnnData object will be returned. + No array data should need to be read into memory, with exception of non-obs/var dataframes and Awkward Arrays. + + Args: + store (Union[str, Path, MutableMapping, zarr.Group]): A store-like object to be read in. If `zarr`, it is best + for it to be consolidated. + + Returns: + AnnData: A lazily read-in AnnData object. + """ if isinstance(store, Path): store = str(store) From 379c0222a6d5f277b670ad0971957424b8511ef9 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Jun 2023 10:02:08 +0000 Subject: [PATCH 108/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/tests/test_read_backed_experimental.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 12118da01..3760b5485 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -509,8 +509,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert nullable_boolean_lazy_arr_no_mask[0] == True - assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() + assert nullable_boolean_lazy_arr_no_mask[0] is True + assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From 716162dac9abbaec915485f0a65c41c81bb0b475 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 14 Jun 2023 17:16:30 +0200 Subject: [PATCH 109/125] (fix): h5py support --- .../experimental/read_backed/lazy_arrays.py | 22 ++++++++---- .../experimental/read_backed/read_backed.py | 36 +++++++++++++------ .../tests/test_read_backed_experimental.py | 35 ++++++++++-------- 3 files changed, 61 insertions(+), 32 deletions(-) diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 70fb10719..630b4393d 100644 --- 
a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -1,6 +1,8 @@ from typing import Tuple from anndata._core.index import Index, _subset from anndata._core.views import _resolve_idx, as_view +from anndata._io.h5ad import read_dataset +from anndata.compat import ZarrArray import pandas as pd import numpy as np @@ -73,9 +75,9 @@ def __init__(self, codes, categories, attrs, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group Args: - codes (zarr.Array): values (integers) of the array, one for each element - categories (zarr.Array): mappings from values to strings - attrs (zarr.Array): attrs containing boolean "ordered" + codes (Union[zarr.Array, h5py.Dataset]): values (integers) of the array, one for each element + categories (Union[zarr.Array, h5py.Dataset]): mappings from values to strings + attrs (Union[zarr.Array, h5py.Dataset]): attrs containing boolean "ordered" """ self.values = codes self._categories = categories @@ -86,7 +88,10 @@ def __init__(self, codes, categories, attrs, *args, **kwargs): @property def categories(self): # __slots__ and cached_property are incompatible if self._categories_cache is None: - self._categories_cache = self._categories[...] + if isinstance(self._categories, ZarrArray): + self._categories_cache = self._categories[...] + else: + self._categories_cache = read_dataset(self._categories) return self._categories_cache @property @@ -99,7 +104,10 @@ def ordered(self): def __getitem__(self, selection) -> pd.Categorical: idx = self._resolve_idx(selection) - codes = self.values.oindex[idx] + if isinstance(self.values, ZarrArray): + codes = self.values.oindex[idx] + else: + codes = self.values[idx] if codes.shape == (): # handle 0d case codes = np.array([codes]) return pd.Categorical.from_codes( @@ -131,8 +139,8 @@ def __init__(self, values, mask, dtype_str, *args, **kwargs): """Class for lazily reading categorical data from formatted zarr group Args: - values (zarr.Array): Integer/Boolean array of values - mask (zarr.Array): mask indicating which values are non-null + values (Union[zarr.Array, h5py.Dataset]): Integer/Boolean array of values + mask (Union[zarr.Array, h5py.Dataset]): mask indicating which values are non-null dtype_str (Nullable): one of `nullable-integer` or `nullable-boolean` """ self.values = values diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 824ace6bc..26c800aa5 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -8,6 +8,7 @@ Tuple, ) +import h5py from anndata._core.aligned_mapping import ( Layers, PairwiseArrays, @@ -18,6 +19,7 @@ from anndata._core.raw import Raw from anndata._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset from anndata._core.views import _resolve_idxs +from anndata._io.h5ad import read_dataset from anndata.compat import DaskArray from anndata.utils import asarray, convert_to_dict @@ -360,33 +362,39 @@ def __repr__(self): return descr -def read_backed(store: Union[str, Path, MutableMapping, zarr.Group]) -> AnnData: +def read_backed(store: Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]) -> AnnData: """Lazily read in on-disk/in-cloud AnnData stores. A new, but familiar, AnnData object will be returned. No array data should need to be read into memory, with exception of non-obs/var dataframes and Awkward Arrays. 
Args: - store (Union[str, Path, MutableMapping, zarr.Group]): A store-like object to be read in. If `zarr`, it is best + store (Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]): A store-like object to be read in. If `zarr`, it is best for it to be consolidated. Returns: AnnData: A lazily read-in AnnData object. """ - if isinstance(store, Path): + is_h5 = False + if isinstance(store, Path) or isinstance(store, str): store = str(store) - - is_consolidated = True - try: - f = zarr.open_consolidated(store, mode="r") - except KeyError: - is_consolidated = False - f = zarr.open(store, mode="r") + if store.endswith('h5ad'): + is_h5 = True + + has_keys = True # true if consolidated or h5ad + if not is_h5: + try: + f = zarr.open_consolidated(store, mode="r") + except KeyError: + has_keys = False + f = zarr.open(store, mode="r") + else: + f = h5py.File(store, mode="r") def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): cols = ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw"] iter_object = ( elem.items() - if is_consolidated + if has_keys else [(k, elem[k]) for k in cols if k in elem] ) return AnnDataBacked( @@ -408,6 +416,12 @@ def callback(func, elem_name: str, elem, iospec): iospec.encoding_type, ) elif iospec.encoding_type in {"array", "string-array"}: + if is_h5: + if iospec.encoding_type == "string-array": + elem = read_dataset(elem) + if not hasattr(elem, "chunks") or elem.chunks is None: + return da.from_array(elem, chunks=(1000,) * len(elem.shape)) + return da.from_array(elem) return da.from_zarr(elem) elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: return sparse_dataset(elem) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 3760b5485..69ba4cc74 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -54,6 +54,9 @@ def mtx_format(request): def sparse_format(request): return request.param +@pytest.fixture(params=["zarr", "h5ad"]) +def dskfmt(request): + return request.param @pytest.fixture() def categorical_lazy_arr(tmp_path_factory): @@ -274,11 +277,12 @@ def test_access_count_obsp_varp(tmp_path, mtx_format): assert store.get_access_count("varp") == 0 -def test_read_full(tmp_path, mtx_format): +def test_read_full(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) - orig_pth = base_pth / "orig.zarr" - adata.write_zarr(orig_pth) + orig_pth = base_pth / f"orig.{dskfmt}" + write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) + write(adata) remote = read_backed(orig_pth) if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: assert np.all(asarray(adata.X) == asarray(remote.X.to_memory())) @@ -289,21 +293,23 @@ def test_read_full(tmp_path, mtx_format): assert (adata.obsm["array"] == remote.obsm["array"].compute()).all() -def test_to_memory(tmp_path, mtx_format): +def test_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) - orig_pth = base_pth / "orig.zarr" - adata.write_zarr(orig_pth) + orig_pth = base_pth / f"orig.{dskfmt}" + write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) + write(adata) remote = read_backed(orig_pth) remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, adata) -def test_read_view(tmp_path, mtx_format): +def test_read_view(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) 
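# A freestanding sketch (not part of the diff) of the format-parametrized
# writing these tests rely on: AnnData's real `write_zarr`/`write_h5ad`
# methods are dispatched by name, mirroring the
# `write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth)` pattern in the
# hunks below. The object and paths are throwaway examples.
import numpy as np
import anndata as ad

adata = ad.AnnData(X=np.ones((3, 2), dtype="float32"))
for fmt, path in [("zarr", "demo.zarr"), ("h5ad", "demo.h5ad")]:
    getattr(adata, f"write_{fmt}")(path)  # resolves to write_zarr / write_h5ad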
base_pth = Path(tmp_path) - orig_pth = base_pth / "orig.zarr" - adata.write_zarr(orig_pth) + orig_pth = base_pth / f"orig.{dskfmt}" + write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) + write(adata) remote = read_backed(orig_pth) subset_obs = adata.obs["obs_cat"] == "a" if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: @@ -370,11 +376,12 @@ def test_read_view(tmp_path, mtx_format): ).all() -def test_read_view_of_view(tmp_path, mtx_format): +def test_read_view_of_view(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) - orig_pth = base_pth / "orig.zarr" - adata.write_zarr(orig_pth) + orig_pth = base_pth / f"orig.{dskfmt}" + write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) + write(adata) remote = read_backed(orig_pth) subset_obs = (adata.obs["obs_cat"] == "a") | (adata.obs["obs_cat"] == "b") subsetted_adata = adata[subset_obs, :] @@ -509,8 +516,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert nullable_boolean_lazy_arr_no_mask[0] is True - assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() + assert nullable_boolean_lazy_arr_no_mask[0] == True + assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From bdb93a2ac53b0b4a8f52e6bc46ca7d31f40ddca7 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Mon, 17 Jul 2023 16:09:19 +0200 Subject: [PATCH 110/125] (feat): migrate `Dask` to `DataArray` --- anndata/experimental/read_backed/lazy_arrays.py | 9 +++++++++ anndata/experimental/read_backed/read_backed.py | 12 +++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 630b4393d..c9c4caf45 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -7,6 +7,7 @@ import pandas as pd import numpy as np from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin +import xarray as xr class MaskedArrayMixIn(ExplicitlyIndexedNDArrayMixin): @@ -187,6 +188,10 @@ def copy(self) -> "LazyMaskedArray": return arr +@_subset.register(xr.DataArray) +def _subset_masked(a: xr.DataArray, subset_idx: Index): + return a[subset_idx] + @_subset.register(MaskedArrayMixIn) def _subset_masked(a: MaskedArrayMixIn, subset_idx: Index): a_copy = a.copy() @@ -217,3 +222,7 @@ def _view_pd_integer_array(a: pd.arrays.IntegerArray, view_args): @as_view.register(pd.arrays.BooleanArray) def _view_pd_boolean_array(a: pd.arrays.BooleanArray, view_args): return a + +@as_view.register(xr.DataArray) +def _view_pd_boolean_array(a: xr.DataArray, view_args): + return a diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 26c800aa5..446b31d15 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -24,6 +24,7 @@ from anndata.utils import asarray, convert_to_dict import zarr +import xarray as xr import pandas as pd import dask.array as da from scipy import sparse @@ -119,6 +120,7 @@ def __init__( self.var_names = var[self.file["var"].attrs["_index"]] if vidx is not None: self.var_names = self.var_names[vidx] + self.obs = AxisArrays1dRemote(adata_ref, 0, vals=convert_to_dict(obs)) self.var = AxisArrays1dRemote(adata_ref, 1, 
vals=convert_to_dict(var))
 
@@ -406,7 +408,15 @@ def callback(func, elem_name: str, elem, iospec):
             iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [
                 (elem.attrs["_index"], elem[elem.attrs["_index"]])
             ]
-            return {k: read_dispatched(v, callback) for k, v in iter_object}
+            d = {k: read_dispatched(v, callback) for k, v in iter_object}
+            d_with_xr = {}
+            for k in d:
+                v = d[k]
+                if type(v) == DaskArray and k != elem.attrs["_index"]:
+                    d_with_xr[k] = xr.DataArray(v, coords=[d[elem.attrs["_index"]]], dims=[f'{elem_name.replace("/", "")}_names'], name=k)
+                else:
+                    d_with_xr[k] = v
+            return d_with_xr
         elif iospec.encoding_type == "categorical":
             return LazyCategoricalArray(elem["codes"], elem["categories"], elem.attrs)
         elif "nullable" in iospec.encoding_type:

From f78a59bf4d3ccbb3b4eb5ace922a4f4e84f4c8cd Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Mon, 17 Jul 2023 17:18:47 +0200
Subject: [PATCH 111/125] (feat): categoricals now using `DataArray` as well.

---
 .../experimental/read_backed/lazy_arrays.py   | 79 ++++---------------
 .../read_backed/lazy_axis_arrays.py           |  2 +-
 .../experimental/read_backed/read_backed.py   |  8 +-
 .../tests/test_read_backed_experimental.py    |  2 +-
 4 files changed, 24 insertions(+), 67 deletions(-)

diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py
index c9c4caf45..52b5e515e 100644
--- a/anndata/experimental/read_backed/lazy_arrays.py
+++ b/anndata/experimental/read_backed/lazy_arrays.py
@@ -6,39 +6,18 @@
 
 import pandas as pd
 import numpy as np
-from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin
+from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin, BasicIndexer, OuterIndexer
 import xarray as xr
 
 
 class MaskedArrayMixIn(ExplicitlyIndexedNDArrayMixin):
-    def _resolve_idx(self, new_idx: Index) -> Index:
-        """Wrapper for resolving the idx against (potentially) already existing `self.subset_idx`
-        Args:
-            new_idx (Index): new indices
-
-        Returns:
-            Index: The resolved idx, an intersection of new_idx and `self.subset_idx`
-        """
-        return (
-            new_idx
-            if self.subset_idx is None
-            else _resolve_idx(self.subset_idx, new_idx, self.shape[0])
-        )
-
-    @property
-    def subset_idx(self) -> Index:
-        """A local
-
-        Returns:
-            Index: The indices for this array
-        """
-        return self._subset_idx
-
-    @subset_idx.setter
-    def subset_idx(self, new_idx):
-        self._subset_idx = self._resolve_idx(new_idx)
+    def __eq__(self, __o) -> np.ndarray:
+        return self[...] 
== __o - - def __ne__(self, __o) -> np.ndarray: - return ~(self == __o) + return self.values.shape class LazyCategoricalArray(MaskedArrayMixIn): @@ -68,12 +34,11 @@ class LazyCategoricalArray(MaskedArrayMixIn): "attrs", "_categories", "_categories_cache", - "_subset_idx", "group", ) def __init__(self, codes, categories, attrs, *args, **kwargs): - """Class for lazily reading categorical data from formatted zarr group + """Class for lazily reading categorical data from formatted zarr group. Used as base for `LazilyIndexedArray`. Args: codes (Union[zarr.Array, h5py.Dataset]): values (integers) of the array, one for each element @@ -83,7 +48,6 @@ def __init__(self, codes, categories, attrs, *args, **kwargs): self.values = codes self._categories = categories self._categories_cache = None - self._subset_idx = None self.attrs = dict(attrs) @property @@ -104,7 +68,9 @@ def ordered(self): return bool(self.attrs["ordered"]) def __getitem__(self, selection) -> pd.Categorical: - idx = self._resolve_idx(selection) + idx = selection + if isinstance(selection, BasicIndexer) or isinstance(selection, OuterIndexer): + idx = selection.tuple[0] # need to better understand this if isinstance(self.values, ZarrArray): codes = self.values.oindex[idx] else: @@ -129,15 +95,14 @@ def copy(self) -> "LazyCategoricalArray": arr = LazyCategoricalArray( self.values, self._categories, self.attrs ) # self.categories reads in data - arr.subset_idx = self.subset_idx return arr class LazyMaskedArray(MaskedArrayMixIn): - __slots__ = ("mask", "values", "_subset_idx", "_dtype_str") + __slots__ = ("mask", "values", "_dtype_str") def __init__(self, values, mask, dtype_str, *args, **kwargs): - """Class for lazily reading categorical data from formatted zarr group + """Class for lazily reading categorical data from formatted zarr group. Used as base for `LazilyIndexedArray`. 
Args: values (Union[zarr.Array, h5py.Dataset]): Integer/Boolean array of values @@ -146,7 +111,6 @@ def __init__(self, values, mask, dtype_str, *args, **kwargs): """ self.values = values self.mask = mask - self._subset_idx = None self._dtype_str = dtype_str @property @@ -159,7 +123,9 @@ def dtype(self) -> pd.CategoricalDtype: return pd.array def __getitem__(self, selection) -> pd.Categorical: - idx = self._resolve_idx(selection) + idx = selection + if isinstance(selection, BasicIndexer) or isinstance(selection, OuterIndexer): + idx = selection.tuple[0] # need to understand this better if type(idx) == int: idx = slice(idx, idx + 1) values = np.array(self.values[idx]) @@ -184,7 +150,6 @@ def copy(self) -> "LazyMaskedArray": LazyMaskedArray: copied LazyMaskedArray """ arr = LazyMaskedArray(self.values, self.mask, self._dtype_str) - arr.subset_idx = self.subset_idx return arr @@ -192,18 +157,6 @@ def copy(self) -> "LazyMaskedArray": def _subset_masked(a: xr.DataArray, subset_idx: Index): return a[subset_idx] -@_subset.register(MaskedArrayMixIn) -def _subset_masked(a: MaskedArrayMixIn, subset_idx: Index): - a_copy = a.copy() - a_copy.subset_idx = subset_idx - return a_copy - - -@as_view.register(MaskedArrayMixIn) -def _view_masked(a: MaskedArrayMixIn, view_args): - return a - - @as_view.register(pd.Categorical) def _view_pd_categorical(a: pd.Categorical, view_args): return a diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py index 52d1cf7e7..5e254c6c5 100644 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ b/anndata/experimental/read_backed/lazy_axis_arrays.py @@ -34,7 +34,7 @@ def to_df_1d_axis_arrays(axis_arrays: AxisArrays, exclude=[]) -> pd.DataFrame: if "index" not in key and all( [full_key != exclude_key for exclude_key in exclude] ): - df[key] = axis_arrays[key][...] + df[key] = axis_arrays[key].data # all xarray DataArrays? return df diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 446b31d15..f072f494d 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -174,14 +174,16 @@ def backed_dict_to_memory(d, prefix): full_key = prefix + "/" + k if any([full_key == exclude_key for exclude_key in exclude]): continue + if isinstance(v, xr.DataArray): + v = v.data if isinstance(v, DaskArray): res[k] = v.compute() + elif isinstance(v, BaseCompressedSparseDataset): + res[k] = v.to_memory() elif isinstance(v, LazyCategoricalArray) or isinstance( v, LazyMaskedArray ): res[k] = v[...] 
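# Freestanding sketch of the type dispatch in `backed_dict_to_memory` above:
# dask arrays are computed, backed sparse datasets call `to_memory()`, and the
# lazy categorical/masked wrappers materialize via `[...]`. `LazyLike` is a
# hypothetical stand-in so the snippet runs without this branch installed.
import dask.array as da
import numpy as np

class LazyLike:
    def __getitem__(self, idx):
        return np.arange(4)[idx]

def materialize(v):
    if isinstance(v, da.Array):
        return v.compute()
    if hasattr(v, "to_memory"):  # e.g. BaseCompressedSparseDataset
        return v.to_memory()
    if isinstance(v, LazyLike):  # e.g. LazyCategoricalArray / LazyMaskedArray
        return v[...]
    return v

print(materialize(da.ones((2, 2), chunks=1)), materialize(LazyLike()))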
- elif isinstance(v, BaseCompressedSparseDataset): - res[k] = v.to_memory() else: res[k] = v return res @@ -414,6 +416,8 @@ def callback(func, elem_name: str, elem, iospec): v = d[k] if type(v) == DaskArray and k != elem.attrs["_index"]: d_with_xr[k] = xr.DataArray(v, coords=[d[elem.attrs["_index"]]], dims=[f'{elem_name.replace("/", "")}_names'], name=k) + elif (type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray) and k != elem.attrs["_index"]: + d_with_xr[k] = xr.DataArray(xr.core.indexing.LazilyIndexedArray(v), coords=[d[elem.attrs["_index"]]], dims=[f'{elem_name.replace("/", "")}_names'], name=k) else: d_with_xr[k] = v return d_with_xr diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 69ba4cc74..59b2f0056 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -209,7 +209,7 @@ def test_access_count_obs_var(tmp_path, mtx_format): remote.obs["int64"] remote.var["int64"] # only the `cat` should be read in - subset = remote[remote.obs["cat"] == "a", :] + subset = remote[(remote.obs["cat"] == "a").data, :] # `.data` for xarray, but should we handle internally? subset.obs["int64"] sub_subset = subset[0:10, :] sub_subset.obs["int64"] From bc2891047e58e47fabe950c485a671362e31a1d2 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 19 Jul 2023 13:57:29 +0200 Subject: [PATCH 112/125] (feat): xarray `Dataset` for `obs`/`var` --- .../experimental/read_backed/lazy_arrays.py | 19 --- .../read_backed/lazy_axis_arrays.py | 90 ----------- .../experimental/read_backed/read_backed.py | 66 +++++--- .../tests/test_read_backed_experimental.py | 152 +----------------- 4 files changed, 46 insertions(+), 281 deletions(-) delete mode 100644 anndata/experimental/read_backed/lazy_axis_arrays.py diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 52b5e515e..8b6a40f8e 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -157,25 +157,6 @@ def copy(self) -> "LazyMaskedArray": def _subset_masked(a: xr.DataArray, subset_idx: Index): return a[subset_idx] -@as_view.register(pd.Categorical) -def _view_pd_categorical(a: pd.Categorical, view_args): - return a - - -@as_view.register(pd.api.extensions.ExtensionArray) -def _view_pd_array(a: pd.api.extensions.ExtensionArray, view_args): - return a - - -@as_view.register(pd.arrays.IntegerArray) -def _view_pd_integer_array(a: pd.arrays.IntegerArray, view_args): - return a - - -@as_view.register(pd.arrays.BooleanArray) -def _view_pd_boolean_array(a: pd.arrays.BooleanArray, view_args): - return a - @as_view.register(xr.DataArray) def _view_pd_boolean_array(a: xr.DataArray, view_args): return a diff --git a/anndata/experimental/read_backed/lazy_axis_arrays.py b/anndata/experimental/read_backed/lazy_axis_arrays.py deleted file mode 100644 index 5e254c6c5..000000000 --- a/anndata/experimental/read_backed/lazy_axis_arrays.py +++ /dev/null @@ -1,90 +0,0 @@ -from typing import Mapping, Union, List -from anndata._core import anndata, raw -from anndata._core.aligned_mapping import AxisArraysBase, AxisArraysView - -import pandas as pd -import numpy as np - -from ..._core import AxisArrays - - -class AxisArraysRemote(AxisArrays): - @property - def dim_names(self) -> pd.Index: - return (self.parent.obs_names, self.parent.var_names)[self._axis].compute() - - @property - def columns(self) -> List: - return list(self.keys()) - - -def 
to_df_1d_axis_arrays(axis_arrays: AxisArrays, exclude=[]) -> pd.DataFrame: - """Convert a axis array to dataframe in mememort - - Args: - axis_arrays (AxisArrays): AxisArrays to be converted - exclude (list, optional): Keys to exclude from being loaded into the DataFrame/Memory. Defaults to []. - - Returns: - pd.DataFrame: Potential subset of `axis_arrays` in memory - """ - df = pd.DataFrame(index=axis_arrays.dim_names[...]) - for key in axis_arrays.keys(): - full_key = axis_arrays.attrname + "/" + key - if "index" not in key and all( - [full_key != exclude_key for exclude_key in exclude] - ): - df[key] = axis_arrays[key].data # all xarray DataArrays? - return df - - -class AxisArraysRemote1dMixin: - def to_df(self, exclude=[]) -> pd.DataFrame: - """Convert to a DataFrame - - Args: - exclude (list, optional): Keys to exclude from being loaded into the DataFrame/Memory. Defaults to []. - - Returns: - pd.DataFrame: Potential subset of `axis_arrays` in memory - """ - return to_df_1d_axis_arrays(self, exclude) - - @property - def iloc(self): - class IlocDispatch: - def __getitem__(self_iloc, idx): - if type(idx) == list: - return self._view(self.parent, np.array(idx)) - return self._view(self.parent, idx) - - return IlocDispatch() - - def __getattr__(self, __name: str): - # If we a method has been accessed that is not here, try the pandas implementation - if hasattr(pd.DataFrame, __name): - return self.to_df().__getattribute__(__name) - return object.__getattribute__(self, __name) - - def _repr_html_(self): - return self.__repr__() - - def _repr_latex_(self): - return self.__repr__() - - @property - def attrname(self) -> str: - return self.dim - - -class AxisArrays1dRemote(AxisArraysRemote1dMixin, AxisArraysRemote): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -class AxisArrays1dRemoteView(AxisArraysRemote1dMixin, AxisArraysView): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - -AxisArrays1dRemote._view_class = AxisArrays1dRemoteView diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index f072f494d..c083baa8b 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -12,6 +12,7 @@ from anndata._core.aligned_mapping import ( Layers, PairwiseArrays, + AxisArrays ) from anndata._core.anndata import StorageType, _check_2d_shape from anndata._core.anndata_base import AbstractAnnData @@ -31,8 +32,6 @@ from ..._core import AnnData from .. import read_dispatched from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray -from .lazy_axis_arrays import AxisArrays1dRemote, AxisArraysRemote - class AnnDataBacked(AbstractAnnData): def __init__( @@ -114,24 +113,24 @@ def __init__( # annotations - need names already for AxisArrays to work. 
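# Sketch (illustrative data, not from the patch) of the representation this
# commit moves obs/var onto: a 1-D xarray Dataset keyed by the axis index,
# subset with `isel` exactly as the view code below does.
import numpy as np
import pandas as pd
import xarray as xr

obs_names = pd.Index([f"cell_{i}" for i in range(5)], name="obs_names")
obs = xr.Dataset(
    {"total_counts": ("obs_names", np.arange(5))},
    coords={"obs_names": obs_names},
)
print(obs.isel(obs_names=slice(0, 2)))  # cf. self.obs.isel(obs_names=oidx)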
self.file = file - self.obs_names = obs[self.file["obs"].attrs["_index"]] + self.obs_names = pd.Index(obs['obs_names'].data.compute() if isinstance(obs['obs_names'].data, DaskArray) else obs['obs_names'].data) if oidx is not None: self.obs_names = self.obs_names[oidx] - self.var_names = var[self.file["var"].attrs["_index"]] + self.var_names = pd.Index(var['var_names'].data.compute() if isinstance(var['var_names'].data, DaskArray) else var['var_names'].data) if vidx is not None: self.var_names = self.var_names[vidx] - self.obs = AxisArrays1dRemote(adata_ref, 0, vals=convert_to_dict(obs)) - self.var = AxisArrays1dRemote(adata_ref, 1, vals=convert_to_dict(var)) + self.obs = xr.Dataset(obs) + self.var = xr.Dataset(var) - self.obsm = AxisArraysRemote(adata_ref, 0, vals=convert_to_dict(obsm)) - self.varm = AxisArraysRemote(adata_ref, 1, vals=convert_to_dict(varm)) + self.obsm = AxisArrays(adata_ref, 0, vals=convert_to_dict(obsm)) + self.varm = AxisArrays(adata_ref, 1, vals=convert_to_dict(varm)) self.obsp = PairwiseArrays(adata_ref, 0, vals=convert_to_dict(obsp)) self.varp = PairwiseArrays(adata_ref, 1, vals=convert_to_dict(varp)) self.layers = Layers(adata_ref, layers) if self.is_view: - self.obs = self.obs._view(self, oidx) - self.var = self.var._view(self, vidx) + self.obs = self.obs.isel(obs_names=oidx) + self.var = self.var.isel(var_names=vidx) self.obsm = self.obsm._view(self, oidx) self.varm = self.varm._view(self, vidx) self.obsp = self.obsp._view(self, oidx) @@ -163,37 +162,49 @@ def __getitem__(self, index: Index) -> "AnnData": def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]: return _normalize_indices( index, - pd.Index(self.obs_names.compute()), - pd.Index(self.var_names.compute()), + pd.Index(self.obs_names.compute() if isinstance(self.obs_names, DaskArray) else self.obs_names), # could be either dask or in-memory from xarray + pd.Index(self.var_names.compute() if isinstance(self.var_names, DaskArray) else self.var_names), ) def to_memory(self, exclude=[]): + # handling for AxisArrays def backed_dict_to_memory(d, prefix): res = {} for k, v in d.items(): full_key = prefix + "/" + k if any([full_key == exclude_key for exclude_key in exclude]): continue - if isinstance(v, xr.DataArray): - v = v.data if isinstance(v, DaskArray): res[k] = v.compute() elif isinstance(v, BaseCompressedSparseDataset): res[k] = v.to_memory() - elif isinstance(v, LazyCategoricalArray) or isinstance( - v, LazyMaskedArray - ): - res[k] = v[...] 
else: res[k] = v return res - obs = self.obs.to_df(exclude) - var = self.var.to_df(exclude) - obsm = backed_dict_to_memory(dict(self.obsm), "obsm") - varm = backed_dict_to_memory(dict(self.varm), "varm") - varp = backed_dict_to_memory(dict(self.varp), "varp") - obsp = backed_dict_to_memory(dict(self.obsp), "obsp") + # nullable and categoricals need special handling because xarray will convert them to numpy arrays first with dtype object + def get_nullable_and_categorical_cols(ds): + cols = [] + for c in ds: + dtype = ds[c].dtype + if isinstance(dtype, pd.CategoricalDtype) or dtype == pd.arrays.BooleanArray or dtype == pd.arrays.IntegerArray: + cols += [c] + return cols + def to_df(ds): + nullable_and_categorical_df_cols = get_nullable_and_categorical_cols(ds) + df = ds.drop_vars(list(set(exclude + nullable_and_categorical_df_cols))).to_dataframe() + for c in nullable_and_categorical_df_cols: + df[c] = ds[c].data[()] + df.index.name = None # matches old AnnData object + df = df[list(ds.keys())] + return df + + obs = to_df(self.obs) + var = to_df(self.var) + obsm = backed_dict_to_memory(convert_to_dict(self.obsm), "obsm") + varm = backed_dict_to_memory(convert_to_dict(self.varm), "varm") + varp = backed_dict_to_memory(convert_to_dict(self.varp), "varp") + obsp = backed_dict_to_memory(convert_to_dict(self.obsp), "obsp") layers = backed_dict_to_memory(dict(self.layers), "layers") X = None if "X" not in exclude: @@ -412,12 +423,15 @@ def callback(func, elem_name: str, elem, iospec): ] d = {k: read_dispatched(v, callback) for k, v in iter_object} d_with_xr = {} + index_label = f'{elem_name.replace("/", "")}_names' for k in d: v = d[k] if type(v) == DaskArray and k != elem.attrs["_index"]: - d_with_xr[k] = xr.DataArray(v, coords=[d[elem.attrs["_index"]]], dims=[f'{elem_name.replace("/", "")}_names'], name=k) + d_with_xr[k] = xr.DataArray(v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k) elif (type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray) and k != elem.attrs["_index"]: - d_with_xr[k] = xr.DataArray(xr.core.indexing.LazilyIndexedArray(v), coords=[d[elem.attrs["_index"]]], dims=[f'{elem_name.replace("/", "")}_names'], name=k) + d_with_xr[k] = xr.DataArray(xr.core.indexing.LazilyIndexedArray(v), coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k) + elif k == elem.attrs["_index"]: + d_with_xr[index_label] = xr.DataArray(v, coords=[v], dims=[index_label], name=index_label) else: d_with_xr[k] = v return d_with_xr diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 59b2f0056..f94b0173b 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -277,22 +277,6 @@ def test_access_count_obsp_varp(tmp_path, mtx_format): assert store.get_access_count("varp") == 0 -def test_read_full(tmp_path, mtx_format, dskfmt): - adata = gen_adata((1000, 1000), mtx_format) - base_pth = Path(tmp_path) - orig_pth = base_pth / f"orig.{dskfmt}" - write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) - write(adata) - remote = read_backed(orig_pth) - if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: - assert np.all(asarray(adata.X) == asarray(remote.X.to_memory())) - else: - assert np.all(asarray(adata.X) == asarray(remote.X.compute())) - assert (adata.obs == remote.obs.to_df()[adata.obs.columns]).all().all() - assert (adata.var == remote.var.to_df()[adata.var.columns]).all().all() - assert (adata.obsm["array"] == 
remote.obsm["array"].compute()).all() - - def test_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) @@ -304,7 +288,7 @@ def test_to_memory(tmp_path, mtx_format, dskfmt): assert_equal(remote_to_memory, adata) -def test_read_view(tmp_path, mtx_format, dskfmt): +def test_view_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) orig_pth = base_pth / f"orig.{dskfmt}" @@ -312,71 +296,13 @@ def test_read_view(tmp_path, mtx_format, dskfmt): write(adata) remote = read_backed(orig_pth) subset_obs = adata.obs["obs_cat"] == "a" - if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: - assert np.all( - asarray(adata[subset_obs, :].X) - == asarray(remote[subset_obs, :].X.to_memory()) - ) - else: - assert np.all( - asarray(adata[subset_obs, :].X) - == asarray(remote[subset_obs, :].X.compute()) - ) - assert ( - ( - adata[subset_obs, :].obs - == remote[subset_obs, :].obs.to_df()[adata.obs.columns] - ) - .all() - .all() - ) - assert ( - ( - adata[subset_obs, :].var - == remote[subset_obs, :].var.to_df()[adata.var.columns] - ) - .all() - .all() - ) - assert ( - adata[subset_obs, :].obsm["array"] - == remote[subset_obs, :].obsm["array"].compute() - ).all() + assert_equal(adata[subset_obs, :], remote[subset_obs, :].to_memory()) subset_var = adata.var["var_cat"] == "a" - if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: - assert np.all( - asarray(adata[:, subset_var].X) - == asarray(remote[:, subset_var].X.to_memory()) - ) - else: - assert np.all( - asarray(adata[:, subset_var].X) - == asarray(remote[:, subset_var].X.compute()) - ) - assert ( - ( - adata[:, subset_var].obs - == remote[:, subset_var].obs.to_df()[adata.obs.columns] - ) - .all() - .all() - ) - assert ( - ( - adata[:, subset_var].var - == remote[:, subset_var].var.to_df()[adata.var.columns] - ) - .all() - .all() - ) - assert ( - adata[:, subset_var].obsm["array"] - == remote[:, subset_var].obsm["array"].compute() - ).all() + assert_equal(adata[:, subset_var], remote[:, subset_var].to_memory()) -def test_read_view_of_view(tmp_path, mtx_format, dskfmt): +def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) orig_pth = base_pth / f"orig.{dskfmt}" @@ -387,79 +313,13 @@ def test_read_view_of_view(tmp_path, mtx_format, dskfmt): subsetted_adata = adata[subset_obs, :] subset_subset_obs = subsetted_adata.obs["obs_cat"] == "b" subsetted_subsetted_adata = subsetted_adata[subset_subset_obs, :] - if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: - assert np.all( - asarray(subsetted_subsetted_adata.X) - == asarray(remote[subset_obs, :][subset_subset_obs, :].X.to_memory()) - ) - else: - assert np.all( - asarray(subsetted_subsetted_adata.X) - == asarray(remote[subset_obs, :][subset_subset_obs, :].X.compute()) - ) - assert ( - ( - subsetted_subsetted_adata.obs - == remote[subset_obs, :][subset_subset_obs, :].obs.to_df()[ - adata.obs.columns - ] - ) - .all() - .all() - ) - assert ( - ( - subsetted_subsetted_adata.var - == remote[subset_obs, :][subset_subset_obs, :].var.to_df()[ - adata.var.columns - ] - ) - .all() - .all() - ) - assert ( - subsetted_subsetted_adata.obsm["array"] - == remote[subset_obs, :][subset_subset_obs, :].obsm["array"].compute() - ).all() + assert_equal(subsetted_subsetted_adata, remote[subset_obs, :][subset_subset_obs, :].to_memory()) subset_var = (adata.var["var_cat"] == "a") | 
(adata.var["var_cat"] == "b") subsetted_adata = adata[:, subset_var] subset_subset_var = subsetted_adata.var["var_cat"] == "b" subsetted_subsetted_adata = subsetted_adata[:, subset_subset_var] - if mtx_format == sparse.csc_matrix or mtx_format == sparse.csr_matrix: - assert np.all( - asarray(subsetted_subsetted_adata.X) - == asarray(remote[:, subset_var][:, subset_subset_var].X.to_memory()) - ) - else: - assert np.all( - asarray(subsetted_subsetted_adata.X) - == asarray(remote[:, subset_var][:, subset_subset_var].X.compute()) - ) - assert ( - ( - subsetted_subsetted_adata.obs - == remote[:, subset_var][:, subset_subset_var].obs.to_df()[ - adata.obs.columns - ] - ) - .all() - .all() - ) - assert ( - ( - subsetted_subsetted_adata.var - == remote[:, subset_var][:, subset_subset_var].var.to_df()[ - adata.var.columns - ] - ) - .all() - .all() - ) - assert ( - subsetted_subsetted_adata.obsm["array"] - == remote[:, subset_var][:, subset_subset_var].obsm["array"].compute() - ).all() + assert_equal(subsetted_subsetted_adata, remote[:, subset_var][:, subset_subset_var].to_memory()) def test_lazy_categorical_array_properties(categorical_lazy_arr): From f3b7bb31f8eb957e1868cb6d17b246a978d2bfd9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 19 Jul 2023 14:47:28 +0200 Subject: [PATCH 113/125] (fix): refactor `view` mechanism --- .../experimental/read_backed/read_backed.py | 125 +++++++++++++----- 1 file changed, 93 insertions(+), 32 deletions(-) diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index c083baa8b..775afb1b4 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -9,6 +9,7 @@ ) import h5py +import numpy as np from anndata._core.aligned_mapping import ( Layers, PairwiseArrays, @@ -52,10 +53,86 @@ def __init__( varp=None, oidx=None, vidx=None, + asview=False + ): + if asview: + if not issubclass(type(X), AbstractAnnData): + raise ValueError("`X` has to be an AnnData object.") + self._init_as_view(X, oidx, vidx) + else: + self._init_as_actual( + X=X, + obs=obs, + var=var, + uns=uns, + obsm=obsm, + varm=varm, + raw=raw, + layers=layers, + dtype=dtype, + shape=shape, + obsp=obsp, + varp=varp, + file=file, + ) + + def _init_as_view(self, adata_ref: "AnnData", oidx: Index, vidx: Index): + # Copied from non-backed class, maybe should refactor? 
+ self._is_view = True + if isinstance(oidx, (int, np.integer)): + if not (-adata_ref.n_obs <= oidx < adata_ref.n_obs): + raise IndexError(f"Observation index `{oidx}` is out of range.") + oidx += adata_ref.n_obs * (oidx < 0) + oidx = slice(oidx, oidx + 1, 1) + if isinstance(vidx, (int, np.integer)): + if not (-adata_ref.n_vars <= vidx < adata_ref.n_vars): + raise IndexError(f"Variable index `{vidx}` is out of range.") + vidx += adata_ref.n_vars * (vidx < 0) + vidx = slice(vidx, vidx + 1, 1) + if adata_ref.is_view: + prev_oidx, prev_vidx = adata_ref._oidx, adata_ref._vidx + adata_ref = adata_ref._adata_ref + oidx, vidx = _resolve_idxs((prev_oidx, prev_vidx), (oidx, vidx), adata_ref) + + # pd.Index objects so cheap to subset + self.obs_names = adata_ref.obs_names[oidx] + self.var_names = adata_ref.var_names[vidx] + # self._adata_ref is never a view + self._adata_ref = adata_ref + self._oidx = oidx + self._vidx = vidx + # the file is the same as of the reference object + self.file = adata_ref.file + # views on attributes of adata_ref + self.obs = adata_ref.obs.isel(obs_names=oidx) + self.var = adata_ref.var.isel(var_names=vidx) + self.obsm = adata_ref.obsm._view(self, (oidx,)) + self.varm = adata_ref.varm._view(self, (vidx,)) + self.layers = adata_ref.layers._view(self, (oidx, vidx)) + self.obsp = adata_ref.obsp._view(self, oidx) + self.varp = adata_ref.varp._view(self, vidx) + # fix categories + self.uns = adata_ref.uns or OrderedDict() + self.file = adata_ref.file + self._X = adata_ref.X + + def _init_as_actual( + self, + X=None, + obs=None, + var=None, + uns=None, + obsm=None, + varm=None, + varp=None, + obsp=None, + raw=None, + layers=None, + dtype=None, + shape=None, + file=None, ): self._is_view = False - if oidx is not None and vidx is not None: # and or or? - self._is_view = True # hack needed for clean use of views below adata_ref = self # init from AnnData if issubclass(type(X), AbstractAnnData): @@ -63,16 +140,6 @@ def __init__( raise ValueError( "If `X` is a dict no further arguments must be provided." ) - if X.is_view: - prev_oidx, prev_vidx = X._oidx, X._vidx - self._oidx, self._vidx = _resolve_idxs( - (prev_oidx, prev_vidx), (oidx, vidx), X._X - ) - else: - self._oidx = oidx - self._vidx = vidx - if self._is_view: - adata_ref = X # seems to work if file is None: file = X.file X, obs, var, uns, obsm, varm, obsp, varp, layers, raw = ( @@ -87,9 +154,6 @@ def __init__( X.layers, X.raw, ) - else: - self._oidx = oidx - self._vidx = vidx if X is not None: for s_type in StorageType: @@ -111,31 +175,18 @@ def __init__( else: self._X = None - # annotations - need names already for AxisArrays to work. + # Indices are read into memory by xarray anyway, so load them here. 
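# Standalone illustration of the eager index load the lines just below
# perform: a (possibly dask-backed) names column is computed once into a
# pandas Index so later `_normalize_indices` lookups are cheap. Values are
# illustrative only.
import dask.array as da
import numpy as np
import pandas as pd

names = da.from_array(np.array([f"cell_{i}" for i in range(4)]), chunks=2)
obs_names = pd.Index(names.compute() if isinstance(names, da.Array) else names)
print(obs_names)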
self.file = file self.obs_names = pd.Index(obs['obs_names'].data.compute() if isinstance(obs['obs_names'].data, DaskArray) else obs['obs_names'].data) - if oidx is not None: - self.obs_names = self.obs_names[oidx] self.var_names = pd.Index(var['var_names'].data.compute() if isinstance(var['var_names'].data, DaskArray) else var['var_names'].data) - if vidx is not None: - self.var_names = self.var_names[vidx] self.obs = xr.Dataset(obs) self.var = xr.Dataset(var) - self.obsm = AxisArrays(adata_ref, 0, vals=convert_to_dict(obsm)) self.varm = AxisArrays(adata_ref, 1, vals=convert_to_dict(varm)) self.obsp = PairwiseArrays(adata_ref, 0, vals=convert_to_dict(obsp)) self.varp = PairwiseArrays(adata_ref, 1, vals=convert_to_dict(varp)) self.layers = Layers(adata_ref, layers) - if self.is_view: - self.obs = self.obs.isel(obs_names=oidx) - self.var = self.var.isel(var_names=vidx) - self.obsm = self.obsm._view(self, oidx) - self.varm = self.varm._view(self, vidx) - self.obsp = self.obsp._view(self, oidx) - self.varp = self.varp._view(self, vidx) - self.layers = self.layers._view(self, (oidx, vidx)) self.uns = uns or OrderedDict() if not raw: @@ -157,13 +208,13 @@ def _run_checks(self): def __getitem__(self, index: Index) -> "AnnData": """Returns a sliced view of the object.""" oidx, vidx = self._normalize_indices(index) - return AnnDataBacked(self, oidx=oidx, vidx=vidx) + return AnnDataBacked(self, oidx=oidx, vidx=vidx, asview=True) def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]: return _normalize_indices( index, - pd.Index(self.obs_names.compute() if isinstance(self.obs_names, DaskArray) else self.obs_names), # could be either dask or in-memory from xarray - pd.Index(self.var_names.compute() if isinstance(self.var_names, DaskArray) else self.var_names), + self.obs_names, + self.var_names, ) def to_memory(self, exclude=[]): @@ -272,6 +323,16 @@ def obsm(self): def obsm(self, obsm): self._obsm = obsm + @property + def layers(self): + if hasattr(self, "_layers"): + return self._layers + return None + + @layers.setter + def layers(self, layers): + self._layers = layers + @property def obsp(self): if hasattr(self, "_obsp"): From c64c7e629febd033523df16803dc1ea0cc52f2e6 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Wed, 19 Jul 2023 15:13:48 +0200 Subject: [PATCH 114/125] (fix): fix column handling for `to_memory` --- anndata/experimental/read_backed/read_backed.py | 17 +++++++++++------ anndata/tests/test_read_backed_experimental.py | 10 ++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 775afb1b4..94198ef71 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -241,17 +241,22 @@ def get_nullable_and_categorical_cols(ds): if isinstance(dtype, pd.CategoricalDtype) or dtype == pd.arrays.BooleanArray or dtype == pd.arrays.IntegerArray: cols += [c] return cols - def to_df(ds): + def to_df(ds, exclude_vars): nullable_and_categorical_df_cols = get_nullable_and_categorical_cols(ds) - df = ds.drop_vars(list(set(exclude + nullable_and_categorical_df_cols))).to_dataframe() + drop_vars = [k for k in set(exclude_vars + nullable_and_categorical_df_cols) if k in ds] + df = ds.drop_vars(drop_vars).to_dataframe() for c in nullable_and_categorical_df_cols: - df[c] = ds[c].data[()] + if c not in exclude_vars: + df[c] = ds[c].data[()] df.index.name = None # matches old AnnData object - df = df[list(ds.keys())] 
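# Why `to_df` above special-cases categorical/nullable columns: pushing a
# pandas Categorical through a generic array conversion degrades it to object
# dtype, so the column is reattached directly. Minimal illustration:
import numpy as np
import pandas as pd

cat = pd.Categorical(["a", "b", "a"])
df = pd.DataFrame({"cat": np.asarray(cat)})  # generic path loses the dtype
print(df["cat"].dtype)                       # object
df["cat"] = cat                              # reattach, as `df[c] = ds[c].data[()]` does
print(df["cat"].dtype)                       # category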
+ if len(exclude_vars) == 0: + df = df[list(ds.keys())] return df - obs = to_df(self.obs) - var = to_df(self.var) + exclude_obs = [key.replace('obs/', '') for key in exclude if key.startswith('obs/')] + obs = to_df(self.obs, exclude_obs) + exclude_var = [key.replace('var/', '') for key in exclude if key.startswith('var/')] + var = to_df(self.var, exclude_var) obsm = backed_dict_to_memory(convert_to_dict(self.obsm), "obsm") varm = backed_dict_to_memory(convert_to_dict(self.varm), "varm") varp = backed_dict_to_memory(convert_to_dict(self.varp), "varp") diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index f94b0173b..0710eb704 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -287,6 +287,16 @@ def test_to_memory(tmp_path, mtx_format, dskfmt): remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, adata) +def test_to_memory_exclude(tmp_path, mtx_format, dskfmt): + adata = gen_adata((1000, 1000), mtx_format) + base_pth = Path(tmp_path) + orig_pth = base_pth / f"orig.{dskfmt}" + write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) + write(adata) + remote = read_backed(orig_pth) + remote_to_memory = remote.to_memory(exclude=['obs/nullable-bool', 'obsm/sparse']) + assert 'nullable-bool' not in remote_to_memory.obs + assert 'sparse' not in remote_to_memory.obsm def test_view_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) From 853cc0bd6bac09df71ecdc1bae595953b849a0e8 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 19 Jul 2023 13:15:14 +0000 Subject: [PATCH 115/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../experimental/read_backed/lazy_arrays.py | 14 +-- .../experimental/read_backed/read_backed.py | 87 ++++++++++++------- .../tests/test_read_backed_experimental.py | 28 ++++-- 3 files changed, 87 insertions(+), 42 deletions(-) diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 8b6a40f8e..63ca098aa 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -6,18 +6,21 @@ import pandas as pd import numpy as np -from xarray.core.indexing import ExplicitlyIndexedNDArrayMixin, BasicIndexer, OuterIndexer +from xarray.core.indexing import ( + ExplicitlyIndexedNDArrayMixin, + BasicIndexer, + OuterIndexer, +) import xarray as xr class MaskedArrayMixIn(ExplicitlyIndexedNDArrayMixin): - def __eq__(self, __o) -> np.ndarray: return self[...] 
== __o def __ne__(self, __o) -> np.ndarray: return ~(self == __o) - + @property def shape(self) -> Tuple[int, ...]: """Shape of this array @@ -70,7 +73,7 @@ def ordered(self): def __getitem__(self, selection) -> pd.Categorical: idx = selection if isinstance(selection, BasicIndexer) or isinstance(selection, OuterIndexer): - idx = selection.tuple[0] # need to better understand this + idx = selection.tuple[0] # need to better understand this if isinstance(self.values, ZarrArray): codes = self.values.oindex[idx] else: @@ -125,7 +128,7 @@ def dtype(self) -> pd.CategoricalDtype: def __getitem__(self, selection) -> pd.Categorical: idx = selection if isinstance(selection, BasicIndexer) or isinstance(selection, OuterIndexer): - idx = selection.tuple[0] # need to understand this better + idx = selection.tuple[0] # need to understand this better if type(idx) == int: idx = slice(idx, idx + 1) values = np.array(self.values[idx]) @@ -157,6 +160,7 @@ def copy(self) -> "LazyMaskedArray": def _subset_masked(a: xr.DataArray, subset_idx: Index): return a[subset_idx] + @as_view.register(xr.DataArray) def _view_pd_boolean_array(a: xr.DataArray, view_args): return a diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 94198ef71..8e03c2e1e 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -10,11 +10,7 @@ import h5py import numpy as np -from anndata._core.aligned_mapping import ( - Layers, - PairwiseArrays, - AxisArrays -) +from anndata._core.aligned_mapping import Layers, PairwiseArrays, AxisArrays from anndata._core.anndata import StorageType, _check_2d_shape from anndata._core.anndata_base import AbstractAnnData from anndata._core.index import Index, _normalize_indices, _subset @@ -34,6 +30,7 @@ from .. import read_dispatched from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray + class AnnDataBacked(AbstractAnnData): def __init__( self, @@ -53,7 +50,7 @@ def __init__( varp=None, oidx=None, vidx=None, - asview=False + asview=False, ): if asview: if not issubclass(type(X), AbstractAnnData): @@ -177,9 +174,17 @@ def _init_as_actual( # Indices are read into memory by xarray anyway, so load them here. 
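# Sketch of the xarray indexer objects handled in `__getitem__` above: when a
# wrapped array is indexed through xarray, the selection arrives as a
# BasicIndexer/OuterIndexer whose `.tuple` holds the raw key, hence the
# `selection.tuple[0]` unwrap. Stand-in data; these are xarray-internal
# classes, so details may shift between versions.
import numpy as np
from xarray.core.indexing import BasicIndexer, OuterIndexer

def unwrap(selection):
    if isinstance(selection, (BasicIndexer, OuterIndexer)):
        return selection.tuple[0]
    return selection

codes = np.array([0, 1, 2, 1], dtype="i1")
print(codes[unwrap(BasicIndexer((slice(0, 2),)))])  # [0 1]
print(codes[unwrap(slice(1, 3))])                   # [1 2]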
self.file = file - self.obs_names = pd.Index(obs['obs_names'].data.compute() if isinstance(obs['obs_names'].data, DaskArray) else obs['obs_names'].data) - self.var_names = pd.Index(var['var_names'].data.compute() if isinstance(var['var_names'].data, DaskArray) else var['var_names'].data) - + self.obs_names = pd.Index( + obs["obs_names"].data.compute() + if isinstance(obs["obs_names"].data, DaskArray) + else obs["obs_names"].data + ) + self.var_names = pd.Index( + var["var_names"].data.compute() + if isinstance(var["var_names"].data, DaskArray) + else var["var_names"].data + ) + self.obs = xr.Dataset(obs) self.var = xr.Dataset(var) self.obsm = AxisArrays(adata_ref, 0, vals=convert_to_dict(obsm)) @@ -238,24 +243,37 @@ def get_nullable_and_categorical_cols(ds): cols = [] for c in ds: dtype = ds[c].dtype - if isinstance(dtype, pd.CategoricalDtype) or dtype == pd.arrays.BooleanArray or dtype == pd.arrays.IntegerArray: + if ( + isinstance(dtype, pd.CategoricalDtype) + or dtype == pd.arrays.BooleanArray + or dtype == pd.arrays.IntegerArray + ): cols += [c] - return cols + return cols + def to_df(ds, exclude_vars): nullable_and_categorical_df_cols = get_nullable_and_categorical_cols(ds) - drop_vars = [k for k in set(exclude_vars + nullable_and_categorical_df_cols) if k in ds] + drop_vars = [ + k + for k in set(exclude_vars + nullable_and_categorical_df_cols) + if k in ds + ] df = ds.drop_vars(drop_vars).to_dataframe() for c in nullable_and_categorical_df_cols: - if c not in exclude_vars: - df[c] = ds[c].data[()] - df.index.name = None # matches old AnnData object + if c not in exclude_vars: + df[c] = ds[c].data[()] + df.index.name = None # matches old AnnData object if len(exclude_vars) == 0: df = df[list(ds.keys())] return df - - exclude_obs = [key.replace('obs/', '') for key in exclude if key.startswith('obs/')] + + exclude_obs = [ + key.replace("obs/", "") for key in exclude if key.startswith("obs/") + ] obs = to_df(self.obs, exclude_obs) - exclude_var = [key.replace('var/', '') for key in exclude if key.startswith('var/')] + exclude_var = [ + key.replace("var/", "") for key in exclude if key.startswith("var/") + ] var = to_df(self.var, exclude_var) obsm = backed_dict_to_memory(convert_to_dict(self.obsm), "obsm") varm = backed_dict_to_memory(convert_to_dict(self.varm), "varm") @@ -443,7 +461,9 @@ def __repr__(self): return descr -def read_backed(store: Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]) -> AnnData: +def read_backed( + store: Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset] +) -> AnnData: """Lazily read in on-disk/in-cloud AnnData stores. A new, but familiar, AnnData object will be returned. No array data should need to be read into memory, with exception of non-obs/var dataframes and Awkward Arrays. 
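# Paraphrase (as a freestanding helper, assuming nothing beyond zarr and h5py)
# of the store-opening logic in `read_backed`: consolidated zarr metadata is
# tried first, plain zarr is the fallback, and `.h5ad` suffixes branch to
# h5py. The path below is hypothetical.
import h5py
import zarr

def open_store(path: str):
    if path.endswith("h5ad"):
        return h5py.File(path, mode="r"), True  # h5 files always list keys
    try:
        return zarr.open_consolidated(path, mode="r"), True
    except KeyError:  # no .zmetadata -> not consolidated
        return zarr.open(path, mode="r"), False

# f, has_keys = open_store("orig.zarr")  # uncomment with a real store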
@@ -457,10 +477,10 @@ def read_backed(store: Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset is_h5 = False if isinstance(store, Path) or isinstance(store, str): store = str(store) - if store.endswith('h5ad'): + if store.endswith("h5ad"): is_h5 = True - has_keys = True # true if consolidated or h5ad + has_keys = True # true if consolidated or h5ad if not is_h5: try: f = zarr.open_consolidated(store, mode="r") @@ -468,15 +488,13 @@ def read_backed(store: Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset has_keys = False f = zarr.open(store, mode="r") else: - f = h5py.File(store, mode="r") + f = h5py.File(store, mode="r") def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type == "anndata" or elem_name.endswith("/"): cols = ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw"] iter_object = ( - elem.items() - if has_keys - else [(k, elem[k]) for k in cols if k in elem] + elem.items() if has_keys else [(k, elem[k]) for k in cols if k in elem] ) return AnnDataBacked( **{k: read_dispatched(v, callback) for k, v in iter_object}, file=elem @@ -493,11 +511,22 @@ def callback(func, elem_name: str, elem, iospec): for k in d: v = d[k] if type(v) == DaskArray and k != elem.attrs["_index"]: - d_with_xr[k] = xr.DataArray(v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k) - elif (type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray) and k != elem.attrs["_index"]: - d_with_xr[k] = xr.DataArray(xr.core.indexing.LazilyIndexedArray(v), coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k) + d_with_xr[k] = xr.DataArray( + v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k + ) + elif ( + type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray + ) and k != elem.attrs["_index"]: + d_with_xr[k] = xr.DataArray( + xr.core.indexing.LazilyIndexedArray(v), + coords=[d[elem.attrs["_index"]]], + dims=[index_label], + name=k, + ) elif k == elem.attrs["_index"]: - d_with_xr[index_label] = xr.DataArray(v, coords=[v], dims=[index_label], name=index_label) + d_with_xr[index_label] = xr.DataArray( + v, coords=[v], dims=[index_label], name=index_label + ) else: d_with_xr[k] = v return d_with_xr diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 0710eb704..7207307ea 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -54,10 +54,12 @@ def mtx_format(request): def sparse_format(request): return request.param + @pytest.fixture(params=["zarr", "h5ad"]) def dskfmt(request): return request.param + @pytest.fixture() def categorical_lazy_arr(tmp_path_factory): base_path = tmp_path_factory.getbasetemp() @@ -209,7 +211,9 @@ def test_access_count_obs_var(tmp_path, mtx_format): remote.obs["int64"] remote.var["int64"] # only the `cat` should be read in - subset = remote[(remote.obs["cat"] == "a").data, :] # `.data` for xarray, but should we handle internally? + subset = remote[ + (remote.obs["cat"] == "a").data, : + ] # `.data` for xarray, but should we handle internally? 
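# Aside on the `.data` access flagged in the test comment above: comparing an
# xarray DataArray yields another DataArray, and `.data` unwraps it to the
# plain boolean array that AnnData-style indexing expects. Values are
# illustrative.
import numpy as np
import xarray as xr

cat = xr.DataArray(np.array(["a", "b", "a"]), dims=["obs_names"])
mask = (cat == "a").data
print(mask)                   # [ True False  True]
print(np.flatnonzero(mask))   # positions usable for subsetting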
subset.obs["int64"] sub_subset = subset[0:10, :] sub_subset.obs["int64"] @@ -287,6 +291,7 @@ def test_to_memory(tmp_path, mtx_format, dskfmt): remote_to_memory = remote.to_memory() assert_equal(remote_to_memory, adata) + def test_to_memory_exclude(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) base_pth = Path(tmp_path) @@ -294,9 +299,10 @@ def test_to_memory_exclude(tmp_path, mtx_format, dskfmt): write = lambda x: getattr(x, f"write_{dskfmt}")(orig_pth) write(adata) remote = read_backed(orig_pth) - remote_to_memory = remote.to_memory(exclude=['obs/nullable-bool', 'obsm/sparse']) - assert 'nullable-bool' not in remote_to_memory.obs - assert 'sparse' not in remote_to_memory.obsm + remote_to_memory = remote.to_memory(exclude=["obs/nullable-bool", "obsm/sparse"]) + assert "nullable-bool" not in remote_to_memory.obs + assert "sparse" not in remote_to_memory.obsm + def test_view_to_memory(tmp_path, mtx_format, dskfmt): adata = gen_adata((1000, 1000), mtx_format) @@ -323,13 +329,19 @@ def test_view_of_view_to_memory(tmp_path, mtx_format, dskfmt): subsetted_adata = adata[subset_obs, :] subset_subset_obs = subsetted_adata.obs["obs_cat"] == "b" subsetted_subsetted_adata = subsetted_adata[subset_subset_obs, :] - assert_equal(subsetted_subsetted_adata, remote[subset_obs, :][subset_subset_obs, :].to_memory()) + assert_equal( + subsetted_subsetted_adata, + remote[subset_obs, :][subset_subset_obs, :].to_memory(), + ) subset_var = (adata.var["var_cat"] == "a") | (adata.var["var_cat"] == "b") subsetted_adata = adata[:, subset_var] subset_subset_var = subsetted_adata.var["var_cat"] == "b" subsetted_subsetted_adata = subsetted_adata[:, subset_subset_var] - assert_equal(subsetted_subsetted_adata, remote[:, subset_var][:, subset_subset_var].to_memory()) + assert_equal( + subsetted_subsetted_adata, + remote[:, subset_var][:, subset_subset_var].to_memory(), + ) def test_lazy_categorical_array_properties(categorical_lazy_arr): @@ -386,8 +398,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert nullable_boolean_lazy_arr_no_mask[0] == True - assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() + assert nullable_boolean_lazy_arr_no_mask[0] is True + assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From 703812c92dfbeac6f372bcec09b9a2b60ddd34c5 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Sun, 30 Jul 2023 21:45:33 +0200 Subject: [PATCH 116/125] (feat): `obsm`/`varm` `xr.Dataset` --- .../experimental/read_backed/lazy_arrays.py | 12 +++-- .../experimental/read_backed/read_backed.py | 50 ++++++++++--------- anndata/experimental/read_backed/xarray.py | 24 +++++++++ .../tests/test_read_backed_experimental.py | 6 +-- 4 files changed, 62 insertions(+), 30 deletions(-) create mode 100644 anndata/experimental/read_backed/xarray.py diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py index 63ca098aa..931f99933 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -38,20 +38,23 @@ class LazyCategoricalArray(MaskedArrayMixIn): "_categories", "_categories_cache", "group", + "_drop_unused_cats" ) - def __init__(self, codes, categories, attrs, *args, **kwargs): + def __init__(self, codes, categories, attrs, _drop_unused_cats, *args, **kwargs): """Class for lazily 
reading categorical data from formatted zarr group. Used as base for `LazilyIndexedArray`. Args: codes (Union[zarr.Array, h5py.Dataset]): values (integers) of the array, one for each element categories (Union[zarr.Array, h5py.Dataset]): mappings from values to strings attrs (Union[zarr.Array, h5py.Dataset]): attrs containing boolean "ordered" + _drop_unused_cats (bool): Whether or not to drop unused categories. """ self.values = codes self._categories = categories self._categories_cache = None self.attrs = dict(attrs) + self._drop_unused_cats = _drop_unused_cats # obsm/varm do not drop, but obs and var do. TODO: Should fix in normal AnnData? @property def categories(self): # __slots__ and cached_property are incompatible @@ -80,11 +83,14 @@ def __getitem__(self, selection) -> pd.Categorical: codes = self.values[idx] if codes.shape == (): # handle 0d case codes = np.array([codes]) - return pd.Categorical.from_codes( + res = pd.Categorical.from_codes( codes=codes, categories=self.categories, ordered=self.ordered, - ).remove_unused_categories() + ) + if self._drop_unused_cats: + return res.remove_unused_categories() + return res def __repr__(self) -> str: return f"LazyCategoricalArray(codes=..., categories={self.categories}, ordered={self.ordered})" diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py index 8e03c2e1e..4aeae17eb 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/read_backed/read_backed.py @@ -19,16 +19,16 @@ from anndata._core.views import _resolve_idxs from anndata._io.h5ad import read_dataset from anndata.compat import DaskArray -from anndata.utils import asarray, convert_to_dict +from anndata.utils import convert_to_dict import zarr import xarray as xr import pandas as pd import dask.array as da -from scipy import sparse from ..._core import AnnData from .. 
import read_dispatched from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray +from .xarray import Dataset2D class AnnDataBacked(AbstractAnnData): @@ -185,8 +185,8 @@ def _init_as_actual( else var["var_names"].data ) - self.obs = xr.Dataset(obs) - self.var = xr.Dataset(var) + self.obs = obs + self.var = var self.obsm = AxisArrays(adata_ref, 0, vals=convert_to_dict(obsm)) self.varm = AxisArrays(adata_ref, 1, vals=convert_to_dict(varm)) self.obsp = PairwiseArrays(adata_ref, 0, vals=convert_to_dict(obsp)) @@ -223,20 +223,6 @@ def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]: ) def to_memory(self, exclude=[]): - # handling for AxisArrays - def backed_dict_to_memory(d, prefix): - res = {} - for k, v in d.items(): - full_key = prefix + "/" + k - if any([full_key == exclude_key for exclude_key in exclude]): - continue - if isinstance(v, DaskArray): - res[k] = v.compute() - elif isinstance(v, BaseCompressedSparseDataset): - res[k] = v.to_memory() - else: - res[k] = v - return res # nullable and categoricals need special handling because xarray will convert them to numpy arrays first with dtype object def get_nullable_and_categorical_cols(ds): @@ -251,7 +237,7 @@ def get_nullable_and_categorical_cols(ds): cols += [c] return cols - def to_df(ds, exclude_vars): + def to_df(ds, exclude_vars=[]): nullable_and_categorical_df_cols = get_nullable_and_categorical_cols(ds) drop_vars = [ k @@ -266,6 +252,23 @@ def to_df(ds, exclude_vars): if len(exclude_vars) == 0: df = df[list(ds.keys())] return df + + # handling for AxisArrays + def backed_dict_to_memory(d, prefix): + res = {} + for k, v in d.items(): + full_key = prefix + "/" + k + if any([full_key == exclude_key for exclude_key in exclude]): + continue + if isinstance(v, DaskArray): + res[k] = v.compute() + elif isinstance(v, BaseCompressedSparseDataset): + res[k] = v.to_memory() + elif isinstance(v, Dataset2D): + res[k] = to_df(v) + else: + res[k] = v + return res exclude_obs = [ key.replace("obs/", "") for key in exclude if key.startswith("obs/") @@ -501,7 +504,7 @@ def callback(func, elem_name: str, elem, iospec): ) elif elem_name.startswith("/raw"): return None - elif elem_name in {"/obs", "/var"}: + elif iospec.encoding_type in {"dataframe"}: iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [ (elem.attrs["_index"], elem[elem.attrs["_index"]]) ] @@ -529,9 +532,10 @@ def callback(func, elem_name: str, elem, iospec): ) else: d_with_xr[k] = v - return d_with_xr + return Dataset2D(d_with_xr) elif iospec.encoding_type == "categorical": - return LazyCategoricalArray(elem["codes"], elem["categories"], elem.attrs) + drop_unused_cats = not (elem_name.startswith('/obsm') or elem_name.startswith('/varm')) + return LazyCategoricalArray(elem["codes"], elem["categories"], elem.attrs, drop_unused_cats) elif "nullable" in iospec.encoding_type: return LazyMaskedArray( elem["values"], @@ -550,8 +554,6 @@ def callback(func, elem_name: str, elem, iospec): return sparse_dataset(elem) elif iospec.encoding_type in {"awkward-array"}: return read_dispatched(elem, None) - elif iospec.encoding_type in {"dataframe"}: - return read_dispatched(elem, None) return func(elem) adata = read_dispatched(f, callback=callback) diff --git a/anndata/experimental/read_backed/xarray.py b/anndata/experimental/read_backed/xarray.py new file mode 100644 index 000000000..0f869759f --- /dev/null +++ b/anndata/experimental/read_backed/xarray.py @@ -0,0 +1,24 @@ +import xarray as xr +from anndata._core.index import Index, _subset +from 
anndata._core.views import as_view
+
+def get_index_dim(ds):
+    assert len(ds.dims) == 1, f"xarray Dataset should not have more than 1 dim, found {len(ds.dims)}"
+    return list(ds.dims.keys())[0]
+
+class Dataset2D(xr.Dataset):
+
+    @property
+    def shape(self): # aligned mapping classes look for this for DataFrames so this ensures usability with e.g., obsm
+        return [self.dims[get_index_dim(self)], len(self)]
+
+@_subset.register(Dataset2D)
+def _(a: Dataset2D, subset_idx: Index):
+    key = get_index_dim(a)
+    if isinstance(subset_idx, tuple) and len(subset_idx) == 1: # xarray seems to have some code looking for a second entry in tuples
+        return a.isel(**{ key:subset_idx[0] })
+    return a.isel(**{ key:subset_idx })
+
+@as_view.register(Dataset2D)
+def _(a: Dataset2D, view_args):
+    return a
\ No newline at end of file
diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py
index 7207307ea..65859e910 100644
--- a/anndata/tests/test_read_backed_experimental.py
+++ b/anndata/tests/test_read_backed_experimental.py
@@ -68,7 +68,7 @@ def categorical_lazy_arr(tmp_path_factory):
     z["categories"] = np.array(["foo", "bar", "jazz"])
     z.attrs["ordered"] = False
     z = zarr.open(base_path)
-    return LazyCategoricalArray(z["codes"], z["categories"], z.attrs)
+    return LazyCategoricalArray(z["codes"], z["categories"], z.attrs, True)
 
 
 @pytest.fixture()
@@ -398,8 +398,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr):
 
 
 def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask):
-    assert nullable_boolean_lazy_arr_no_mask[0] is True
-    assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all()
+    assert nullable_boolean_lazy_arr_no_mask[0] == True
+    assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all()
     assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all()
 

From 8ce994b6f607a687f6e0083035d778911cc42d3d Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Sun, 30 Jul 2023 21:45:48 +0200
Subject: [PATCH 117/125] (chore): refactor `ZarrArray` `subset` function

---
 anndata/_core/index.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/anndata/_core/index.py b/anndata/_core/index.py
index 92c4eb07d..fe3645b04 100644
--- a/anndata/_core/index.py
+++ b/anndata/_core/index.py
@@ -116,15 +116,18 @@ def unpack_index(index: Index) -> Tuple[Index1D, Index1D]:
 
 
 @singledispatch
-def _subset(a: Union[np.ndarray, pd.DataFrame], subset_idx: Index):
+def _subset(a: np.ndarray, subset_idx: Index):
     # Select as combination of indexes, not coordinates
     # Correcting for indexing behaviour of np.ndarray
     if all(isinstance(x, cabc.Iterable) for x in subset_idx):
         subset_idx = np.ix_(*subset_idx)
-    if isinstance(a, ZarrArray):
-        return a.oindex[subset_idx]
     return a[subset_idx]
 
+@_subset.register(ZarrArray)
+def _subset_zarr(a: ZarrArray, subset_idx: Index):
+    if all(isinstance(x, cabc.Iterable) for x in subset_idx):
+        subset_idx = np.ix_(*subset_idx)
+    return a.oindex[subset_idx]
 
 @_subset.register(DaskArray)
 def _subset_dask(a: DaskArray, subset_idx: Index):

From 6fe7016e4f6d73a915afc21bb8c25bb0879eba65 Mon Sep 17 00:00:00 2001
From: ilan-gold
Date: Tue, 1 Aug 2023 09:42:38 +0200
Subject: [PATCH 118/125] (fix): `backed` for experimental `merge.py`

---
 anndata/experimental/merge.py | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

diff --git a/anndata/experimental/merge.py b/anndata/experimental/merge.py
index 386d06a40..14115f782 100644
--- 
a/anndata/experimental/merge.py +++ b/anndata/experimental/merge.py @@ -34,7 +34,7 @@ resolve_merge_strategy, unify_dtypes, ) -from .._core.sparse_dataset import SparseDataset +from .._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset from .._io.specs import read_elem, write_elem from ..compat import H5Array, H5Group, ZarrArray, ZarrGroup from . import read_dispatched @@ -66,30 +66,31 @@ def _indices_equal(indices: Iterable[pd.Index]) -> bool: def _gen_slice_to_append( - datasets: Sequence[SparseDataset], + datasets: Sequence[BaseCompressedSparseDataset], reindexers, max_loaded_elems: int, axis=0, fill_value=None, ): for ds, ri in zip(datasets, reindexers): - n_slices = ds.shape[axis] * ds.shape[1 - axis] // max_loaded_elems + backed = ds.to_backed() + n_slices = backed.shape[axis] * backed.shape[1 - axis] // max_loaded_elems if n_slices < 2: yield (csr_matrix, csc_matrix)[axis]( ri(to_memory(ds), axis=1 - axis, fill_value=fill_value) ) else: - slice_size = max_loaded_elems // ds.shape[1 - axis] + slice_size = max_loaded_elems // backed.shape[1 - axis] if slice_size == 0: slice_size = 1 - rem_slices = ds.shape[axis] + rem_slices = backed.shape[axis] idx = 0 while rem_slices > 0: ds_part = None if axis == 0: - ds_part = ds[idx : idx + slice_size, :] + ds_part = backed[idx : idx + slice_size, :] elif axis == 1: - ds_part = ds[:, idx : idx + slice_size] + ds_part = backed[:, idx : idx + slice_size] yield (csr_matrix, csc_matrix)[axis]( ri(ds_part, axis=1 - axis, fill_value=fill_value) @@ -138,12 +139,12 @@ def _(store, *args, **kwargs): def read_as_backed(group: Union[ZarrGroup, H5Group]): """ Read the group until - SparseDataset, Array or EAGER_TYPES are encountered. + BaseCompressedSparseDataset, Array or EAGER_TYPES are encountered. """ def callback(func, elem_name: str, elem, iospec): if iospec.encoding_type in SPARSE_MATRIX: - return SparseDataset(elem) + return sparse_dataset(elem) elif iospec.encoding_type in EAGER_TYPES: return read_elem(elem) elif iospec.encoding_type == "array": @@ -195,7 +196,7 @@ def write_concat_dense( def write_concat_sparse( - datasets: Sequence[SparseDataset], + datasets: Sequence[BaseCompressedSparseDataset], output_group: Union[ZarrGroup, H5Group], output_path: Union[ZarrGroup, H5Group], max_loaded_elems: int, @@ -207,7 +208,7 @@ def write_concat_sparse( Writes and concatenates sparse datasets into a single output dataset. Args: - datasets (Sequence[SparseDataset]): A sequence of SparseDataset objects to be concatenated. + datasets (Sequence[BaseCompressedSparseDataset]): A sequence of BaseCompressedSparseDataset objects to be concatenated. output_group (Union[ZarrGroup, H5Group]): The output group where the concatenated dataset will be written. output_path (Union[ZarrGroup, H5Group]): The output path where the concatenated dataset will be written. max_loaded_elems (int): The maximum number of sparse elements to load at once. 
@@ -227,7 +228,7 @@ def write_concat_sparse( init_elem = next(elems) write_elem(output_group, output_path, init_elem) del init_elem - out_dataset: SparseDataset = read_as_backed(output_group[output_path]) + out_dataset: BaseCompressedSparseDataset = read_as_backed(output_group[output_path]) for temp_elem in elems: out_dataset.append(temp_elem) del temp_elem @@ -269,7 +270,7 @@ def _write_concat_mappings( def _write_concat_arrays( - arrays: Sequence[Union[ZarrArray, H5Array, SparseDataset]], + arrays: Sequence[Union[ZarrArray, H5Array, BaseCompressedSparseDataset]], output_group, output_path, max_loaded_elems, @@ -291,7 +292,7 @@ def _write_concat_arrays( else: raise NotImplementedError("Cannot reindex arrays with outer join.") - if isinstance(init_elem, SparseDataset): + if isinstance(init_elem, BaseCompressedSparseDataset): expected_sparse_fmt = ["csr", "csc"][axis] if all(a.format_str == expected_sparse_fmt for a in arrays): write_concat_sparse( @@ -314,7 +315,7 @@ def _write_concat_arrays( def _write_concat_sequence( - arrays: Sequence[Union[pd.DataFrame, SparseDataset, H5Array, ZarrArray]], + arrays: Sequence[Union[pd.DataFrame, BaseCompressedSparseDataset, H5Array, ZarrArray]], output_group, output_path, max_loaded_elems, @@ -349,7 +350,7 @@ def _write_concat_sequence( ) write_elem(output_group, output_path, df) elif all( - isinstance(a, (pd.DataFrame, SparseDataset, H5Array, ZarrArray)) for a in arrays + isinstance(a, (pd.DataFrame, BaseCompressedSparseDataset, H5Array, ZarrArray)) for a in arrays ): _write_concat_arrays( arrays, From c3f69358ac4e9562200c709101122d93812aa90e Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 1 Aug 2023 09:42:55 +0200 Subject: [PATCH 119/125] (fix): `pyproject.toml` missing comma --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 8db726602..47f9eb8a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,7 +60,7 @@ dev = [ # static checking "black>=20.8b1", "docutils", - "xarray>=2023.1.0" + "xarray>=2023.1.0", # test speedups "pytest-xdist", ] From deced7cc7b4eac6f5df11b5c21fba9c850e5f180 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 1 Aug 2023 07:50:12 +0000 Subject: [PATCH 120/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/_core/index.py | 2 ++ anndata/experimental/merge.py | 7 ++++-- .../experimental/read_backed/lazy_arrays.py | 4 +-- .../experimental/read_backed/read_backed.py | 11 +++++--- anndata/experimental/read_backed/xarray.py | 25 +++++++++++++------ .../tests/test_read_backed_experimental.py | 4 +-- 6 files changed, 35 insertions(+), 18 deletions(-) diff --git a/anndata/_core/index.py b/anndata/_core/index.py index fe3645b04..f815b565e 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -123,12 +123,14 @@ def _subset(a: np.ndarray, subset_idx: Index): subset_idx = np.ix_(*subset_idx) return a[subset_idx] + @_subset.register(ZarrArray) def _subset_zarr(a: ZarrArray, subset_idx: Index): if all(isinstance(x, cabc.Iterable) for x in subset_idx): subset_idx = np.ix_(*subset_idx) return a.oindex[subset_idx] + @_subset.register(DaskArray) def _subset_dask(a: DaskArray, subset_idx: Index): if isinstance(subset_idx, slice): diff --git a/anndata/experimental/merge.py b/anndata/experimental/merge.py index 14115f782..966bca7df 100644 --- a/anndata/experimental/merge.py +++ b/anndata/experimental/merge.py @@ -315,7 +315,9 
@@ def _write_concat_arrays(
 
 
 def _write_concat_sequence(
-    arrays: Sequence[Union[pd.DataFrame, BaseCompressedSparseDataset, H5Array, ZarrArray]],
+    arrays: Sequence[
+        Union[pd.DataFrame, BaseCompressedSparseDataset, H5Array, ZarrArray]
+    ],
     output_group,
     output_path,
     max_loaded_elems,
@@ -350,7 +352,8 @@ def _write_concat_sequence(
             )
         write_elem(output_group, output_path, df)
     elif all(
-        isinstance(a, (pd.DataFrame, BaseCompressedSparseDataset, H5Array, ZarrArray)) for a in arrays
+        isinstance(a, (pd.DataFrame, BaseCompressedSparseDataset, H5Array, ZarrArray))
+        for a in arrays
     ):
         _write_concat_arrays(
             arrays,
diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py
index 931f99933..fe210b7ac 100644
--- a/anndata/experimental/read_backed/lazy_arrays.py
+++ b/anndata/experimental/read_backed/lazy_arrays.py
@@ -38,7 +38,7 @@ class LazyCategoricalArray(MaskedArrayMixIn):
         "_categories",
         "_categories_cache",
         "group",
-        "_drop_unused_cats"
+        "_drop_unused_cats",
     )
 
     def __init__(self, codes, categories, attrs, _drop_unused_cats, *args, **kwargs):
@@ -54,7 +54,7 @@ def __init__(self, codes, categories, attrs, _drop_unused_cats, *args, **kwargs)
         self._categories = categories
         self._categories_cache = None
         self.attrs = dict(attrs)
-        self._drop_unused_cats = _drop_unused_cats # obsm/varm do not drop, but obs and var do. TODO: Should fix in normal AnnData?
+        self._drop_unused_cats = _drop_unused_cats  # obsm/varm do not drop, but obs and var do. TODO: Should fix in normal AnnData?
 
     @property
     def categories(self):  # __slots__ and cached_property are incompatible
diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/read_backed/read_backed.py
index 4aeae17eb..78edab99d 100644
--- a/anndata/experimental/read_backed/read_backed.py
+++ b/anndata/experimental/read_backed/read_backed.py
@@ -223,7 +223,6 @@ def _normalize_indices(self, index: Optional[Index]) -> Tuple[slice, slice]:
         )
 
     def to_memory(self, exclude=[]):
-
         # nullable and categoricals need special handling because xarray will convert them to numpy arrays first with dtype object
         def get_nullable_and_categorical_cols(ds):
             cols = []
@@ -252,7 +251,7 @@ def to_df(ds, exclude_vars=[]):
         if len(exclude_vars) == 0:
             df = df[list(ds.keys())]
         return df
-    
+
     # handling for AxisArrays
     def backed_dict_to_memory(d, prefix):
         res = {}
@@ -534,8 +533,12 @@ def callback(func, elem_name: str, elem, iospec):
                 d_with_xr[k] = v
             return Dataset2D(d_with_xr)
         elif iospec.encoding_type == "categorical":
-            drop_unused_cats = not (elem_name.startswith('/obsm') or elem_name.startswith('/varm'))
-            return LazyCategoricalArray(elem["codes"], elem["categories"], elem.attrs, drop_unused_cats)
+            drop_unused_cats = not (
+                elem_name.startswith("/obsm") or elem_name.startswith("/varm")
+            )
+            return LazyCategoricalArray(
+                elem["codes"], elem["categories"], elem.attrs, drop_unused_cats
+            )
         elif "nullable" in iospec.encoding_type:
             return LazyMaskedArray(
                 elem["values"],
diff --git a/anndata/experimental/read_backed/xarray.py b/anndata/experimental/read_backed/xarray.py
index 0f869759f..7fe936af3 100644
--- a/anndata/experimental/read_backed/xarray.py
+++ b/anndata/experimental/read_backed/xarray.py
@@ -2,23 +2,32 @@
 from anndata._core.index import Index, _subset
 from anndata._core.views import as_view
 
+
 def get_index_dim(ds):
-    assert len(ds.dims) == 1, f"xarray Dataset should not have more than 1 dim, found {len(ds.dims)}"
+    assert (
+        len(ds.dims) == 1
+    ), f"xarray Dataset should not have more than 1 dim, 
found {len(ds)}" return list(ds.dims.keys())[0] -class Dataset2D(xr.Dataset): +class Dataset2D(xr.Dataset): @property - def shape(self): # aligned mapping classes look for this for DataFrames so this ensures usability with e.g., obsm + def shape( + self, + ): # aligned mapping classes look for this for DataFrames so this ensures usability with e.g., obsm return [self.dims[get_index_dim(self)], len(self)] - + + @_subset.register(Dataset2D) def _(a: xr.DataArray, subset_idx: Index): key = get_index_dim(a) - if isinstance(subset_idx, tuple) and len(subset_idx) == 1: # xarray seems to have some code looking for a second entry in tuples - return a.isel(**{ key:subset_idx[0] }) - return a.isel(**{ key:subset_idx }) + if ( + isinstance(subset_idx, tuple) and len(subset_idx) == 1 + ): # xarray seems to have some code looking for a second entry in tuples + return a.isel(**{key: subset_idx[0]}) + return a.isel(**{key: subset_idx}) + @as_view.register(Dataset2D) def _(a: Dataset2D, view_args): - return a \ No newline at end of file + return a diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 65859e910..39bcdc754 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -398,8 +398,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert nullable_boolean_lazy_arr_no_mask[0] == True - assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() + assert nullable_boolean_lazy_arr_no_mask[0] is True + assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From 48a134ba8e279f0bf1dab08e8d38643ea4b210c7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 8 Aug 2023 09:32:08 +0000 Subject: [PATCH 121/125] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/_core/__init__.py | 2 -- anndata/_core/sparse_dataset.py | 1 - anndata/experimental/read_backed/__init__.py | 2 -- anndata/experimental/read_backed/lazy_arrays.py | 2 +- anndata/tests/test_backed_sparse.py | 1 - anndata/tests/test_read_backed_experimental.py | 1 - 6 files changed, 1 insertion(+), 8 deletions(-) diff --git a/anndata/_core/__init__.py b/anndata/_core/__init__.py index 1fe157178..e69de29bb 100644 --- a/anndata/_core/__init__.py +++ b/anndata/_core/__init__.py @@ -1,2 +0,0 @@ -from .anndata import AnnData -from .aligned_mapping import AxisArrays diff --git a/anndata/_core/sparse_dataset.py b/anndata/_core/sparse_dataset.py index 7291ecbe1..5d0ef76d9 100644 --- a/anndata/_core/sparse_dataset.py +++ b/anndata/_core/sparse_dataset.py @@ -15,7 +15,6 @@ from itertools import accumulate, chain from pathlib import Path from typing import Union, NamedTuple, Tuple, Sequence, Iterable, Type -from warnings import warn import h5py import zarr diff --git a/anndata/experimental/read_backed/__init__.py b/anndata/experimental/read_backed/__init__.py index 563c67aa9..e69de29bb 100644 --- a/anndata/experimental/read_backed/__init__.py +++ b/anndata/experimental/read_backed/__init__.py @@ -1,2 +0,0 @@ -from .read_backed import read_backed -from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/read_backed/lazy_arrays.py 
index fe210b7ac..df02b4b76 100644 --- a/anndata/experimental/read_backed/lazy_arrays.py +++ b/anndata/experimental/read_backed/lazy_arrays.py @@ -1,6 +1,6 @@ from typing import Tuple from anndata._core.index import Index, _subset -from anndata._core.views import _resolve_idx, as_view +from anndata._core.views import as_view from anndata._io.h5ad import read_dataset from anndata.compat import ZarrArray diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py index 83863fcc4..6ae4a1b5c 100644 --- a/anndata/tests/test_backed_sparse.py +++ b/anndata/tests/test_backed_sparse.py @@ -6,7 +6,6 @@ import anndata as ad from anndata._core.anndata import AnnData from anndata._core.sparse_dataset import sparse_dataset -from anndata._io.zarr import read_dataframe from anndata.tests.helpers import assert_equal, subset_func from anndata.experimental import read_dispatched diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 39bcdc754..eaa8bbc36 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -18,7 +18,6 @@ LazyCategoricalArray, LazyMaskedArray, ) -from anndata.utils import asarray from zarr import DirectoryStore From 2806a9fef327fddeda9ec359ec9ad66e703909a9 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 11:49:52 +0200 Subject: [PATCH 122/125] (chore): remove pre-commit deps --- pyproject.toml | 3 --- 1 file changed, 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 47f9eb8a0..50df3959a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,9 +57,6 @@ Home-page = "https://github.com/scverse/anndata" dev = [ # dev version generation "setuptools_scm", - # static checking - "black>=20.8b1", - "docutils", "xarray>=2023.1.0", # test speedups "pytest-xdist", From 516f98410f8f5c67b4542ccd59421a2ecc345511 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 12:03:24 +0200 Subject: [PATCH 123/125] (fix): don't let ruff change `==` for `DataFrame` to `is` --- anndata/tests/test_read_backed_experimental.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index eaa8bbc36..6ac431bb2 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -397,8 +397,8 @@ def test_nullable_boolean_array_subset_subset(nullable_boolean_lazy_arr): def test_nullable_boolean_array_no_mask_equality(nullable_boolean_lazy_arr_no_mask): - assert nullable_boolean_lazy_arr_no_mask[0] is True - assert (nullable_boolean_lazy_arr_no_mask[3:5] is False).all() + assert nullable_boolean_lazy_arr_no_mask[0] == True # noqa + assert (nullable_boolean_lazy_arr_no_mask[3:5] == False).all() # noqa assert (nullable_boolean_lazy_arr_no_mask[5:7] == np.array([True, False])).all() From 60b0ae660d9e5064366f8526ea5afe020c2f2828 Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 12:04:49 +0200 Subject: [PATCH 124/125] (chore): move `xarray` to `test` deps --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a052c5974..e2b12169f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,6 @@ Home-page = "https://github.com/scverse/anndata" dev = [ # dev version generation "setuptools_scm", - "xarray>=2023.1.0", # test speedups "pytest-xdist", ] @@ -89,6 +88,7 @@ test = [ "dask[array]", "awkward>=2.3", 
"pytest_memray", + "xarray>=2023.1.0", ] gpu = [ "cupy", From 9d53307efc00489779f77528d9d7ab315854d5ea Mon Sep 17 00:00:00 2001 From: ilan-gold Date: Tue, 8 Aug 2023 13:42:30 +0200 Subject: [PATCH 125/125] (style): change folder structure --- anndata/experimental/__init__.py | 2 + .../{read_backed => backed}/__init__.py | 0 .../_anndata_backed.py} | 120 +----------------- anndata/experimental/backed/_io.py | 119 +++++++++++++++++ .../lazy_arrays.py => backed/_lazy_arrays.py} | 0 .../xarray.py => backed/_xarray.py} | 0 .../tests/test_read_backed_experimental.py | 4 +- 7 files changed, 127 insertions(+), 118 deletions(-) rename anndata/experimental/{read_backed => backed}/__init__.py (100%) rename anndata/experimental/{read_backed/read_backed.py => backed/_anndata_backed.py} (74%) create mode 100644 anndata/experimental/backed/_io.py rename anndata/experimental/{read_backed/lazy_arrays.py => backed/_lazy_arrays.py} (100%) rename anndata/experimental/{read_backed/xarray.py => backed/_xarray.py} (100%) diff --git a/anndata/experimental/__init__.py b/anndata/experimental/__init__.py index b16d2cd32..bd7548de4 100644 --- a/anndata/experimental/__init__.py +++ b/anndata/experimental/__init__.py @@ -6,6 +6,7 @@ from anndata._io.specs import read_elem, write_elem, IOSpec from ._dispatch_io import read_dispatched, write_dispatched from .merge import concat_on_disk +from .backed._io import read_backed __all__ = [ "AnnCollection", @@ -16,4 +17,5 @@ "write_dispatched", "IOSpec", "concat_on_disk", + "read_backed", ] diff --git a/anndata/experimental/read_backed/__init__.py b/anndata/experimental/backed/__init__.py similarity index 100% rename from anndata/experimental/read_backed/__init__.py rename to anndata/experimental/backed/__init__.py diff --git a/anndata/experimental/read_backed/read_backed.py b/anndata/experimental/backed/_anndata_backed.py similarity index 74% rename from anndata/experimental/read_backed/read_backed.py rename to anndata/experimental/backed/_anndata_backed.py index 78edab99d..a1e1a02ed 100644 --- a/anndata/experimental/read_backed/read_backed.py +++ b/anndata/experimental/backed/_anndata_backed.py @@ -1,34 +1,23 @@ from collections import OrderedDict, abc as cabc -from pathlib import Path from typing import ( - MutableMapping, Optional, - Union, Sequence, Tuple, ) -import h5py import numpy as np from anndata._core.aligned_mapping import Layers, PairwiseArrays, AxisArrays -from anndata._core.anndata import StorageType, _check_2d_shape +from anndata._core.anndata import StorageType, _check_2d_shape, AnnData from anndata._core.anndata_base import AbstractAnnData from anndata._core.index import Index, _normalize_indices, _subset from anndata._core.raw import Raw -from anndata._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset +from anndata._core.sparse_dataset import BaseCompressedSparseDataset from anndata._core.views import _resolve_idxs -from anndata._io.h5ad import read_dataset from anndata.compat import DaskArray from anndata.utils import convert_to_dict - -import zarr -import xarray as xr import pandas as pd -import dask.array as da -from ..._core import AnnData -from .. 
import read_dispatched -from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray -from .xarray import Dataset2D + +from ._xarray import Dataset2D class AnnDataBacked(AbstractAnnData): @@ -461,104 +450,3 @@ def __repr__(self): if len(keys) > 0: descr += f"\n {attr}: {str(list(keys))[1:-1]}" return descr - - -def read_backed( - store: Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset] -) -> AnnData: - """Lazily read in on-disk/in-cloud AnnData stores. A new, but familiar, AnnData object will be returned. - No array data should need to be read into memory, with exception of non-obs/var dataframes and Awkward Arrays. - - Args: - store (Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]): A store-like object to be read in. If `zarr`, it is best - for it to be consolidated. - - Returns: - AnnData: A lazily read-in AnnData object. - """ - is_h5 = False - if isinstance(store, Path) or isinstance(store, str): - store = str(store) - if store.endswith("h5ad"): - is_h5 = True - - has_keys = True # true if consolidated or h5ad - if not is_h5: - try: - f = zarr.open_consolidated(store, mode="r") - except KeyError: - has_keys = False - f = zarr.open(store, mode="r") - else: - f = h5py.File(store, mode="r") - - def callback(func, elem_name: str, elem, iospec): - if iospec.encoding_type == "anndata" or elem_name.endswith("/"): - cols = ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw"] - iter_object = ( - elem.items() if has_keys else [(k, elem[k]) for k in cols if k in elem] - ) - return AnnDataBacked( - **{k: read_dispatched(v, callback) for k, v in iter_object}, file=elem - ) - elif elem_name.startswith("/raw"): - return None - elif iospec.encoding_type in {"dataframe"}: - iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [ - (elem.attrs["_index"], elem[elem.attrs["_index"]]) - ] - d = {k: read_dispatched(v, callback) for k, v in iter_object} - d_with_xr = {} - index_label = f'{elem_name.replace("/", "")}_names' - for k in d: - v = d[k] - if type(v) == DaskArray and k != elem.attrs["_index"]: - d_with_xr[k] = xr.DataArray( - v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k - ) - elif ( - type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray - ) and k != elem.attrs["_index"]: - d_with_xr[k] = xr.DataArray( - xr.core.indexing.LazilyIndexedArray(v), - coords=[d[elem.attrs["_index"]]], - dims=[index_label], - name=k, - ) - elif k == elem.attrs["_index"]: - d_with_xr[index_label] = xr.DataArray( - v, coords=[v], dims=[index_label], name=index_label - ) - else: - d_with_xr[k] = v - return Dataset2D(d_with_xr) - elif iospec.encoding_type == "categorical": - drop_unused_cats = not ( - elem_name.startswith("/obsm") or elem_name.startswith("/varm") - ) - return LazyCategoricalArray( - elem["codes"], elem["categories"], elem.attrs, drop_unused_cats - ) - elif "nullable" in iospec.encoding_type: - return LazyMaskedArray( - elem["values"], - elem["mask"] if "mask" in elem else None, - iospec.encoding_type, - ) - elif iospec.encoding_type in {"array", "string-array"}: - if is_h5: - if iospec.encoding_type == "string-array": - elem = read_dataset(elem) - if not hasattr(elem, "chunks") or elem.chunks is None: - return da.from_array(elem, chunks=(1000,) * len(elem.shape)) - return da.from_array(elem) - return da.from_zarr(elem) - elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: - return sparse_dataset(elem) - elif iospec.encoding_type in {"awkward-array"}: - return read_dispatched(elem, None) - return func(elem) - - adata = 
read_dispatched(f, callback=callback)
-
-    return adata
diff --git a/anndata/experimental/backed/_io.py b/anndata/experimental/backed/_io.py
new file mode 100644
index 000000000..a2708be88
--- /dev/null
+++ b/anndata/experimental/backed/_io.py
@@ -0,0 +1,119 @@
+from pathlib import Path
+from typing import (
+    MutableMapping,
+    Union,
+)
+
+from anndata._core.sparse_dataset import sparse_dataset
+from anndata._io.h5ad import read_dataset
+from anndata.compat import DaskArray
+import h5py
+import zarr
+import xarray as xr
+import dask.array as da
+
+from .. import read_dispatched
+from ._lazy_arrays import LazyCategoricalArray, LazyMaskedArray
+from ._xarray import Dataset2D
+from ._anndata_backed import AnnDataBacked
+
+
+def read_backed(
+    store: Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]
+) -> AnnDataBacked:
+    """Lazily read in on-disk/in-cloud AnnData stores. A new, but familiar, AnnData object will be returned.
+    No array data should need to be read into memory, with the exception of non-obs/var dataframes and Awkward Arrays.
+
+    Args:
+        store (Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]): A store-like object to be read in. If `zarr`, it is best
+        for it to be consolidated.
+
+    Returns:
+        AnnDataBacked: A lazily read-in AnnData object.
+    """
+    is_h5 = False
+    if isinstance(store, Path) or isinstance(store, str):
+        store = str(store)
+        if store.endswith("h5ad"):
+            is_h5 = True
+
+    has_keys = True  # true if consolidated or h5ad
+    if not is_h5:
+        try:
+            f = zarr.open_consolidated(store, mode="r")
+        except KeyError:
+            has_keys = False
+            f = zarr.open(store, mode="r")
+    else:
+        f = h5py.File(store, mode="r")
+
+    def callback(func, elem_name: str, elem, iospec):
+        if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
+            cols = ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw"]
+            iter_object = (
+                elem.items() if has_keys else [(k, elem[k]) for k in cols if k in elem]
+            )
+            return AnnDataBacked(
+                **{k: read_dispatched(v, callback) for k, v in iter_object}, file=elem
+            )
+        elif elem_name.startswith("/raw"):
+            return None
+        elif iospec.encoding_type in {"dataframe"}:
+            iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [
+                (elem.attrs["_index"], elem[elem.attrs["_index"]])
+            ]
+            d = {k: read_dispatched(v, callback) for k, v in iter_object}
+            d_with_xr = {}
+            index_label = f'{elem_name.replace("/", "")}_names'
+            for k in d:
+                v = d[k]
+                if type(v) == DaskArray and k != elem.attrs["_index"]:
+                    d_with_xr[k] = xr.DataArray(
+                        v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k
+                    )
+                elif (
+                    type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray
+                ) and k != elem.attrs["_index"]:
+                    d_with_xr[k] = xr.DataArray(
+                        xr.core.indexing.LazilyIndexedArray(v),
+                        coords=[d[elem.attrs["_index"]]],
+                        dims=[index_label],
+                        name=k,
+                    )
+                elif k == elem.attrs["_index"]:
+                    d_with_xr[index_label] = xr.DataArray(
+                        v, coords=[v], dims=[index_label], name=index_label
+                    )
+                else:
+                    d_with_xr[k] = v
+            return Dataset2D(d_with_xr)
+        elif iospec.encoding_type == "categorical":
+            drop_unused_cats = not (
+                elem_name.startswith("/obsm") or elem_name.startswith("/varm")
+            )
+            return LazyCategoricalArray(
+                elem["codes"], elem["categories"], elem.attrs, drop_unused_cats
+            )
+        elif "nullable" in iospec.encoding_type:
+            return LazyMaskedArray(
+                elem["values"],
+                elem["mask"] if "mask" in elem else None,
+                iospec.encoding_type,
+            )
+        elif iospec.encoding_type in {"array", "string-array"}:
+            if is_h5:
+                if iospec.encoding_type == 
"string-array": + elem = read_dataset(elem) + if not hasattr(elem, "chunks") or elem.chunks is None: + return da.from_array(elem, chunks=(1000,) * len(elem.shape)) + return da.from_array(elem) + return da.from_zarr(elem) + elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}: + return sparse_dataset(elem) + elif iospec.encoding_type in {"awkward-array"}: + return read_dispatched(elem, None) + return func(elem) + + adata = read_dispatched(f, callback=callback) + + return adata diff --git a/anndata/experimental/read_backed/lazy_arrays.py b/anndata/experimental/backed/_lazy_arrays.py similarity index 100% rename from anndata/experimental/read_backed/lazy_arrays.py rename to anndata/experimental/backed/_lazy_arrays.py diff --git a/anndata/experimental/read_backed/xarray.py b/anndata/experimental/backed/_xarray.py similarity index 100% rename from anndata/experimental/read_backed/xarray.py rename to anndata/experimental/backed/_xarray.py diff --git a/anndata/tests/test_read_backed_experimental.py b/anndata/tests/test_read_backed_experimental.py index 6ac431bb2..141d710e4 100644 --- a/anndata/tests/test_read_backed_experimental.py +++ b/anndata/tests/test_read_backed_experimental.py @@ -13,8 +13,8 @@ gen_typed_df, assert_equal, ) -from anndata.experimental.read_backed import ( - read_backed, +from anndata.experimental import read_backed +from anndata.experimental.backed._lazy_arrays import ( LazyCategoricalArray, LazyMaskedArray, )