diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py
index 3fc67dd92..b37e33177 100644
--- a/anndata/_core/anndata.py
+++ b/anndata/_core/anndata.py
@@ -25,7 +25,7 @@
 from numpy import ma
 from pandas.api.types import infer_dtype, is_string_dtype
 from scipy import sparse
-from scipy.sparse import csr_matrix, issparse
+from scipy.sparse import issparse
 
 from anndata._warnings import ImplicitModificationWarning
 
@@ -592,28 +592,37 @@ def _init_as_actual(
         # layers
         self._layers = Layers(self, layers)
 
-    def __sizeof__(self, show_stratified=None) -> int:
-        def get_size(X):
-            if issparse(X):
-                X_csr = csr_matrix(X)
-                return X_csr.data.nbytes + X_csr.indptr.nbytes + X_csr.indices.nbytes
+    def __sizeof__(self, show_stratified=None, with_disk: bool = False) -> int:
+        def get_size(X) -> int:
+            def cs_to_bytes(X) -> int:
+                return int(X.data.nbytes + X.indptr.nbytes + X.indices.nbytes)
+
+            if isinstance(X, h5py.Dataset) and with_disk:
+                return int(np.array(X.shape).prod() * X.dtype.itemsize)
+            elif isinstance(X, BaseCompressedSparseDataset) and with_disk:
+                return cs_to_bytes(X._to_backed())
+            elif isinstance(X, (sparse.csr_matrix, sparse.csc_matrix)):
+                return cs_to_bytes(X)
             else:
                 return X.__sizeof__()
 
-        size = 0
-        attrs = list(["_X", "_obs", "_var"])
-        attrs_multi = list(["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"])
+        sizes = {}
+        attrs = ["X", "_obs", "_var"]
+        attrs_multi = ["_uns", "_obsm", "_varm", "varp", "_obsp", "_layers"]
         for attr in attrs + attrs_multi:
             if attr in attrs_multi:
                 keys = getattr(self, attr).keys()
-                s = sum([get_size(getattr(self, attr)[k]) for k in keys])
+                s = sum(get_size(getattr(self, attr)[k]) for k in keys)
             else:
                 s = get_size(getattr(self, attr))
             if s > 0 and show_stratified:
-                str_attr = attr.replace("_", ".") + " " * (7 - len(attr))
-                print(f"Size of {str_attr}: {'%3.2f' % (s / (1024 ** 2))} MB")
-            size += s
-        return size
+                from tqdm import tqdm
+
+                print(
+                    f"Size of {attr.replace('_', '.'):<7}: {tqdm.format_sizeof(s, 'B')}"
+                )
+            sizes[attr] = s
+        return sum(sizes.values())
 
     def _gen_repr(self, n_obs, n_vars) -> str:
         if self.isbacked:
diff --git a/anndata/tests/test_backed_sparse.py b/anndata/tests/test_backed_sparse.py
index 777f2e430..d2a337346 100644
--- a/anndata/tests/test_backed_sparse.py
+++ b/anndata/tests/test_backed_sparse.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Callable, Literal
+
 import h5py
 import numpy as np
 import pytest
@@ -12,6 +15,11 @@
 from anndata.experimental import read_dispatched
 from anndata.tests.helpers import assert_equal, subset_func
 
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from numpy.typing import ArrayLike
+
 subset_func2 = subset_func
 
 
@@ -21,7 +29,9 @@ def diskfmt(request):
 
 
 @pytest.fixture(scope="function")
-def ondisk_equivalent_adata(tmp_path, diskfmt):
+def ondisk_equivalent_adata(
+    tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]
+) -> tuple[AnnData, AnnData, AnnData, AnnData]:
     csr_path = tmp_path / f"csr.{diskfmt}"
     csc_path = tmp_path / f"csc.{diskfmt}"
     dense_path = tmp_path / f"dense.{diskfmt}"
@@ -68,7 +78,11 @@ def callback(func, elem_name, elem, iospec):
     return csr_mem, csr_disk, csc_disk, dense_disk
 
 
-def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2):
+def test_backed_indexing(
+    ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
+    subset_func,
+    subset_func2,
+):
     csr_mem, csr_disk, csc_disk, dense_disk = ondisk_equivalent_adata
 
     obs_idx = subset_func(csr_mem.obs_names)
@@ -87,7 +101,12 @@ def test_backed_indexing(ondisk_equivalent_adata, subset_func, subset_func2):
         pytest.param(sparse.csc_matrix, sparse.hstack),
     ],
 )
-def test_dataset_append_memory(tmp_path, sparse_format, append_method, diskfmt):
+def test_dataset_append_memory(
+    tmp_path: Path,
+    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
+    append_method: Callable[[list[sparse.spmatrix]], sparse.spmatrix],
+    diskfmt: Literal["h5ad", "zarr"],
+):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -115,7 +134,12 @@ def test_dataset_append_memory(tmp_path, sparse_format, append_method, diskfmt):
         pytest.param(sparse.csc_matrix, sparse.hstack),
     ],
 )
-def test_dataset_append_disk(tmp_path, sparse_format, append_method, diskfmt):
+def test_dataset_append_disk(
+    tmp_path: Path,
+    sparse_format: Callable[[ArrayLike], sparse.spmatrix],
+    append_method: Callable[[list[sparse.spmatrix]], sparse.spmatrix],
+    diskfmt: Literal["h5ad", "zarr"],
+):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -146,7 +170,13 @@
         pytest.param("csc", (100, 100), (200, 100)),
     ],
 )
-def test_wrong_shape(tmp_path, sparse_format, a_shape, b_shape, diskfmt):
+def test_wrong_shape(
+    tmp_path: Path,
+    sparse_format: Literal["csr", "csc"],
+    a_shape: tuple[int, int],
+    b_shape: tuple[int, int],
+    diskfmt: Literal["h5ad", "zarr"],
+):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -167,7 +197,7 @@
         a_disk.append(b_disk)
 
 
-def test_wrong_formats(tmp_path, diskfmt):
+def test_wrong_formats(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -198,7 +228,7 @@
     assert not np.any((pre_checks != post_checks).toarray())
 
 
-def test_anndata_sparse_compat(tmp_path, diskfmt):
+def test_anndata_sparse_compat(tmp_path: Path, diskfmt: Literal["h5ad", "zarr"]):
     path = (
         tmp_path / f"test.{diskfmt.replace('ad', '')}"
     )  # diskfmt is either h5ad or zarr
@@ -212,3 +242,28 @@
     ad._io.specs.write_elem(f, "/", base)
     adata = ad.AnnData(sparse_dataset(f["/"]))
     assert_equal(adata.X, base)
+
+
+@contextmanager
+def xfail_if_zarr(diskfmt: Literal["h5ad", "zarr"]):
+    if diskfmt == "zarr":
+        with pytest.raises(AssertionError):
+            yield
+        # TODO: Zarr backed mode https://github.com/scverse/anndata/issues/219
+        pytest.xfail("Backed zarr not really supported yet")
+    else:
+        yield
+
+
+def test_backed_sizeof(
+    ondisk_equivalent_adata: tuple[AnnData, AnnData, AnnData, AnnData],
+    diskfmt: Literal["h5ad", "zarr"],
+):
+    csr_mem, csr_disk, csc_disk, _ = ondisk_equivalent_adata
+
+    assert csr_mem.__sizeof__() == csr_disk.__sizeof__(with_disk=True)
+    assert csr_mem.__sizeof__() == csc_disk.__sizeof__(with_disk=True)
+    assert csr_disk.__sizeof__(with_disk=True) == csc_disk.__sizeof__(with_disk=True)
+    with xfail_if_zarr(diskfmt):
+        assert csr_mem.__sizeof__() > csr_disk.__sizeof__()
+        assert csr_mem.__sizeof__() > csc_disk.__sizeof__()
diff --git a/docs/release-notes/0.10.4.md b/docs/release-notes/0.10.4.md
index 0941f91fd..2a9760e6e 100644
--- a/docs/release-notes/0.10.4.md
+++ b/docs/release-notes/0.10.4.md
@@ -3,6 +3,7 @@
 ```{rubric} Bugfix
 ```
 * Only try to use `Categorical.map(na_action=…)` in actually supported Pandas ≥2.1 {pr}`1226` {user}`flying-sheep`
+* `AnnData.__sizeof__()` support for backed datasets {pr}`1230` {user}`Neah-Ko`
 
 ```{rubric} Documentation
 ```
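For reviewers, a minimal sketch (not part of the patch) of how the new `with_disk` flag behaves after an h5ad round-trip; `demo.h5ad` is an illustrative path, and `write_h5ad`/`read_h5ad` are existing anndata API:

```python
import anndata as ad
from scipy import sparse

# Create an AnnData with a sparse X and write it to disk.
adata = ad.AnnData(X=sparse.random(1000, 100, format="csr", density=0.1))
adata.write_h5ad("demo.h5ad")

# In memory, X contributes its data + indptr + indices bytes to the total.
mem_bytes = adata.__sizeof__()

# In backed mode X stays on disk, so by default it adds almost nothing.
backed = ad.read_h5ad("demo.h5ad", backed="r")
assert backed.__sizeof__() < mem_bytes

# with_disk=True also counts the on-disk representation; for this
# round-trip the totals match, as test_backed_sizeof asserts above.
assert backed.__sizeof__(with_disk=True) == mem_bytes

# show_stratified prints a per-attribute breakdown (requires tqdm).
backed.__sizeof__(show_stratified=True, with_disk=True)
```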