From cf0208b7d86d41607e84e4a5c28ef5b49231c7e1 Mon Sep 17 00:00:00 2001 From: Ilan Gold Date: Mon, 26 Aug 2024 08:03:09 -0400 Subject: [PATCH] Backport PR #1589: (fix): disallow using dataframes with multi index columns (#1609) --- docs/release-notes/0.10.9.md | 1 + src/anndata/_core/aligned_mapping.py | 30 ++++++++++++++++++---------- src/anndata/_core/anndata.py | 18 +++++++++++++---- src/anndata/_core/storage.py | 8 +++++++- src/anndata/tests/helpers.py | 11 ++++++++++ src/anndata/utils.py | 9 +++++++++ tests/test_annot.py | 8 ++++++++ tests/test_base.py | 10 +++++++++- tests/test_obsmvarm.py | 7 +++++++ 9 files changed, 85 insertions(+), 17 deletions(-) diff --git a/docs/release-notes/0.10.9.md b/docs/release-notes/0.10.9.md index 55948a2a1..519704b3a 100644 --- a/docs/release-notes/0.10.9.md +++ b/docs/release-notes/0.10.9.md @@ -9,6 +9,7 @@ * Upper bound {mod}`numpy` for `gpu` installation on account of {issue}`cupy/cupy#8391` {pr}`1540` {user}`ilan-gold` * Fix writing large number of columns for `h5` files {pr}`1147` {user}`ilan-gold` {user}`selmanozleyen` * Upper bound dask on account of {issue}`1579` {pr}`1580` {user}`ilan-gold` +* Disallow using {class}`~pandas.DataFrame`s with multi-index columns {pr}`1589` {user}`ilan-gold` * Ensure setting {attr}`pandas.DataFrame.index` on a view of a {class}`~anndata.AnnData` instantiates the {class}`~pandas.DataFrame` from the view {pr}`1586` {user}`ilan-gold` #### Documentation diff --git a/src/anndata/_core/aligned_mapping.py b/src/anndata/_core/aligned_mapping.py index 461829028..b84f3ee80 100644 --- a/src/anndata/_core/aligned_mapping.py +++ b/src/anndata/_core/aligned_mapping.py @@ -13,7 +13,13 @@ from .._warnings import ExperimentalFeatureWarning, ImplicitModificationWarning from ..compat import AwkArray -from ..utils import convert_to_dict, deprecated, dim_len, warn_once +from ..utils import ( + convert_to_dict, + deprecated, + dim_len, + raise_value_error_if_multiindex_columns, + warn_once, +) from .access import ElementRef from .index import _subset from .storage import coerce_array @@ -258,16 +264,18 @@ def to_df(self) -> pd.DataFrame: return df def _validate_value(self, val: Value, key: str) -> Value: - if isinstance(val, pd.DataFrame) and not val.index.equals(self.dim_names): - # Could probably also re-order index if it’s contained - try: - pd.testing.assert_index_equal(val.index, self.dim_names) - except AssertionError as e: - msg = f"value.index does not match parent’s {self.dim} names:\n{e}" - raise ValueError(msg) from None - else: - msg = "Index.equals and pd.testing.assert_index_equal disagree" - raise AssertionError(msg) + if isinstance(val, pd.DataFrame): + raise_value_error_if_multiindex_columns(val, f"{self.attrname}[{key!r}]") + if not val.index.equals(self.dim_names): + # Could probably also re-order index if it’s contained + try: + pd.testing.assert_index_equal(val.index, self.dim_names) + except AssertionError as e: + msg = f"value.index does not match parent’s {self.dim} names:\n{e}" + raise ValueError(msg) from None + else: + msg = "Index.equals and pd.testing.assert_index_equal disagree" + raise AssertionError(msg) return super()._validate_value(val, key) @property diff --git a/src/anndata/_core/anndata.py b/src/anndata/_core/anndata.py index 37d72594a..fa563c5de 100644 --- a/src/anndata/_core/anndata.py +++ b/src/anndata/_core/anndata.py @@ -29,7 +29,12 @@ _move_adj_mtx, ) from ..logging import anndata_logger as logger -from ..utils import deprecated, dim_len, ensure_df_homogeneous +from ..utils import ( + deprecated, + dim_len, + ensure_df_homogeneous, + raise_value_error_if_multiindex_columns, +) from .access import ElementRef from .aligned_df import _gen_dataframe from .aligned_mapping import AlignedMappingProperty, AxisArrays, Layers, PairwiseArrays @@ -234,9 +239,13 @@ def __init__( *, obsp: np.ndarray | Mapping[str, Sequence[Any]] | None = None, varp: np.ndarray | Mapping[str, Sequence[Any]] | None = None, - oidx: Index1D = None, - vidx: Index1D = None, + oidx: Index1D | None = None, + vidx: Index1D | None = None, ): + # check for any multi-indices that aren’t later checked in coerce_array + for attr, key in [(obs, "obs"), (var, "var"), (X, "X")]: + if isinstance(attr, pd.DataFrame): + raise_value_error_if_multiindex_columns(attr, key) if asview: if not isinstance(X, AnnData): raise ValueError("`X` has to be an AnnData object.") @@ -731,9 +740,10 @@ def n_vars(self) -> int: """Number of variables/features.""" return len(self.var_names) - def _set_dim_df(self, value: pd.DataFrame, attr: str): + def _set_dim_df(self, value: pd.DataFrame, attr: Literal["obs", "var"]): if not isinstance(value, pd.DataFrame): raise ValueError(f"Can only assign pd.DataFrame to {attr}.") + raise_value_error_if_multiindex_columns(value, attr) value_idx = self._prep_dim_index(value.index, attr) if self.is_view: self._init_as_actual(self.copy()) diff --git a/src/anndata/_core/storage.py b/src/anndata/_core/storage.py index 93a0717a2..7f10a3346 100644 --- a/src/anndata/_core/storage.py +++ b/src/anndata/_core/storage.py @@ -19,7 +19,11 @@ ZappyArray, ZarrArray, ) -from ..utils import ensure_df_homogeneous, join_english +from ..utils import ( + ensure_df_homogeneous, + join_english, + raise_value_error_if_multiindex_columns, +) from .sparse_dataset import BaseCompressedSparseDataset if TYPE_CHECKING: @@ -82,6 +86,8 @@ def coerce_array( value = value.A return value if isinstance(value, pd.DataFrame): + if allow_df: + raise_value_error_if_multiindex_columns(value, name) return value if allow_df else ensure_df_homogeneous(value, name) # if value is an array-like object, try to convert it e = None diff --git a/src/anndata/tests/helpers.py b/src/anndata/tests/helpers.py index e1b47387e..ab38ce41f 100644 --- a/src/anndata/tests/helpers.py +++ b/src/anndata/tests/helpers.py @@ -1,5 +1,6 @@ from __future__ import annotations +import itertools import random import re import warnings @@ -785,3 +786,13 @@ def __init__(self, *_args, **_kwargs) -> None: raise ImportError( "zarr must be imported to create an `AccessTrackingStore` instance." ) + + +def get_multiindex_columns_df(shape): + return pd.DataFrame( + np.random.rand(shape[0], shape[1]), + columns=pd.MultiIndex.from_tuples( + list(itertools.product(["a"], range(shape[1] - (shape[1] // 2)))) + + list(itertools.product(["b"], range(shape[1] // 2))) + ), + ) diff --git a/src/anndata/utils.py b/src/anndata/utils.py index a1684b340..ade4e7197 100644 --- a/src/anndata/utils.py +++ b/src/anndata/utils.py @@ -399,3 +399,12 @@ def is_hidden(attr) -> bool: for item in type.__dir__(cls) if not is_hidden(getattr(cls, item, None)) ] + + +def raise_value_error_if_multiindex_columns(df: pd.DataFrame, attr: str): + if isinstance(df.columns, pd.MultiIndex): + msg = ( + "MultiIndex columns are not supported in AnnData. " + f"Please use a single-level index for {attr}." + ) + raise ValueError(msg) diff --git a/tests/test_annot.py b/tests/test_annot.py index 7f4c2697e..8287add0d 100644 --- a/tests/test_annot.py +++ b/tests/test_annot.py @@ -8,6 +8,7 @@ from natsort import natsorted import anndata as ad +from anndata.tests.helpers import get_multiindex_columns_df @pytest.mark.parametrize("dtype", [object, "string"]) @@ -63,3 +64,10 @@ def test_non_str_to_not_categorical(): result_non_transformed = adata.obs.drop(columns=["str_with_nan"]) pd.testing.assert_frame_equal(expected_non_transformed, result_non_transformed) + + +def test_error_multiindex(): + adata = ad.AnnData(np.random.rand(100, 10)) + df = get_multiindex_columns_df((adata.shape[0], 20)) + with pytest.raises(ValueError, match=r"MultiIndex columns are not supported"): + adata.obs = df diff --git a/tests/test_base.py b/tests/test_base.py index 4072f496f..0cfcae1a7 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -12,7 +12,7 @@ from scipy.sparse import csr_matrix, issparse from anndata import AnnData, ImplicitModificationWarning -from anndata.tests.helpers import assert_equal, gen_adata +from anndata.tests.helpers import assert_equal, gen_adata, get_multiindex_columns_df # some test objects that we use below adata_dense = AnnData(np.array([[1, 2], [3, 4]])) @@ -113,6 +113,14 @@ def test_create_from_df(): assert df.index.tolist() == ad.obs_names.tolist() +@pytest.mark.parametrize("attr", ["X", "obs", "obsm"]) +def test_error_create_from_multiindex_df(attr): + df = get_multiindex_columns_df((100, 20)) + val = df if attr != "obsm" else {"df": df} + with pytest.raises(ValueError, match=r"MultiIndex columns are not supported"): + AnnData(**{attr: val}, shape=(100, 10)) + + def test_create_from_sparse_df(): s = sp.random(20, 30, density=0.2) obs_names = [f"obs{i}" for i in range(20)] diff --git a/tests/test_obsmvarm.py b/tests/test_obsmvarm.py index 63fbc9bc6..91516de2f 100644 --- a/tests/test_obsmvarm.py +++ b/tests/test_obsmvarm.py @@ -7,6 +7,7 @@ from scipy import sparse from anndata import AnnData +from anndata.tests.helpers import get_multiindex_columns_df M, N = (100, 100) @@ -137,3 +138,9 @@ def test_shape_error(adata: AnnData): ), ): adata.obsm["b"] = np.zeros((adata.shape[0] + 1, adata.shape[0])) + + +def test_error_set_multiindex_df(adata: AnnData): + df = get_multiindex_columns_df((adata.shape[0], 20)) + with pytest.raises(ValueError, match=r"MultiIndex columns are not supported"): + adata.obsm["df"] = df