Merge branch 'main' into ooc-docs
ivirshup authored Oct 6, 2023
2 parents 321214e + 6a969eb commit cc6d6ea
Showing 10 changed files with 495 additions and 142 deletions.
2 changes: 1 addition & 1 deletion anndata/_core/anndata.py
@@ -1681,7 +1681,7 @@ def concatenate(
... dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]),
... )
>>> adata3 = AnnData(
... np.array([[1, 2, 3], [4, 5, 6]]),
... np.array([[1, 2, 3], [4, 5, 6]]),
... dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']),
... dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]),
... )
114 changes: 76 additions & 38 deletions anndata/experimental/merge.py
@@ -2,7 +2,7 @@

import os
import shutil
from collections.abc import Collection, Iterable, Mapping, MutableMapping, Sequence
from collections.abc import Collection, Iterable, Mapping, Sequence
from functools import singledispatch
from pathlib import Path
from typing import (
@@ -105,7 +105,9 @@ def as_group(store, *args, **kwargs) -> ZarrGroup | H5Group:


@as_group.register(os.PathLike)
def _(store: os.PathLike, *args, **kwargs) -> ZarrGroup | H5Group:
@as_group.register(str)
def _(store: os.PathLike | str, *args, **kwargs) -> ZarrGroup | H5Group:
store = Path(store)
if store.suffix == ".h5ad":
import h5py

@@ -115,11 +117,6 @@ def _(store: os.PathLike, *args, **kwargs) -> ZarrGroup | H5Group:
return zarr.open_group(store, *args, **kwargs)


@as_group.register(str)
def _(store: str, *args, **kwargs) -> ZarrGroup | H5Group:
return as_group(Path(store), *args, **kwargs)


@as_group.register(ZarrGroup)
@as_group.register(H5Group)
def _(store, *args, **kwargs):
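
Editor's note: the change above collapses a separate `str` handler (which merely forwarded to the `os.PathLike` one) into a single registration. A minimal sketch of the `functools.singledispatch` pattern involved, outside of anndata — the `describe` function and its output format are illustrative only, not library API:

import os
from functools import singledispatch
from pathlib import Path


@singledispatch
def describe(store) -> str:
    """Fallback for unregistered store types."""
    raise NotImplementedError(f"Unsupported type: {type(store)}")


# Stacked `register` calls bind one implementation to several types,
# so a plain string no longer needs its own forwarding handler.
@describe.register(os.PathLike)
@describe.register(str)
def _(store: os.PathLike | str) -> str:
    store = Path(store)  # normalize once, up front
    return f"{store.suffix or 'no suffix'}: {store}"


print(describe("data.h5ad"))         # .h5ad: data.h5ad
print(describe(Path("store.zarr")))  # .zarr: store.zarr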
@@ -395,33 +392,33 @@ def _write_dim_annot(groups, output_group, dim, concat_indices, label, label_col


def concat_on_disk(
in_files: Collection[str | os.PathLike] | MutableMapping[str, str | os.PathLike],
in_files: Collection[str | os.PathLike] | Mapping[str, str | os.PathLike],
out_file: str | os.PathLike,
*,
overwrite: bool = False,
max_loaded_elems: int = 100_000_000,
axis: Literal[0, 1] = 0,
join: Literal["inner", "outer"] = "inner",
merge: StrategiesLiteral | Callable[[Collection[Mapping]], Mapping] | None = None,
uns_merge: StrategiesLiteral
| Callable[[Collection[Mapping]], Mapping]
| None = None,
uns_merge: (
StrategiesLiteral | Callable[[Collection[Mapping]], Mapping] | None
) = None,
label: str | None = None,
keys: Collection[str] | None = None,
index_unique: str | None = None,
fill_value: Any | None = None,
pairwise: bool = False,
) -> None:
"""Concatenates multiple AnnData objects along a specified axis using their
"""\
Concatenates multiple AnnData objects along a specified axis using their
corresponding stores or paths, and writes the resulting AnnData object
to a target location on disk.
Unlike the `concat` function, this method does not require
Unlike :func:`anndata.concat`, this method does not require
loading the input AnnData objects into memory,
making it a memory-efficient alternative for large datasets.
The resulting object written to disk should be equivalent
to the concatenation of the loaded AnnData objects using
the `concat` function.
:func:`anndata.concat`.
To adjust the maximum amount of data loaded in memory; for sparse
arrays use the max_loaded_elems argument; for dense arrays
@@ -436,19 +433,16 @@ def concat_on_disk(
argument and values are concatenated.
out_file
The target path or store to write the result in.
overwrite
If `False` while a file already exists it will raise an error,
otherwise it will overwrite.
max_loaded_elems
The maximum number of elements to load in memory when concatenating
sparse arrays. Note that this number also includes the empty entries.
Set to 100m by default meaning roughly 400mb will be loaded
to memory at simultaneously.
to memory simultaneously.
axis
Which axis to concatenate along.
join
How to align values when concatenating. If "outer", the union of the other axis
is taken. If "inner", the intersection. See :doc:`concatenation <../concatenation>`
How to align values when concatenating. If `"outer"`, the union of the other axis
is taken. If `"inner"`, the intersection. See :doc:`concatenation <../concatenation>`
for more.
merge
How elements not aligned to the axis being concatenated along are selected.
@@ -471,7 +465,7 @@
incrementing integer labels.
index_unique
Whether to make the index unique by using the keys. If provided, this
is the delimiter between "{orig_idx}{index_unique}{key}". When `None`,
is the delimiter between `"{orig_idx}{index_unique}{key}"`. When `None`,
the original indices are kept.
fill_value
When `join="outer"`, this is the value that will be used to fill the introduced
@@ -483,13 +477,58 @@
Notes
-----
.. warning::
If you use `join='outer'` this fills 0s for sparse data when
variables are absent in a batch. Use this with care. Dense data is
filled with `NaN`.
If you use `join='outer'` this fills 0s for sparse data when
variables are absent in a batch. Use this with care. Dense data is
filled with `NaN`.
Examples
--------
See :func:`anndata.concat` for the semantics.
The following examples highlight the differences this function has.
First, let’s get some “big” datasets with a compatible ``var`` axis:
>>> import httpx
>>> import scanpy as sc
>>> api_url = "https://api.cellxgene.cziscience.com/curation/v1"
>>> def get_cellxgene_data(id_: str):
... out_path = sc.settings.datasetdir / f'{id_}.h5ad'
... if out_path.exists():
... return out_path
... ds_versions = httpx.get(f'{api_url}/datasets/{id_}/versions').raise_for_status().json()
... ds = ds_versions[0] # newest
... file_url = next(a['url'] for a in ds['assets'] if a['filetype'] == 'H5AD')
... sc.settings.datasetdir.mkdir(parents=True, exist_ok=True)
... with httpx.stream('GET', file_url) as r, out_path.open('wb') as f:
... r.raise_for_status()
... for data in r.iter_bytes():
... f.write(data)
... return out_path
>>> path_b_cells = get_cellxgene_data('0895c838-e550-48a3-a777-dbcd35d30272')
>>> path_fetal = get_cellxgene_data('08e94873-c2a6-4f7d-ab72-aeaff3e3f929')
Now we can concatenate them on-disk:
>>> import anndata as ad
>>> ad.experimental.concat_on_disk(
... dict(b_cells=path_b_cells, fetal=path_fetal),
... 'merged.h5ad',
... label='dataset',
... )
>>> adata = ad.read_h5ad('merged.h5ad', backed=True)
>>> adata.X
CSRDataset: backend hdf5, shape (490, 15585), data_dtype float32
>>> adata.obs['dataset'].value_counts()
dataset
fetal 344
b_cells 146
Name: count, dtype: int64
"""
if len(in_files) == 0:
raise ValueError("No objects to concatenate.")

# Argument normalization
if pairwise:
raise NotImplementedError("pairwise concatenation not yet implemented")
@@ -498,14 +537,11 @@

merge = resolve_merge_strategy(merge)
uns_merge = resolve_merge_strategy(uns_merge)
if len(in_files) <= 1:
if len(in_files) == 1:
if not overwrite and Path(out_file).is_file():
raise FileExistsError(
f"File “{out_file}” already exists and `overwrite` is set to False"
)
shutil.copy2(in_files[0], out_file)
return

out_file = Path(out_file)
if not out_file.parent.exists():
raise FileNotFoundError(f"Parent directory of {out_file} does not exist.")

if isinstance(in_files, Mapping):
if keys is not None:
raise TypeError(
Expand All @@ -516,15 +552,17 @@ def concat_on_disk(
else:
in_files = list(in_files)

if len(in_files) == 1:
shutil.copy2(in_files[0], out_file)
return

if keys is None:
keys = np.arange(len(in_files)).astype(str)

_, dim = _resolve_dim(axis=axis)
_, alt_dim = _resolve_dim(axis=1 - axis)

mode = "w" if overwrite else "w-"

output_group = as_group(out_file, mode=mode)
output_group = as_group(out_file, mode="w")
groups = [as_group(f) for f in in_files]

use_reindexing = False
15 changes: 15 additions & 0 deletions anndata/tests/test_concatenate_disk.py
@@ -250,3 +250,18 @@ def gen_index(n):

def test_concatenate_obsm_inner(obsm_adatas, tmp_path, file_format):
assert_eq_concat_on_disk(obsm_adatas, tmp_path, file_format, join="inner")


def test_output_dir_exists(tmp_path):
in_pth = tmp_path / "in.h5ad"
out_pth = tmp_path / "does_not_exist" / "out.h5ad"

AnnData(X=np.ones((5, 1))).write_h5ad(in_pth)

with pytest.raises(FileNotFoundError, match=f"{out_pth}"):
concat_on_disk([in_pth], out_pth)


def test_failure_w_no_args(tmp_path):
with pytest.raises(ValueError, match="No objects to concatenate"):
concat_on_disk([], tmp_path / "out.h5ad")
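
Editor's note: a minimal, self-contained usage sketch of the API surface these tests cover; the directory and file names below are illustrative only:

from pathlib import Path

import numpy as np
from anndata import AnnData, read_h5ad
from anndata.experimental import concat_on_disk

tmp = Path("concat_demo")
tmp.mkdir(exist_ok=True)

# Write two small on-disk inputs.
for name in ("a", "b"):
    AnnData(X=np.ones((5, 3), dtype=np.float32)).write_h5ad(tmp / f"{name}.h5ad")

# With a Mapping input, the dict keys label each source dataset
# (passing `keys=` alongside a Mapping raises a TypeError).
concat_on_disk(
    {"a": tmp / "a.h5ad", "b": tmp / "b.h5ad"},
    tmp / "merged.h5ad",
    label="dataset",
)

print(read_h5ad(tmp / "merged.h5ad").obs["dataset"].value_counts())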
1 change: 1 addition & 0 deletions docs/conf.py
@@ -51,6 +51,7 @@
"sphinx.ext.autosummary",
"sphinx_autodoc_typehints", # needs to be after napoleon
"sphinx_issues",
"sphinx_design",
"sphinxext.opengraph",
"scanpydoc", # needs to be before linkcode
"sphinx.ext.linkcode",
