From 9e0e9f1cbd13f3951a3d1a6bd7d76461b5a56872 Mon Sep 17 00:00:00 2001 From: Philipp A Date: Thu, 5 Oct 2023 21:31:27 +0200 Subject: [PATCH 1/5] Add-concat-on-disk-examples (#1161) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix bug of in_files being dict of len 1 * add simple example * add test dep * Create parent dir * simplify * don’t write to dataset dir * test value counts * more comprehensive * no more overwrite * Standardize behaviour: error if directory doesn't exist, error if no objects passed --------- Co-authored-by: Isaac Virshup --- anndata/_core/anndata.py | 2 +- anndata/experimental/merge.py | 114 ++++++++++++++++--------- anndata/tests/test_concatenate_disk.py | 15 ++++ pyproject.toml | 1 + 4 files changed, 93 insertions(+), 39 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index b04e3b161..944fc66a4 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -1681,7 +1681,7 @@ def concatenate( ... dict(var_names=['d', 'c', 'b'], annoA=[0, 1, 2]), ... ) >>> adata3 = AnnData( - ... np.array([[1, 2, 3], [4, 5, 6]]), + ... np.array([[1, 2, 3], [4, 5, 6]]), ... dict(obs_names=['s1', 's2'], anno2=['d3', 'd4']), ... dict(var_names=['d', 'c', 'b'], annoA=[0, 2, 3], annoB=[0, 1, 2]), ... ) diff --git a/anndata/experimental/merge.py b/anndata/experimental/merge.py index 711bcd52d..2413b3348 100644 --- a/anndata/experimental/merge.py +++ b/anndata/experimental/merge.py @@ -2,7 +2,7 @@ import os import shutil -from collections.abc import Collection, Iterable, Mapping, MutableMapping, Sequence +from collections.abc import Collection, Iterable, Mapping, Sequence from functools import singledispatch from pathlib import Path from typing import ( @@ -105,7 +105,9 @@ def as_group(store, *args, **kwargs) -> ZarrGroup | H5Group: @as_group.register(os.PathLike) -def _(store: os.PathLike, *args, **kwargs) -> ZarrGroup | H5Group: +@as_group.register(str) +def _(store: os.PathLike | str, *args, **kwargs) -> ZarrGroup | H5Group: + store = Path(store) if store.suffix == ".h5ad": import h5py @@ -115,11 +117,6 @@ def _(store: os.PathLike, *args, **kwargs) -> ZarrGroup | H5Group: return zarr.open_group(store, *args, **kwargs) -@as_group.register(str) -def _(store: str, *args, **kwargs) -> ZarrGroup | H5Group: - return as_group(Path(store), *args, **kwargs) - - @as_group.register(ZarrGroup) @as_group.register(H5Group) def _(store, *args, **kwargs): @@ -395,33 +392,33 @@ def _write_dim_annot(groups, output_group, dim, concat_indices, label, label_col def concat_on_disk( - in_files: Collection[str | os.PathLike] | MutableMapping[str, str | os.PathLike], + in_files: Collection[str | os.PathLike] | Mapping[str, str | os.PathLike], out_file: str | os.PathLike, *, - overwrite: bool = False, max_loaded_elems: int = 100_000_000, axis: Literal[0, 1] = 0, join: Literal["inner", "outer"] = "inner", merge: StrategiesLiteral | Callable[[Collection[Mapping]], Mapping] | None = None, - uns_merge: StrategiesLiteral - | Callable[[Collection[Mapping]], Mapping] - | None = None, + uns_merge: ( + StrategiesLiteral | Callable[[Collection[Mapping]], Mapping] | None + ) = None, label: str | None = None, keys: Collection[str] | None = None, index_unique: str | None = None, fill_value: Any | None = None, pairwise: bool = False, ) -> None: - """Concatenates multiple AnnData objects along a specified axis using their + """\ + Concatenates multiple AnnData objects along a specified axis using their corresponding stores or paths, and writes 
the resulting AnnData object to a target location on disk. - Unlike the `concat` function, this method does not require + Unlike :func:`anndata.concat`, this method does not require loading the input AnnData objects into memory, making it a memory-efficient alternative for large datasets. The resulting object written to disk should be equivalent to the concatenation of the loaded AnnData objects using - the `concat` function. + :func:`anndata.concat`. To adjust the maximum amount of data loaded in memory; for sparse arrays use the max_loaded_elems argument; for dense arrays @@ -436,19 +433,16 @@ def concat_on_disk( argument and values are concatenated. out_file The target path or store to write the result in. - overwrite - If `False` while a file already exists it will raise an error, - otherwise it will overwrite. max_loaded_elems The maximum number of elements to load in memory when concatenating sparse arrays. Note that this number also includes the empty entries. Set to 100m by default meaning roughly 400mb will be loaded - to memory at simultaneously. + to memory simultaneously. axis Which axis to concatenate along. join - How to align values when concatenating. If "outer", the union of the other axis - is taken. If "inner", the intersection. See :doc:`concatenation <../concatenation>` + How to align values when concatenating. If `"outer"`, the union of the other axis + is taken. If `"inner"`, the intersection. See :doc:`concatenation <../concatenation>` for more. merge How elements not aligned to the axis being concatenated along are selected. @@ -471,7 +465,7 @@ def concat_on_disk( incrementing integer labels. index_unique Whether to make the index unique by using the keys. If provided, this - is the delimiter between "{orig_idx}{index_unique}{key}". When `None`, + is the delimiter between `"{orig_idx}{index_unique}{key}"`. When `None`, the original indices are kept. fill_value When `join="outer"`, this is the value that will be used to fill the introduced @@ -483,13 +477,58 @@ def concat_on_disk( Notes ----- - .. warning:: - - If you use `join='outer'` this fills 0s for sparse data when - variables are absent in a batch. Use this with care. Dense data is - filled with `NaN`. + If you use `join='outer'` this fills 0s for sparse data when + variables are absent in a batch. Use this with care. Dense data is + filled with `NaN`. + + Examples + -------- + + See :func:`anndata.concat` for the semantics. + The following examples highlight the differences this function has. + + First, let’s get some “big” datasets with a compatible ``var`` axis: + + >>> import httpx + >>> import scanpy as sc + >>> api_url = "https://api.cellxgene.cziscience.com/curation/v1" + >>> def get_cellxgene_data(id_: str): + ... out_path = sc.settings.datasetdir / f'{id_}.h5ad' + ... if out_path.exists(): + ... return out_path + ... ds_versions = httpx.get(f'{api_url}/datasets/{id_}/versions').raise_for_status().json() + ... ds = ds_versions[0] # newest + ... file_url = next(a['url'] for a in ds['assets'] if a['filetype'] == 'H5AD') + ... sc.settings.datasetdir.mkdir(parents=True, exist_ok=True) + ... with httpx.stream('GET', file_url) as r, out_path.open('wb') as f: + ... r.raise_for_status() + ... for data in r.iter_bytes(): + ... f.write(data) + ... 
return out_path + >>> path_b_cells = get_cellxgene_data('0895c838-e550-48a3-a777-dbcd35d30272') + >>> path_fetal = get_cellxgene_data('08e94873-c2a6-4f7d-ab72-aeaff3e3f929') + + Now we can concatenate them on-disk: + + >>> import anndata as ad + >>> ad.experimental.concat_on_disk( + ... dict(b_cells=path_b_cells, fetal=path_fetal), + ... 'merged.h5ad', + ... label='dataset', + ... ) + >>> adata = ad.read_h5ad('merged.h5ad', backed=True) + >>> adata.X + CSRDataset: backend hdf5, shape (490, 15585), data_dtype float32 + >>> adata.obs['dataset'].value_counts() + dataset + fetal 344 + b_cells 146 + Name: count, dtype: int64 """ + if len(in_files) == 0: + raise ValueError("No objects to concatenate.") + # Argument normalization if pairwise: raise NotImplementedError("pairwise concatenation not yet implemented") @@ -498,14 +537,11 @@ def concat_on_disk( merge = resolve_merge_strategy(merge) uns_merge = resolve_merge_strategy(uns_merge) - if len(in_files) <= 1: - if len(in_files) == 1: - if not overwrite and Path(out_file).is_file(): - raise FileExistsError( - f"File “{out_file}” already exists and `overwrite` is set to False" - ) - shutil.copy2(in_files[0], out_file) - return + + out_file = Path(out_file) + if not out_file.parent.exists(): + raise FileNotFoundError(f"Parent directory of {out_file} does not exist.") + if isinstance(in_files, Mapping): if keys is not None: raise TypeError( @@ -516,15 +552,17 @@ def concat_on_disk( else: in_files = list(in_files) + if len(in_files) == 1: + shutil.copy2(in_files[0], out_file) + return + if keys is None: keys = np.arange(len(in_files)).astype(str) _, dim = _resolve_dim(axis=axis) _, alt_dim = _resolve_dim(axis=1 - axis) - mode = "w" if overwrite else "w-" - - output_group = as_group(out_file, mode=mode) + output_group = as_group(out_file, mode="w") groups = [as_group(f) for f in in_files] use_reindexing = False diff --git a/anndata/tests/test_concatenate_disk.py b/anndata/tests/test_concatenate_disk.py index 0192df452..f9eab9540 100644 --- a/anndata/tests/test_concatenate_disk.py +++ b/anndata/tests/test_concatenate_disk.py @@ -250,3 +250,18 @@ def gen_index(n): def test_concatenate_obsm_inner(obsm_adatas, tmp_path, file_format): assert_eq_concat_on_disk(obsm_adatas, tmp_path, file_format, join="inner") + + +def test_output_dir_exists(tmp_path): + in_pth = tmp_path / "in.h5ad" + out_pth = tmp_path / "does_not_exist" / "out.h5ad" + + AnnData(X=np.ones((5, 1))).write_h5ad(in_pth) + + with pytest.raises(FileNotFoundError, match=f"{out_pth}"): + concat_on_disk([in_pth], out_pth) + + +def test_failure_w_no_args(tmp_path): + with pytest.raises(ValueError, match="No objects to concatenate"): + concat_on_disk([], tmp_path / "out.h5ad") diff --git a/pyproject.toml b/pyproject.toml index e375d4700..9a2f514ea 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,7 @@ test = [ "joblib", "boltons", "scanpy", + "httpx", # For data downloading "dask[array,distributed]", "awkward>=2.3", "pytest_memray", From 5e8102dad4c16282b2c02ef23c0ab9c4db1383b0 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 6 Oct 2023 13:25:34 +0200 Subject: [PATCH 2/5] Add zarr examples to fileformat docs (#1162) * Add zarr examples to fileformat docs * Release note + minor fixes * Apply changes from review * Add missing line of output --- docs/conf.py | 1 + docs/fileformat-prose.md | 458 ++++++++++++++++++++++++++++------- docs/release-notes/0.10.0.md | 2 + pyproject.toml | 1 + 4 files changed, 370 insertions(+), 92 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 
a25b0f6cf..d5c872c60 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -51,6 +51,7 @@ "sphinx.ext.autosummary", "sphinx_autodoc_typehints", # needs to be after napoleon "sphinx_issues", + "sphinx_design", "sphinxext.opengraph", "scanpydoc", # needs to be before linkcode "sphinx.ext.linkcode", diff --git a/docs/fileformat-prose.md b/docs/fileformat-prose.md index 340be1fdc..9843e9c81 100644 --- a/docs/fileformat-prose.md +++ b/docs/fileformat-prose.md @@ -1,7 +1,7 @@ # On-disk format ```{note} -These docs are written for anndata 0.8. +These docs are written for anndata 0.8+. Files written before this version may differ in some conventions, but will still be read by newer versions of the library. ``` @@ -10,21 +10,42 @@ AnnData objects are saved on disk to hierarchical array stores like [HDF5] (via {doc}`H5py `) and {doc}`zarr:index`. This allows us to have very similar structures in disk and on memory. -As an example we’ll look into a typical `.h5ad` object that’s been through an analysis. -This structure should be largely equivalent to Zarr structure, though there are a few minor differences. +As an example we’ll look into a typical `.h5ad`/ `.zarr` object that’s been through an analysis. +The structures are largely equivalent, though there are a few minor differences when it comes to type encoding. ## Elements - + +`````{tab-set} + +````{tab-item} HDF5 +:sync: hdf5 + ```python >>> import h5py ->>> f = h5py.File("02_processed.h5ad", "r") ->>> list(f.keys()) -['X', 'layers', 'obs', 'obsm', 'uns', 'var', 'varm'] +>>> store = h5py.File("for-ondisk-docs/cart-164k-processed.h5ad", mode="r") +>>> list(store.keys()) +['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp'] ``` +```` + +````{tab-item} Zarr +:sync: zarr + +```python +>>> import zarr +>>> store = zarr.open("for-ondisk-docs/cart-164k-processed.zarr", mode="r") +>>> list(store.keys()) +['X', 'layers', 'obs', 'obsm', 'obsp', 'uns', 'var', 'varm', 'varp'] +``` + +```` + +````` + +```` + +````{tab-item} Zarr +:sync: zarr + +```python +>>> store["X"].visititems(print) +data +indices +indptr +``` + +```` + +````` ### Sparse array specification (v0.1.0) @@ -148,14 +203,17 @@ DataFrames are saved as a columnar format in a group, so each column of a DataFr We save a little more information in the attributes here. ```python ->>> dict(f["obs"].attrs) -{'_index': 'Cell', - 'column-order': array(['sample', 'cell_type', 'n_genes_by_counts', - 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', - 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', - 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes', - 'total_counts_mito', 'log1p_total_counts_mito', 'pct_counts_mito', - 'label_by_score'], dtype=object), +>>> dict(store["var"].attrs) +{'_index': 'ensembl_id', + 'column-order': ['highly_variable', + 'means', + 'variances', + 'variances_norm', + 'feature_is_filtered', + 'feature_name', + 'feature_reference', + 'feature_biotype', + 'mito'], 'encoding-type': 'dataframe', 'encoding-version': '0.2.0'} ``` @@ -163,19 +221,53 @@ We save a little more information in the attributes here. These attributes identify the index of the dataframe, as well as the original order of the columns. Each column in this dataframe is encoded as its own array. 
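Because each column is its own array, one column can be read back without touching the rest of the dataframe. A minimal sketch, assuming the `store` opened above and using the `means` column and `ensembl_id` index named in the attributes just shown (the same calls work for the HDF5 and Zarr stores):

```python
>>> import pandas as pd
>>> var = store["var"]
>>> index = var[var.attrs["_index"]][...].astype(str)  # row labels ("ensembl_id")
>>> means = pd.Series(var["means"][...], index=index)  # reads only this one column
```

Listing the group shows this one-array-per-column layout: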
+`````{tab-set} + +````{tab-item} HDF5 +:sync: hdf5 + ```python ->>> dict(f["obs"]["total_counts"].attrs) -{'encoding-type': 'array', 'encoding-version': '0.2.0'} +>>> store["var"].visititems(print) +ensembl_id +feature_biotype +feature_biotype/categories +feature_biotype/codes +feature_is_filtered +... +``` + +```` + +````{tab-item} Zarr +:sync: zarr + +```python +>>> store["var"].visititems(print) +ensembl_id +feature_biotype +feature_biotype/categories +feature_biotype/codes +feature_is_filtered +... +``` + +```` ->>> dict(f["obs"]["cell_type"].attrs) +````` + +```python +>>> dict(store["var"]["feature_name"].attrs) {'encoding-type': 'categorical', 'encoding-version': '0.2.0', 'ordered': False} + +>>> dict(store["var"]["feature_is_filtered"].attrs) +{'encoding-type': 'array', 'encoding-version': '0.2.0'} ``` ### Dataframe Specification (v0.2.0) * A dataframe MUST be stored as a group * The group's metadata: - * MUST contain the field `"_index"`, whose value is the key of the array to be used as an index + * MUST contain the field `"_index"`, whose value is the key of the array to be used as an index/ row labels * MUST contain encoding metadata `"encoding-type": "dataframe"`, `"encoding-version": "0.2.0"` * MUST contain `"column-order"` an array of strings denoting the order of column entries * The group MUST contain an array for the index @@ -190,15 +282,40 @@ A `Group` is created for any `Mapping` in the AnnData object, including the standard `obsm`, `varm`, `layers`, and `uns`. Notably, this definition is used recursively within `uns`: +`````{tab-set} + +````{tab-item} HDF5 +:sync: hdf5 + ```python ->>> f["uns"].visititems(print) +>>> store["uns"].visititems(print) [...] -pca -pca/variance -pca/variance_ratio +pca +pca/variance +pca/variance_ratio [...] ``` +```` + +````{tab-item} Zarr +:sync: zarr + +```python +>>> store["uns"].visititems(print) +[...] +pca +pca/variance +pca/variance_ratio +[...] +``` + +```` + +````` + + + ### Mapping specifications (v0.1.0) * Each mapping MUST be its own group @@ -209,14 +326,40 @@ pca/variance_ratio Zero dimensional arrays are used for scalar values (i.e. single values like strings, numbers or booleans). These should only occur inside of `uns`, and are commonly saved parameters: +`````{tab-set} + +````{tab-item} HDF5 +:sync: hdf5 + ```python ->>> f["uns/neighbors/params"].visititems(print) +>>> store["uns/neighbors/params"].visititems(print) method metric n_neighbors ->>> f["uns/neighbors/params/metric"][()] +random_state +``` + +```` + +````{tab-item} Zarr +:sync: zarr + +```python +>>> store["uns/neighbors/params"].visititems(print) +method +metric +n_neighbors +random_state +``` + +```` + +````` + +```python +>>> store["uns/neighbors/params/metric"][()] 'euclidean' ->>> dict(f["uns/neighbors/params/metric"].attrs) +>>> dict(store["uns/neighbors/params/metric"].attrs) {'encoding-type': 'string', 'encoding-version': '0.2.0'} ``` @@ -234,7 +377,7 @@ n_neighbors ## Categorical arrays ```python ->>> categorical = f["obs"]["cell_type"] +>>> categorical = store["obs"]["development_stage"] >>> dict(categorical.attrs) {'encoding-type': 'categorical', 'encoding-version': '0.2.0', 'ordered': False} ``` @@ -245,12 +388,32 @@ Each entry in the `codes` array is the zero-based index of the encoded value in To represent a missing value, a code of `-1` is used. We store these two arrays separately. 
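Decoding is then a plain integer lookup, which pandas can do directly. A minimal sketch using the `categorical` group opened above (its `categories`/`codes` layout is shown in the listing below); `pandas.Categorical.from_codes` treats a code of `-1` as a missing value:

```python
>>> import pandas as pd
>>> decoded = pd.Categorical.from_codes(
...     codes=categorical["codes"][...],
...     categories=categorical["categories"][...].astype(str),
...     ordered=bool(categorical.attrs["ordered"]),
... )
```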
+`````{tab-set} + +````{tab-item} HDF5 +:sync: hdf5 + ```python >>> categorical.visititems(print) -categories -codes +categories +codes ``` +```` + +````{tab-item} Zarr +:sync: zarr + +```python +>>> categorical.visititems(print) +categories +codes +``` + +```` + +````` + ### Categorical array specification (v0.2.0) * Categorical arrays MUST be stored as a group @@ -265,6 +428,30 @@ codes Arrays of strings are handled differently than numeric arrays since numpy doesn't really have a good way of representing arrays of unicode strings. `anndata` assumes strings are text-like data, so uses a variable length encoding. +`````{tab-set} + +````{tab-item} HDF5 +:sync: hdf5 + +```python +>>> store["var"][store["var"].attrs["_index"]] + +``` + +```` + +````{tab-item} Zarr +:sync: zarr + +```python +>>> store["var"][store["var"].attrs["_index"]] + +``` + +```` + +````` + ```python >>> dict(categorical["categories"].attrs) {'encoding-type': 'string-array', 'encoding-version': '0.2.0'} @@ -283,20 +470,56 @@ We support IO with Pandas nullable integer and boolean arrays. We represent these on disk similar to `numpy` masked arrays, `julia` nullable arrays, or `arrow` validity bitmaps (see {issue}`504` for more discussion). That is, we store an indicator array (or mask) of null values alongside the array of all values. +`````{tab-set} + +````{tab-item} HDF5 +:sync: hdf5 + +```python +>>> from anndata.experimental import write_elem +>>> null_store = h5py.File("tmp.h5", mode="w") +>>> int_array = pd.array([1, None, 3, 4]) +>>> int_array + +[1, , 3, 4] +Length: 4, dtype: Int64 + +>>> write_elem(null_store, "nullable_integer", int_array) + +>>> null_store.visititems(print) +nullable_integer +nullable_integer/mask +nullable_integer/values +``` + +```` + +````{tab-item} Zarr +:sync: zarr + ```python ->>> h5_file = h5py.File("anndata_format.h5", "a") +>>> from anndata.experimental import write_elem +>>> null_store = zarr.open() >>> int_array = pd.array([1, None, 3, 4]) >>> int_array [1, , 3, 4] Length: 4, dtype: Int64 ->>> write_elem(h5_file, "nullable_integer", int_array) ->>> h5_file["nullable_integer"].visititems(print) -mask -values +>>> write_elem(null_store, "nullable_integer", int_array) ->>> dict(h5_file["nullable_integer"].attrs) +>>> null_store.visititems(print) +nullable_integer +nullable_integer/mask +nullable_integer/values +``` + +```` + +````` + +```python +>>> dict(null_store["nullable_integer"].attrs) {'encoding-type': 'nullable-integer', 'encoding-version': '0.1.0'} ``` @@ -330,54 +553,80 @@ break down the awkward array into it’s constituent arrays using [`ak.to_buffers`](https://awkward-array.readthedocs.io/en/latest/_auto/ak.to_buffers.html) then writing these arrays using `anndata`’s methods. 
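To get a feel for what `ak.to_buffers` produces before anything is written, here is a minimal sketch on a toy ragged array (the toy array and the resulting buffer names are illustrative of the naming scheme, not taken from the file below):

```python
>>> import awkward as ak
>>> form, length, container = ak.to_buffers(ak.Array([[1, 2], [], [3]]))
>>> length  # number of outer entries
3
>>> sorted(container)  # flat buffers, keyed by form node and role
['node0-offsets', 'node1-data']
```

Each buffer in `container` is an ordinary flat array, so it can be stored with the same machinery as any other array, while `form` and `length` carry the structure needed to reassemble it.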
-The container of arrays is stored in a group called `"container"` +`````{tab-set} +````{tab-item} HDF5 +:sync: hdf5 ```python ->>> import zarr ->>> z = zarr.open("airr.zarr", "r") ->>> awkward_group = z["obsm/airr"] ->>> awkward_group.tree() +>>> store["varm/transcript"].visititems(print) +node1-mask +node10-data +node11-mask +node12-offsets +node13-mask +node14-data +node16-offsets +node17-data +node2-offsets +node3-data +node4-mask +node5-offsets +node6-data +node7-mask +node8-offsets +node9-mask ``` -``` -airr - └── container - ├── node0-offsets (17,) int64 - ├── node2-offsets (40,) int64 - ├── node3-data (117,) uint8 - ├── node4-offsets (40,) int64 - └── node5-data (117,) uint8 -``` +```` -The length of the array is saved to it’s own `"length"` attribute, -while metadata for the array structure is serialized and saved to the -`“form”` attribute. +````{tab-item} Zarr +:sync: zarr ```python ->>> dict(awkward_group.attrs) +>>> store["varm/transcript"].visititems(print) +node1-mask +node10-data +node11-mask +node12-offsets +node13-mask +node14-data +node16-offsets +node17-data +node2-offsets +node3-data +node4-mask +node5-offsets +node6-data +node7-mask +node8-offsets +node9-mask ``` +```` + +````` + + + +The length of the array is saved to it’s own `"length"` attribute, +while metadata for the array structure is serialized and saved to the +`“form”` attribute. ```python -{ - 'encoding-type': 'awkward-array', - 'encoding-version': '0.1.0', - 'form': '{"class": "ListOffsetArray", "offsets": "i64", "content": {"class": ' - '"RecordArray", "contents": {"locus": {"class": "ListOffsetArray", ' - '"offsets": "i64", "content": {"class": "NumpyArray", "primitive": ' - '"uint8", "inner_shape": [], "has_identifier": false, "parameters": ' - '{"__array__": "char"}, "form_key": "node3"}, "has_identifier": ' - 'false, "parameters": {"__array__": "string"}, "form_key": "node2"}, ' - '"junction_aa": {"class": "ListOffsetArray", "offsets": "i64", ' - '"content": {"class": "NumpyArray", "primitive": "uint8", ' - '"inner_shape": [], "has_identifier": false, "parameters": ' - '{"__array__": "char"}, "form_key": "node5"}, "has_identifier": ' - 'false, "parameters": {"__array__": "string"}, "form_key": "node4"}}, ' - '"has_identifier": false, "parameters": {}, "form_key": "node1"}, ' - '"has_identifier": false, "parameters": {}, "form_key": "node0"}' - 'length': 16 -} +>>> dict(store["varm/transcript"].attrs) +{'encoding-type': 'awkward-array', + 'encoding-version': '0.1.0', + 'form': '{"class": "RecordArray", "fields": ["tx_id", "seq_name", ' + '"exon_seq_start", "exon_seq_end", "ensembl_id"], "contents": ' + '[{"class": "BitMaskedArray", "mask": "u8", "valid_when": true, ' + '"lsb_order": true, "content": {"class": "ListOffsetArray", ' + '"offsets": "i64", "content": {"class": "NumpyArray", "primitive": ' + '"uint8", "inner_shape": [], "parameters": {"__array__": "char"}, ' + '"form_key": "node3"}, "parameters": {"__array__": "string"}, ' + '"form_key": "node2"}, "parameters": {}, "form_key": "node1"}, ' + ... + 'length': 40145} ``` These can be read back as awkward arrays using the @@ -387,15 +636,40 @@ function: ```python >>> import awkward as ak >>> from anndata.experimental import read_elem +>>> awkward_group = store["varm/transcript"] >>> ak.from_buffers( ... awkward_group.attrs["form"], ... awkward_group.attrs["length"], ... {k: read_elem(v) for k, v in awkward_group.items()} ... 
) -``` - -``` - +>>> transcript_models[:5] +[{tx_id: 'ENST00000450305', seq_name: '1', exon_seq_start: [...], ...}, + {tx_id: 'ENST00000488147', seq_name: '1', exon_seq_start: [...], ...}, + {tx_id: 'ENST00000473358', seq_name: '1', exon_seq_start: [...], ...}, + {tx_id: 'ENST00000477740', seq_name: '1', exon_seq_start: [...], ...}, + {tx_id: 'ENST00000495576', seq_name: '1', exon_seq_start: [...], ...}] +----------------------------------------------------------------------- +type: 5 * { + tx_id: ?string, + seq_name: ?string, + exon_seq_start: option[var * ?int64], + exon_seq_end: option[var * ?int64], + ensembl_id: ?string +} +>>> transcript_models[0] +{tx_id: 'ENST00000450305', + seq_name: '1', + exon_seq_start: [12010, 12179, 12613, 12975, 13221, 13453], + exon_seq_end: [12057, 12227, 12697, 13052, 13374, 13670], + ensembl_id: 'ENSG00000223972'} +------------------------------------------------------------ +type: { + tx_id: ?string, + seq_name: ?string, + exon_seq_start: option[var * ?int64], + exon_seq_end: option[var * ?int64], + ensembl_id: ?string +} ``` diff --git a/docs/release-notes/0.10.0.md b/docs/release-notes/0.10.0.md index d0809f85f..d419eca57 100644 --- a/docs/release-notes/0.10.0.md +++ b/docs/release-notes/0.10.0.md @@ -36,6 +36,8 @@ We expect to make a full release by October. ```{rubric} Documentation ``` +* Added zarr examples to {doc}`file format docs` {pr}`1162` {user}`ivirshup` + ```{rubric} Breaking changes ``` diff --git a/pyproject.toml b/pyproject.toml index 9a2f514ea..7bfbe496a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,7 @@ doc = [ "awkward>=2.0.7", "IPython", # For syntax highlighting in notebooks "myst_parser", + "sphinx_design>=0.5.0", ] test = [ "loompy>=3.0.5", From a4f34eb6716cfd77409c1541172352f52487698b Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 6 Oct 2023 13:40:39 +0200 Subject: [PATCH 3/5] Update release notes (#1165) --- docs/release-notes/0.10.0.md | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/docs/release-notes/0.10.0.md b/docs/release-notes/0.10.0.md index d419eca57..01520e33c 100644 --- a/docs/release-notes/0.10.0.md +++ b/docs/release-notes/0.10.0.md @@ -1,14 +1,4 @@ -### 0.10.0rc1 {small}`2023-09-09` - -````{note} -anndata 0.10.0 is currently available as a release candidate for testing. You can install this version of anndata with: - -``` -pip install -U --pre anndata -``` - -We expect to make a full release by October. 
-```` +### 0.10.0 {small}`2023-10-06` ```{rubric} Features ``` From c6dcffd380cbeefbc8f6f1cefdb449c6f3965a01 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 6 Oct 2023 14:12:37 +0200 Subject: [PATCH 4/5] Start 0.10.1 (#1166) --- docs/release-notes/0.10.1.md | 10 ++++++++++ docs/release-notes/release-latest.md | 3 +++ 2 files changed, 13 insertions(+) create mode 100644 docs/release-notes/0.10.1.md diff --git a/docs/release-notes/0.10.1.md b/docs/release-notes/0.10.1.md new file mode 100644 index 000000000..1b83f8906 --- /dev/null +++ b/docs/release-notes/0.10.1.md @@ -0,0 +1,10 @@ +### 0.10.1 {small}`the future` + +```{rubric} Bugfix +``` + +```{rubric} Documentation +``` + +```{rubric} Performance +``` diff --git a/docs/release-notes/release-latest.md b/docs/release-notes/release-latest.md index 0a89b1582..3203b03e7 100644 --- a/docs/release-notes/release-latest.md +++ b/docs/release-notes/release-latest.md @@ -1,4 +1,7 @@ ## Version 0.10 +```{include} /release-notes/0.10.1.md +``` + ```{include} /release-notes/0.10.0.md ``` From 6a969eb4696029716bc49dbe8686a9fb823f6e4f Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 6 Oct 2023 14:32:19 +0200 Subject: [PATCH 5/5] start 0.11 (#1168) --- docs/release-notes/0.11.0.md | 13 +++++++++++++ docs/release-notes/release-latest.md | 5 +++++ 2 files changed, 18 insertions(+) create mode 100644 docs/release-notes/0.11.0.md diff --git a/docs/release-notes/0.11.0.md b/docs/release-notes/0.11.0.md new file mode 100644 index 000000000..32aabe87a --- /dev/null +++ b/docs/release-notes/0.11.0.md @@ -0,0 +1,13 @@ +### 0.11.0 {small}`the future` + +```{rubric} Features +``` + +```{rubric} Bugfix +``` + +```{rubric} Documentation +``` + +```{rubric} Performance +``` diff --git a/docs/release-notes/release-latest.md b/docs/release-notes/release-latest.md index 3203b03e7..5337aa78f 100644 --- a/docs/release-notes/release-latest.md +++ b/docs/release-notes/release-latest.md @@ -1,3 +1,8 @@ +## Version 0.11 + +```{include} /release-notes/0.11.0.md +``` + ## Version 0.10 ```{include} /release-notes/0.10.1.md