Commit

(style): change folder structure
ilan-gold committed Aug 8, 2023
1 parent 60b0ae6 commit 9d53307
Showing 7 changed files with 127 additions and 118 deletions.
2 changes: 2 additions & 0 deletions anndata/experimental/__init__.py
@@ -6,6 +6,7 @@
from anndata._io.specs import read_elem, write_elem, IOSpec
from ._dispatch_io import read_dispatched, write_dispatched
from .merge import concat_on_disk
from .backed._io import read_backed

__all__ = [
"AnnCollection",
Expand All @@ -16,4 +17,5 @@
"write_dispatched",
"IOSpec",
"concat_on_disk",
"read_backed",
]
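
With this re-export, `read_backed` can be imported from the public `anndata.experimental` namespace rather than from an internal module. A minimal sketch (the store path is a hypothetical example):

from anndata.experimental import read_backed

# Lazily open a hypothetical zarr store; no array data is read yet.
adata = read_backed("path/to/store.zarr")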
File renamed without changes.
@@ -1,34 +1,23 @@
from collections import OrderedDict, abc as cabc
from pathlib import Path
from typing import (
    MutableMapping,
    Optional,
    Union,
    Sequence,
    Tuple,
)

import h5py
import numpy as np
from anndata._core.aligned_mapping import Layers, PairwiseArrays, AxisArrays
from anndata._core.anndata import StorageType, _check_2d_shape
from anndata._core.anndata import StorageType, _check_2d_shape, AnnData
from anndata._core.anndata_base import AbstractAnnData
from anndata._core.index import Index, _normalize_indices, _subset
from anndata._core.raw import Raw
from anndata._core.sparse_dataset import BaseCompressedSparseDataset, sparse_dataset
from anndata._core.sparse_dataset import BaseCompressedSparseDataset
from anndata._core.views import _resolve_idxs
from anndata._io.h5ad import read_dataset
from anndata.compat import DaskArray
from anndata.utils import convert_to_dict

import zarr
import xarray as xr
import pandas as pd
import dask.array as da
from ..._core import AnnData
from .. import read_dispatched
from .lazy_arrays import LazyCategoricalArray, LazyMaskedArray
from .xarray import Dataset2D

from ._xarray import Dataset2D


class AnnDataBacked(AbstractAnnData):
@@ -461,104 +450,3 @@ def __repr__(self):
            if len(keys) > 0:
                descr += f"\n {attr}: {str(list(keys))[1:-1]}"
        return descr


def read_backed(
    store: Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]
) -> AnnData:
    """Lazily read in on-disk/in-cloud AnnData stores. A new, but familiar, AnnData object will be returned.
    No array data should need to be read into memory, with the exception of non-obs/var dataframes and Awkward Arrays.

    Args:
        store (Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]): A store-like object to be read in.
            If `zarr`, it is best for it to be consolidated.

    Returns:
        AnnData: A lazily read-in AnnData object.
    """
    is_h5 = False
    if isinstance(store, Path) or isinstance(store, str):
        store = str(store)
        if store.endswith("h5ad"):
            is_h5 = True

    has_keys = True  # true if consolidated or h5ad
    if not is_h5:
        try:
            f = zarr.open_consolidated(store, mode="r")
        except KeyError:
            has_keys = False
            f = zarr.open(store, mode="r")
    else:
        f = h5py.File(store, mode="r")

    def callback(func, elem_name: str, elem, iospec):
        if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
            cols = ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw"]
            iter_object = (
                elem.items() if has_keys else [(k, elem[k]) for k in cols if k in elem]
            )
            return AnnDataBacked(
                **{k: read_dispatched(v, callback) for k, v in iter_object}, file=elem
            )
        elif elem_name.startswith("/raw"):
            return None
        elif iospec.encoding_type in {"dataframe"}:
            iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [
                (elem.attrs["_index"], elem[elem.attrs["_index"]])
            ]
            d = {k: read_dispatched(v, callback) for k, v in iter_object}
            d_with_xr = {}
            index_label = f'{elem_name.replace("/", "")}_names'
            for k in d:
                v = d[k]
                if type(v) == DaskArray and k != elem.attrs["_index"]:
                    d_with_xr[k] = xr.DataArray(
                        v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k
                    )
                elif (
                    type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray
                ) and k != elem.attrs["_index"]:
                    d_with_xr[k] = xr.DataArray(
                        xr.core.indexing.LazilyIndexedArray(v),
                        coords=[d[elem.attrs["_index"]]],
                        dims=[index_label],
                        name=k,
                    )
                elif k == elem.attrs["_index"]:
                    d_with_xr[index_label] = xr.DataArray(
                        v, coords=[v], dims=[index_label], name=index_label
                    )
                else:
                    d_with_xr[k] = v
            return Dataset2D(d_with_xr)
        elif iospec.encoding_type == "categorical":
            drop_unused_cats = not (
                elem_name.startswith("/obsm") or elem_name.startswith("/varm")
            )
            return LazyCategoricalArray(
                elem["codes"], elem["categories"], elem.attrs, drop_unused_cats
            )
        elif "nullable" in iospec.encoding_type:
            return LazyMaskedArray(
                elem["values"],
                elem["mask"] if "mask" in elem else None,
                iospec.encoding_type,
            )
        elif iospec.encoding_type in {"array", "string-array"}:
            if is_h5:
                if iospec.encoding_type == "string-array":
                    elem = read_dataset(elem)
                if not hasattr(elem, "chunks") or elem.chunks is None:
                    return da.from_array(elem, chunks=(1000,) * len(elem.shape))
                return da.from_array(elem)
            return da.from_zarr(elem)
        elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}:
            return sparse_dataset(elem)
        elif iospec.encoding_type in {"awkward-array"}:
            return read_dispatched(elem, None)
        return func(elem)

    adata = read_dispatched(f, callback=callback)

    return adata
119 changes: 119 additions & 0 deletions anndata/experimental/backed/_io.py
@@ -0,0 +1,119 @@
from pathlib import Path
from typing import (
    MutableMapping,
    Union,
)

from anndata._core.sparse_dataset import sparse_dataset
from anndata._io.h5ad import read_dataset
from anndata.compat import DaskArray
import h5py
import zarr
import xarray as xr
import dask.array as da

from .. import read_dispatched
from ._lazy_arrays import LazyCategoricalArray, LazyMaskedArray
from ._xarray import Dataset2D
from ._anndata_backed import AnnDataBacked


def read_backed(
    store: Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]
) -> AnnDataBacked:
    """Lazily read in on-disk/in-cloud AnnData stores. A new, but familiar, AnnData object will be returned.
    No array data should need to be read into memory, with the exception of non-obs/var dataframes and Awkward Arrays.

    Args:
        store (Union[str, Path, MutableMapping, zarr.Group, h5py.Dataset]): A store-like object to be read in.
            If `zarr`, it is best for it to be consolidated.

    Returns:
        AnnDataBacked: A lazily read-in AnnData object.
    """
    is_h5 = False
    if isinstance(store, Path) or isinstance(store, str):
        store = str(store)
        if store.endswith("h5ad"):
            is_h5 = True

    has_keys = True  # true if consolidated or h5ad
    if not is_h5:
        try:
            f = zarr.open_consolidated(store, mode="r")
        except KeyError:
            has_keys = False
            f = zarr.open(store, mode="r")
    else:
        f = h5py.File(store, mode="r")

    def callback(func, elem_name: str, elem, iospec):
        if iospec.encoding_type == "anndata" or elem_name.endswith("/"):
            # Top-level group: dispatch lazily on each field and wrap the result.
            cols = ["obs", "var", "obsm", "varm", "obsp", "varp", "layers", "X", "raw"]
            iter_object = (
                elem.items() if has_keys else [(k, elem[k]) for k in cols if k in elem]
            )
            return AnnDataBacked(
                **{k: read_dispatched(v, callback) for k, v in iter_object}, file=elem
            )
        elif elem_name.startswith("/raw"):
            # raw is not read in.
            return None
        elif iospec.encoding_type in {"dataframe"}:
            # Dataframes become xarray-backed Dataset2D objects sharing an index coordinate.
            iter_object = [(k, elem[k]) for k in elem.attrs["column-order"]] + [
                (elem.attrs["_index"], elem[elem.attrs["_index"]])
            ]
            d = {k: read_dispatched(v, callback) for k, v in iter_object}
            d_with_xr = {}
            index_label = f'{elem_name.replace("/", "")}_names'
            for k in d:
                v = d[k]
                if type(v) == DaskArray and k != elem.attrs["_index"]:
                    d_with_xr[k] = xr.DataArray(
                        v, coords=[d[elem.attrs["_index"]]], dims=[index_label], name=k
                    )
                elif (
                    type(v) == LazyCategoricalArray or type(v) == LazyMaskedArray
                ) and k != elem.attrs["_index"]:
                    d_with_xr[k] = xr.DataArray(
                        xr.core.indexing.LazilyIndexedArray(v),
                        coords=[d[elem.attrs["_index"]]],
                        dims=[index_label],
                        name=k,
                    )
                elif k == elem.attrs["_index"]:
                    d_with_xr[index_label] = xr.DataArray(
                        v, coords=[v], dims=[index_label], name=index_label
                    )
                else:
                    d_with_xr[k] = v
            return Dataset2D(d_with_xr)
        elif iospec.encoding_type == "categorical":
            # Unused categories are only dropped outside of obsm/varm.
            drop_unused_cats = not (
                elem_name.startswith("/obsm") or elem_name.startswith("/varm")
            )
            return LazyCategoricalArray(
                elem["codes"], elem["categories"], elem.attrs, drop_unused_cats
            )
        elif "nullable" in iospec.encoding_type:
            return LazyMaskedArray(
                elem["values"],
                elem["mask"] if "mask" in elem else None,
                iospec.encoding_type,
            )
        elif iospec.encoding_type in {"array", "string-array"}:
            # Dense arrays become dask arrays; h5py datasets may need explicit chunks.
            if is_h5:
                if iospec.encoding_type == "string-array":
                    elem = read_dataset(elem)
                if not hasattr(elem, "chunks") or elem.chunks is None:
                    return da.from_array(elem, chunks=(1000,) * len(elem.shape))
                return da.from_array(elem)
            return da.from_zarr(elem)
        elif iospec.encoding_type in {"csr_matrix", "csc_matrix"}:
            # Sparse matrices stay on disk.
            return sparse_dataset(elem)
        elif iospec.encoding_type in {"awkward-array"}:
            # Awkward arrays are read eagerly via the default reader.
            return read_dispatched(elem, None)
        return func(elem)

    adata = read_dispatched(f, callback=callback)

    return adata
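
The function above is re-exported as `anndata.experimental.read_backed` (see the `__init__.py` change). As a usage sketch, with hypothetical file paths:

from pathlib import Path

from anndata.experimental import read_backed

# h5ad stores are detected by suffix; zarr stores are opened with
# zarr.open_consolidated, falling back to zarr.open plus a fixed key list.
adata_zarr = read_backed("data/example.zarr")
adata_h5ad = read_backed(Path("data/example.h5ad"))

# obs/var are returned as xarray-backed Dataset2D objects; X and layers come
# back as dask arrays or on-disk sparse datasets, so values are materialized
# only on access.

Because a non-consolidated zarr store takes the slower fallback path, consolidating metadata first (for example with `zarr.consolidate_metadata`) is worthwhile, as the docstring suggests.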
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions anndata/tests/test_read_backed_experimental.py
@@ -13,8 +13,8 @@
    gen_typed_df,
    assert_equal,
)
from anndata.experimental.read_backed import (
    read_backed,
from anndata.experimental import read_backed
from anndata.experimental.backed._lazy_arrays import (
    LazyCategoricalArray,
    LazyMaskedArray,
)
