Merge pull request #56 from janelia-cosem/use_pydantic_zarr

use pydantic zarr
janelia-cellmap · Aug 22, 2023 · 0b237e0 · 0b237e0
2 parents 0da50de + 3b4487b
commit 0b237e0
Show file tree

Hide file tree

Showing 9 changed files with 1,008 additions and 1,180 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -26,8 +26,8 @@ click = "^8.1.3"
 dask = "^2023.3.2"
 textual = "^0.16.0"
 aiohttp = "^3.8.4"
-httpx = {extras = ["http2"], version = "^0.23.3"}
 xarray-datatree = "^0.0.12"
+pydantic-zarr = "^0.5.0"
 
 
 [tool.poetry.group.dev.dependencies]

diff --git a/src/fibsem_tools/io/multiscale.py b/src/fibsem_tools/io/multiscale.py
@@ -1,20 +1,21 @@
 from __future__ import annotations
-from typing import Any, Dict, Literal, Optional, Sequence, Tuple, Union, List
+from typing import Any, Literal, Optional, Sequence, Tuple, Union, List
 
 from xarray import DataArray
 
 import zarr
-from fibsem_tools.io.core import AccessMode, create_group
-from fibsem_tools.metadata.cosem import COSEMGroupMetadataV1, COSEMGroupMetadataV2
-from fibsem_tools.metadata.neuroglancer import NeuroglancerN5GroupMetadata
-from fibsem_tools.metadata.transform import STTransform
-from zarr.errors import ContainsGroupError
+from fibsem_tools.metadata.cosem import (
+    CosemMultiscaleGroupV1,
+    CosemMultiscaleGroupV2,
+)
+from fibsem_tools.metadata.neuroglancer import (
+    NeuroglancerN5Group,
+)
 from numcodecs.abc import Codec
 from xarray_ome_ngff.registry import get_adapters
+from pydantic_zarr import GroupSpec, ArraySpec
 
 
-from fibsem_tools.io.util import Attrs, JSON
-
 NGFF_DEFAULT_VERSION = "0.4"
 multiscale_metadata_types = ["neuroglancer", "cellmap", "cosem", "ome-ngff"]
 
@@ -33,7 +34,8 @@ def _normalize_chunks(
             if all_ints:
                 result = (chunks,) * len(arrays)
             else:
-                raise ValueError(f"All values in chunks must be ints. Got {chunks}")
+                msg = f"All values in chunks must be ints. Got {chunks}"
+                raise ValueError(msg)
         except TypeError as e:
             raise e
 
@@ -44,56 +46,56 @@ def _normalize_chunks(
     return result
 
 
-def multiscale_metadata(
+def multiscale_group(
     arrays: Sequence[DataArray],
     metadata_types: List[str],
-    array_paths: Optional[List[str]] = None,
-) -> Tuple[Dict[str, JSON], List[Dict[str, JSON]]]:
+    array_paths: Union[List[str], Literal["auto"]] = "auto",
+    name: Optional[str] = None,
+    **kwargs,
+) -> GroupSpec:
     """
     Generate multiscale metadata of the desired flavor from a list of DataArrays
 
     Returns
     -------
 
-    A tuple of dicts with string keys and JSON-serializable values
+    A GroupSpec instance representing the multiscale group
 
     """
+    if array_paths == "auto":
+        array_paths = [f"s{idx}" for idx in range(len(arrays))]
     group_attrs = {}
-    array_attrs: List[Dict[str, Any]] = [{}] * len(arrays)
+    array_attrs = {path: {} for path in array_paths}
+
     if any(f.startswith("ome-ngff") for f in metadata_types) and any(
         f.startswith("cosem") for f in metadata_types
     ):
-        raise ValueError(
-            f"""
+        msg = f"""
         You requested {metadata_types}, but ome-ngff metadata and cosem metadata are 
         incompatible. Use just ome-ngff metadata instead.
         """
-        )
+        raise ValueError(msg)
 
     for flavor in metadata_types:
         flave, _, version = flavor.partition("@")
+
         if flave == "neuroglancer":
-            g_meta = NeuroglancerN5GroupMetadata.fromDataArrays(arrays)
-            group_attrs.update(g_meta.dict())
+            g_spec = NeuroglancerN5Group.from_xarrays(arrays, **kwargs)
+            group_attrs.update(g_spec.attrs.dict())
         elif flave == "cosem":
             if version == "2":
-                g_meta = COSEMGroupMetadataV2.fromDataArrays(arrays, array_paths)
+                g_spec = CosemMultiscaleGroupV2.from_xarrays(
+                    arrays, paths=array_paths, name=name, **kwargs
+                )
             else:
-                g_meta = COSEMGroupMetadataV1.fromDataArrays(arrays, array_paths)
-            group_attrs.update(g_meta.dict())
-            for idx in range(len(array_attrs)):
-                array_attrs[idx] = {
-                    "transform": STTransform.fromDataArray(arrays[idx]).dict(),
-                    **array_attrs[idx],
-                }
-        elif flave == "ome-ngff":
-            if array_paths is None:
-                raise ValueError(
-                    f"""
-                You requested {flave}-type metadata, but array_paths was set to None.
-                array_paths must be set to a list of strings to use this metadata.
-                """
+                g_spec = CosemMultiscaleGroupV1.from_xarrays(
+                    arrays, paths=array_paths, name=name, **kwargs
                 )
+            group_attrs.update(g_spec.attrs.dict())
+
+            for key, value in g_spec.items.items():
+                array_attrs[key].update(**value.attrs.dict())
+        elif flave == "ome-ngff":
             if version == "":
                 version = NGFF_DEFAULT_VERSION
             adapters = get_adapters(version)
@@ -109,54 +111,12 @@ def multiscale_metadata(
                 {multiscale_metadata_types}
                 """
             )
-    return group_attrs, array_attrs
-
+    members = {
+        path: ArraySpec.from_array(arr, attrs=array_attrs[path], **kwargs)
+        for arr, path in zip(arrays, array_paths)
+    }
 
-def multiscale_group(
-    url: str,
-    arrays: List[DataArray],
-    array_paths: List[str],
-    chunks: Tuple[Tuple[int, ...], ...] | Tuple[int, ...] | None,
-    metadata_types: List[str],
-    group_mode: AccessMode = "w-",
-    array_mode: AccessMode = "w-",
-    group_attrs: Attrs | None = None,
-    array_attrs: Sequence[Attrs] | None = None,
-    **kwargs: Any,
-) -> zarr.Group:
-
-    if array_attrs is None:
-        array_attrs = [{}] * len(arrays)
-    if group_attrs is None:
-        group_attrs = {}
-
-    mgroup_attrs, marray_attrs = multiscale_metadata(
-        arrays, metadata_types, array_paths=array_paths
-    )
-    _group_attrs = {**group_attrs, **mgroup_attrs}
-    _arr_attrs = [{**a, **m} for a, m in zip(array_attrs, marray_attrs)]
-
-    _chunks = _normalize_chunks(arrays, chunks)
-    try:
-        group = create_group(
-            url,
-            arrays,
-            array_paths=array_paths,
-            chunks=_chunks,
-            group_attrs=_group_attrs,
-            array_attrs=_arr_attrs,
-            group_mode=group_mode,
-            array_mode=array_mode,
-            **kwargs,
-        )
-        return group
-    except ContainsGroupError:
-        raise FileExistsError(
-            f"""
-            The resource at {url} resolves to an existing group. Use 'w' or 'a' 
-            access modes to enable writable / appendable access to this group.
-            """
-        )
+    return GroupSpec(attrs=group_attrs, members=members)
 
 
 def prepare_multiscale(

diff --git a/src/fibsem_tools/metadata/cosem.py b/src/fibsem_tools/metadata/cosem.py
@@ -1,8 +1,8 @@
-from typing import Optional, Sequence
+from typing import Iterable, Literal, Optional, Sequence, Union
 
 from pydantic import BaseModel
 from xarray import DataArray
-
+from pydantic_zarr import GroupSpec, ArraySpec
 from fibsem_tools.metadata.transform import STTransform
 
 
@@ -13,26 +13,26 @@ class ScaleMetaV1(BaseModel):
 
 class MultiscaleMetaV1(BaseModel):
     name: Optional[str]
-    datasets: Sequence[ScaleMetaV1]
+    datasets: list[ScaleMetaV1]
 
 
 class MultiscaleMetaV2(BaseModel):
     name: Optional[str]
-    datasets: Sequence[str]
+    datasets: list[str]
 
 
 class COSEMGroupMetadataV1(BaseModel):
     """
     Multiscale metadata used by COSEM for multiscale datasets saved in N5/Zarr groups.
     """
 
-    multiscales: Sequence[MultiscaleMetaV1]
+    multiscales: list[MultiscaleMetaV1]
 
     @classmethod
-    def fromDataArrays(
+    def from_xarrays(
         cls,
         arrays: Sequence[DataArray],
-        paths: Sequence[str],
+        paths: Union[Sequence[str], Literal["auto"]] = "auto",
         name: Optional[str] = None,
     ):
         """
@@ -46,9 +46,10 @@ def fromDataArrays(
             arrays are assumed to share the same `dims` attributes, albeit with varying
             `coords`.
 
-        paths : list or tuple of str or None, default=None
+        paths : Sequence of str or the string literal 'auto', default='auto'
             The name on the storage backend for each of the arrays in the multiscale
-            collection.
+            collection. If 'auto', then names will be automatically generated using the
+            format s0, s1, s2, etc
 
         name : str, optional
             The name for the multiresolution collection
@@ -60,13 +61,14 @@ def fromDataArrays(
         COSEMGroupMetadata
         """
 
+        if paths == "auto":
+            paths = [f"s{idx}" for idx in range(len(arrays))]
+
         multiscales = [
             MultiscaleMetaV1(
                 name=name,
                 datasets=[
-                    ScaleMetaV1(
-                        path=path, transform=STTransform.fromDataArray(array=arr)
-                    )
+                    ScaleMetaV1(path=path, transform=STTransform.from_xarray(array=arr))
                     for path, arr in zip(paths, arrays)
                 ],
             )
@@ -79,13 +81,13 @@ class COSEMGroupMetadataV2(BaseModel):
     Multiscale metadata used by COSEM for multiscale datasets saved in N5/Zarr groups.
     """
 
-    multiscales: Sequence[MultiscaleMetaV2]
+    multiscales: list[MultiscaleMetaV2]
 
     @classmethod
-    def fromDataArrays(
+    def from_xarrays(
         cls,
         arrays: Sequence[DataArray],
-        paths: Sequence[str],
+        paths: Union[Sequence[str], Literal["auto"]] = "auto",
         name: Optional[str] = None,
     ):
         """
@@ -111,6 +113,8 @@ def fromDataArrays(
 
         COSEMGroupMetadata
         """
+        if paths == "auto":
+            paths = [f"s{idx}" for idx in range(len(arrays))]
 
         multiscales = [
             MultiscaleMetaV2(
@@ -119,3 +123,135 @@ def fromDataArrays(
             )
         ]
         return cls(name=name, multiscales=multiscales, paths=paths)
+
+
+class CosemArrayAttrs(BaseModel):
+    """
+    Attributes attached to a single array in a COSEM multiscale group.
+    """
+
+    transform: STTransform
+
+
+class CosemMultiscaleArray(ArraySpec):
+    """
+    An ArraySpec whose attributes are a CosemArrayAttrs instance.
+    """
+
+    attrs: CosemArrayAttrs
+
+    @classmethod
+    def from_xarray(cls, array: DataArray, **kwargs):
+        """
+        Build an array spec from a DataArray. The `transform` attribute is
+        generated by `STTransform.from_xarray`; `**kwargs` are forwarded to
+        the parent class's `from_array`.
+        """
+        attrs = CosemArrayAttrs(transform=STTransform.from_xarray(array))
+        return super().from_array(array, attrs=attrs, **kwargs)
+
+
+class CosemMultiscaleGroupV1(GroupSpec):
+    """
+    A GroupSpec pairing COSEMGroupMetadataV1 group attributes with one
+    CosemMultiscaleArray per scale level.
+    """
+
+    attrs: COSEMGroupMetadataV1
+    items: dict[str, CosemMultiscaleArray]
+
+    @classmethod
+    def from_xarrays(
+        cls,
+        arrays: Iterable[DataArray],
+        paths: Union[Sequence[str], Literal["auto"]] = "auto",
+        name: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Build the group spec from a collection of DataArrays.
+
+        Parameters
+        ----------
+
+        arrays : Iterable of DataArray
+            The arrays in the multiscale collection.
+
+        paths : Sequence of str or the string literal 'auto', default='auto'
+            The name of each array within the group. If 'auto', names are
+            generated using the format s0, s1, s2, etc
+
+        name : str, optional
+            The name for the multiresolution collection
+
+        **kwargs
+            Forwarded to `CosemMultiscaleArray.from_xarray` for every array.
+        """
+        # `arrays` is only required to be iterable, but it is measured with
+        # len() and traversed twice below, so materialize it first.
+        arrays = tuple(arrays)
+
+        if paths == "auto":
+            paths = [f"s{idx}" for idx in range(len(arrays))]
+
+        attrs = COSEMGroupMetadataV1.from_xarrays(arrays, paths, name)
+
+        array_specs = {
+            k: CosemMultiscaleArray.from_xarray(arr, **kwargs)
+            for k, arr in zip(paths, arrays)
+        }
+
+        return cls(attrs=attrs, items=array_specs)
+
+
+class CosemMultiscaleGroupV2(GroupSpec):
+    """
+    A GroupSpec pairing COSEMGroupMetadataV2 group attributes with one
+    array spec (carrying CosemArrayAttrs) per scale level.
+    """
+
+    attrs: COSEMGroupMetadataV2
+    items: dict[str, ArraySpec[CosemArrayAttrs]]
+
+    @classmethod
+    def from_xarrays(
+        cls,
+        arrays: Iterable[DataArray],
+        paths: Union[Sequence[str], Literal["auto"]] = "auto",
+        name: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Build the group spec from a collection of DataArrays.
+
+        Parameters
+        ----------
+
+        arrays : Iterable of DataArray
+            The arrays in the multiscale collection.
+
+        paths : Sequence of str or the string literal 'auto', default='auto'
+            The name of each array within the group. If 'auto', names are
+            generated using the format s0, s1, s2, etc
+
+        name : str, optional
+            The name for the multiresolution collection
+
+        **kwargs
+            Forwarded to `CosemMultiscaleArray.from_xarray` for every array.
+        """
+        # `arrays` is only required to be iterable, but it is measured with
+        # len() and traversed twice below, so materialize it first.
+        arrays = tuple(arrays)
+
+        if paths == "auto":
+            paths = [f"s{idx}" for idx in range(len(arrays))]
+
+        attrs = COSEMGroupMetadataV2.from_xarrays(arrays, paths, name)
+
+        array_specs = {
+            k: CosemMultiscaleArray.from_xarray(arr, **kwargs)
+            for k, arr in zip(paths, arrays)
+        }
+
+        return cls(attrs=attrs, items=array_specs)