Commit

Merge pull request #463 from ljwoods2/main

Translate h5py soft and hard linked datasets with an optional kwarg

martindurant authored Jul 1, 2024
2 parents ae692fe + 3f7cacb commit 061bf98
Showing 3 changed files with 72 additions and 8 deletions.
45 changes: 41 additions & 4 deletions kerchunk/hdf.py
@@ -113,22 +113,39 @@ def __init__(
        self.error = error
        lggr.debug(f"HDF5 file URI: {self._uri}")

    def translate(self):
    def translate(self, preserve_linked_dsets=False):
        """Translate content of one HDF5 file into Zarr storage format.

        This method is the main entry point to execute the workflow, and
        returns a "reference" structure to be used with zarr/kerchunk.

        No data is copied out of the HDF5 file.

        Parameters
        ----------
        preserve_linked_dsets : bool (optional, default False)
            If True, translate HDF5 soft and hard links for each `h5py.Dataset`
            into the reference structure. Requires h5py version 3.11.0 or later.
            Will not translate external links or links to `h5py.Group` objects.

        Returns
        -------
        dict
            Dictionary containing reference structure.
        """
lggr.debug("Translation begins")
self._transfer_attrs(self._h5f, self._zroot)

self._h5f.visititems(self._translator)

if preserve_linked_dsets:
if not has_visititems_links():
raise RuntimeError(
"'preserve_linked_dsets' kwarg requires h5py 3.11.0 or later "
f"is installed, found {h5py.__version__}"
)
self._h5f.visititems_links(self._translator)

if self.spec < 1:
return self.store
elif isinstance(self.store, LazyReferenceMapper):
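
For a sense of the caller's side of the new kwarg, here is a minimal usage sketch (the input file name is hypothetical; the single-path constructor and the kwarg mirror the test added further down):

# Minimal usage sketch; "data_with_links.h5" is a hypothetical input file.
from kerchunk.hdf import SingleHdf5ToZarr

# With preserve_linked_dsets=True, soft/hard-linked datasets also appear
# in the output references under their linked names.
refs = SingleHdf5ToZarr("data_with_links.h5").translate(preserve_linked_dsets=True)
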
@@ -247,10 +264,26 @@ def _decode_filters(self, h5obj: Union[h5py.Dataset, h5py.Group]):
                )
        return filters

    def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
    def _translator(
        self,
        name: str,
        h5obj: Union[
            h5py.Dataset, h5py.Group, h5py.SoftLink, h5py.HardLink, h5py.ExternalLink
        ],
    ):
        """Produce Zarr metadata for all groups and datasets in the HDF5 file."""
        try:  # method must not raise exception
            kwargs = {}

            if isinstance(h5obj, (h5py.SoftLink, h5py.HardLink)):
                h5obj = self._h5f[name]
                if isinstance(h5obj, h5py.Group):
                    # continues iteration of visititems_links
                    lggr.debug(
                        f"Skipping translation of HDF5 linked group: '{h5obj.name}'"
                    )
                    return None

            if isinstance(h5obj, h5py.Dataset):
                lggr.debug(f"HDF5 dataset: {h5obj.name}")
                lggr.debug(f"HDF5 compression: {h5obj.compression}")
@@ -432,7 +465,7 @@ def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):
                )

                # Create a Zarr array equivalent to this HDF5 dataset...
                za = self._zroot.create_dataset(
                za = self._zroot.require_dataset(
                    h5obj.name,
                    shape=h5obj.shape,
                    dtype=dt or h5obj.dtype,
@@ -480,7 +513,7 @@ def _translator(self, name: str, h5obj: Union[h5py.Dataset, h5py.Group]):

            elif isinstance(h5obj, h5py.Group):
                lggr.debug(f"HDF5 group: {h5obj.name}")
                zgrp = self._zroot.create_group(h5obj.name)
                zgrp = self._zroot.require_group(h5obj.name)
                self._transfer_attrs(h5obj, zgrp)
        except Exception as e:
            import traceback
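
The switch from `create_*` to `require_*` is what lets `_translator` run twice over the same file (once via `visititems`, once via `visititems_links`) without erroring on names it already created: `require_dataset`/`require_group` return the existing node when it is compatible. A quick illustration of that zarr behaviour:

# Sketch: require_dataset is idempotent where create_dataset is not.
import zarr

root = zarr.group()  # in-memory group
first = root.require_dataset("lat", shape=(4,), dtype="f8")
again = root.require_dataset("lat", shape=(4,), dtype="f8")
assert first.name == again.name  # second call returns the existing array
try:
    root.create_dataset("lat", shape=(4,), dtype="f8")
except Exception:
    print("create_dataset raises if the node already exists")
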
@@ -639,3 +672,7 @@ def _is_netcdf_datetime(dataset: h5py.Dataset):

def _is_netcdf_variable(dataset: h5py.Dataset):
    return any("_Netcdf4" in _ for _ in dataset.attrs)


def has_visititems_links():
    return hasattr(h5py.Group, "visititems_links")
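
The helper detects the capability directly rather than comparing version strings. For contrast, a version-string guard (not used by this change, and shown only as an assumed alternative) might look like:

# Assumed alternative, not part of this change: version-string comparison.
from packaging.version import Version
import h5py

def has_visititems_links_by_version():
    # hasattr-based detection (as above) also covers backports and avoids
    # parsing pre-release version strings.
    return Version(h5py.__version__) >= Version("3.11.0")
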
Binary file added kerchunk/tests/air_linked.nc
Binary file not shown.
35 changes: 31 additions & 4 deletions kerchunk/tests/test_hdf.py
@@ -6,8 +6,9 @@
import pytest
import xarray as xr
import zarr
import h5py

from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.hdf import SingleHdf5ToZarr, has_visititems_links
from kerchunk.combine import MultiZarrToZarr, drop

here = osp.dirname(__file__)
@@ -92,9 +93,11 @@ def test_multizarr(generate_mzz):
    assert set(ds) == set(expected)
    for name in ds:
        exp = {
            k: (v.tolist() if v.size > 1 else v[0])
            if isinstance(v, np.ndarray)
            else v
            k: (
                (v.tolist() if v.size > 1 else v[0])
                if isinstance(v, np.ndarray)
                else v
            )
            for k, v in expected[name].attrs.items()
        }
        assert dict(ds[name].attrs) == dict(exp)
@@ -331,3 +334,27 @@ def test_inline_threshold():
        fn, inline_threshold=1e9
    ).translate()
    assert inline_0 != inline_1_million


@pytest.mark.skipif(
    not has_visititems_links(),
    reason="'h5py.Group.visititems_links' requires h5py 3.11.0 or later",
)
def test_translate_links():
    fn = osp.join(here, "air_linked.nc")
    # choose a threshold that will give both inline and non-inline
    # datasets for maximum test coverage
    out = kerchunk.hdf.SingleHdf5ToZarr(fn, inline_threshold=50).translate(
        preserve_linked_dsets=True
    )
    fs = fsspec.filesystem("reference", fo=out)
    z = zarr.open(fs.get_mapper())

    # 1. Test the hard linked datasets were translated correctly
    # 2. Test the soft linked datasets were translated correctly
    for link in ("hard", "soft"):
        for dset in ("lat", "time"):
            np.testing.assert_allclose(z[dset], z[f"{dset}_{link}"])
            for key in z[f"{dset}_{link}"].attrs.keys():
                if key not in kerchunk.hdf._HIDDEN_ATTRS and key != "_ARRAY_DIMENSIONS":
                    assert z[f"{dset}_{link}"].attrs[key] == z[dset].attrs[key]
