diff --git a/Changelog.rst b/Changelog.rst index 4b1948b024..7a6bdf8505 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -1,13 +1,29 @@ -Version NEXTRELEASE +Version NEXTVERSION ------------------- **2024-??-??** * Upgrades to allow cfdm to work with Python 3.12 (https://github.com/NCAS-CMS/cfdm/issues/302) +* New function `cfdm.netcdf_flatten` that replaces the import of + `netcdf_flattener` (https://github.com/NCAS-CMS/cfdm/issues/286) +* New function `cfdm.netcdf_indexer` that applies netCDF masking and + unpacking to arbitrary arrays + (https://github.com/NCAS-CMS/cfdm/issues/285) +* Allow access to netCDF-4 files in S3 object stores + (https://github.com/NCAS-CMS/cfdm/issues/285) +* Allow a choice of netCDF engines + (https://github.com/NCAS-CMS/cfdm/issues/285) * Fix bug that caused `cfdm.write` to fail when a parametric Z dimension coordinate did not have a ``computed_standard_name`` attribute (https://github.com/NCAS-CMS/cfdm/issues/303) +* New class `cfdm.H5netcdfArray` +* New class `cfdm.NetCDF4Array` +* New dependency: ``h5netcdf>=1.3.0`` +* New dependency: ``h5py>=3.10.0`` +* New dependency: ``s3fs>=2024.6.0`` +* New dependency: ``dask>=2024.6.0`` +* Removed dependency: ``netcdf_flattener`` ---- diff --git a/README.md b/README.md index c028db3307..9bd31953ec 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,8 @@ inspecting it: The ``cfdm`` package can: -* read field and domain constructs from netCDF and CDL datasets, +* read field and domain constructs from netCDF and CDL datasets with a + choice of netCDF backends, * create new field and domain constructs in memory, * write and append field and domain constructs to netCDF datasets on disk, * read, write, and manipulate UGRID mesh topologies, diff --git a/cfdm/__init__.py b/cfdm/__init__.py index c108e0d7ef..1382bb880b 100644 --- a/cfdm/__init__.py +++ b/cfdm/__init__.py @@ -48,16 +48,22 @@ __cf_version__ = core.__cf_version__ __version__ = core.__version__ -_requires = ("cftime", "netcdf_flattener", "scipy") +_requires = core._requires + ( + "cftime", + "netCDF4", + "scipy", + "h5netcdf", + "s3fs", +) _error0 = f"cfdm requires the modules {', '.join(_requires)}. " +# Check the version of cftime try: import cftime except ImportError as error1: raise ImportError(_error0 + str(error1)) -# Check the version of cftime _minimum_vn = "1.6.0" if Version(cftime.__version__) < Version(_minimum_vn): raise ValueError( @@ -65,32 +71,82 @@ f"Got {cftime.__version__} at {cftime.__file__}" ) +# Check the version of netCDF4 try: - import netcdf_flattener + import netCDF4 except ImportError as error1: raise ImportError(_error0 + str(error1)) -# Check the version of netcdf_flattener -_minimum_vn = "1.2.0" -if Version(netcdf_flattener.__version__) < Version(_minimum_vn): +_minimum_vn = "1.5.4" +if Version(netCDF4.__version__) < Version(_minimum_vn): raise ValueError( - f"Bad netcdf_flattener version: cfdm requires " - f"netcdf_flattener>={_minimum_vn}. Got {netcdf_flattener.__version__} " - f"at {netcdf_flattener.__file__}" + f"Bad netCDF4 version: cfdm requires netCDF4>={_minimum_vn}. " + f"Got {netCDF4.__version__} at {netCDF4.__file__}" ) +# Check the version of h5netcdf try: - import scipy + import h5netcdf except ImportError as error1: raise ImportError(_error0 + str(error1)) +_minimum_vn = "1.3.0" +if Version(h5netcdf.__version__) < Version(_minimum_vn): + raise ValueError( + f"Bad h5netcdf version: cfdm requires h5netcdf>={_minimum_vn}. " f"Got {h5netcdf.__version__} at {h5netcdf.__file__}" + ) + +# Check the version of h5py +try: + import h5py +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + +_minimum_vn = "3.10.0" +if Version(h5py.__version__) < Version(_minimum_vn): + raise ValueError( + f"Bad h5py version: cfdm requires h5py>={_minimum_vn}. " + f"Got {h5py.__version__} at {h5py.__file__}" + ) + +# Check the version of s3fs +try: + import s3fs +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + +_minimum_vn = "2024.6.0" +if Version(s3fs.__version__) < Version(_minimum_vn): + raise ValueError( + f"Bad s3fs version: cfdm requires s3fs>={_minimum_vn}. " + f"Got {s3fs.__version__} at {s3fs.__file__}" + ) + # Check the version of scipy +try: + import scipy +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + _minimum_vn = "1.10.0" if Version(scipy.__version__) < Version(_minimum_vn): raise ValueError( - f"Bad scipy version: cfdm requires " - f"scipy>={_minimum_vn}. Got {scipy.__version__} " - f"at {scipy.__file__}" + f"Bad scipy version: cfdm requires scipy>={_minimum_vn}. " + f"Got {scipy.__version__} at {scipy.__file__}" + ) + +# Check the version of dask +try: + import dask +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + +_minimum_vn = "2024.6.0" +if Version(dask.__version__) < Version(_minimum_vn): + raise ValueError( + f"Bad dask version: cfdm requires dask>={_minimum_vn}. " + f"Got {dask.__version__} at {dask.__file__}" ) from .constants import masked @@ -140,7 +196,10 @@ CompressedArray, Data, GatheredArray, + H5netcdfArray, NetCDFArray, + NetCDF4Array, + netcdf_indexer, NumpyArray, PointTopologyArray, RaggedArray, @@ -196,6 +255,7 @@ from .cfdmimplementation import CFDMImplementation, implementation from .read_write import read, write +from .read_write.netcdf.flatten import netcdf_flatten from .examplefield import example_field, example_fields, example_domain diff --git a/cfdm/abstract/implementation.py b/cfdm/abstract/implementation.py index 99547195ba..c168647105 100644 --- a/cfdm/abstract/implementation.py +++ b/cfdm/abstract/implementation.py @@ -54,7 +54,7 @@ def classes(self): 'Index', 'InteriorRing', 'List', - 'NetCDFArray', + 'NetCDF4Array', 'NodeCountProperties', 'PartNodeCountProperties', 'RaggedContiguousArray', diff --git a/cfdm/cfdmimplementation.py b/cfdm/cfdmimplementation.py index 6658b1761c..2e3b45ad27 100644 --- a/cfdm/cfdmimplementation.py +++ b/cfdm/cfdmimplementation.py @@ -30,7 +30,8 @@ CellConnectivityArray, Data, GatheredArray, - NetCDFArray, + H5netcdfArray, + NetCDF4Array, PointTopologyArray, RaggedContiguousArray, RaggedIndexedArray, @@ -1358,10 +1359,10 @@ def get_data_maximum(self, parent): :Returns: - Data instance + Scalar `Data` instance """ - return parent.data.maximum() + return parent.data.maximum(squeeze=True) def get_data_sum(self, parent): """Return the sum of the data. @@ -1372,10 +1373,10 @@ def get_data_sum(self, parent): :Returns: - Data instance + Scalar `Data` instance """ - return parent.data.sum() + return parent.data.sum(squeeze=True) def get_count(self, construct): """Return the count variable of compressed data. @@ -2291,67 +2292,41 @@ def initialise_TiePointIndex(self): cls = self.get_class("TiePointIndex") return cls() - def initialise_NetCDFArray( - self, - filename=None, - address=None, - dtype=None, - shape=None, - mask=True, - units=False, - calendar=None, - missing_values=None, - ): - """Return a netCDF array instance. 
+ def initialise_NetCDF4Array(self, **kwargs): + """Return a `NetCDF4Array` instance. :Parameters: - filename: `str` - - address: `str` - - dytpe: `numpy.dtype` + kwargs: optional + Initialisation parameters to pass to the new instance. - shape: sequence of `int`, optional + .. versionadded:: (cfdm) NEXTVERSION - mask: `bool`, optional + :Returns: - units: `str` or `None` or False, optional - The units of the netCDF variable. Set to `None` to - indicate that there are no units. If False (the - default) then the units are considered unset. + `NetCDF4Array` - .. versionadded:: (cfdm) 1.10.0.2 + """ + cls = self.get_class("NetCDF4Array") + return cls(**kwargs) - calendar: `str` or `None`, optional - The calendar of the netCDF variable. By default, or if - set to `None`, then the CF default calendar is - assumed, if applicable. + def initialise_H5netcdfArray(self, **kwargs): + """Return an `H5netcdfArray` instance. - .. versionadded:: (cfdm) 1.10.0.2 + .. versionadded:: (cfdm) NEXTVERSION - missing_values: `dict`, optional - The missing value indicators defined by the netCDF - variable attributes. + :Parameters: - .. versionadded:: (cfdm) 1.10.0.3 + kwargs: optional + Initialisation parameters to pass to the new instance. :Returns: - `NetCDFArray` + `H5netcdfArray` """ - cls = self.get_class("NetCDFArray") - return cls( - filename=filename, - address=address, - dtype=dtype, - shape=shape, - mask=mask, - units=units, - calendar=calendar, - missing_values=missing_values, - ) + cls = self.get_class("H5netcdfArray") + return cls(**kwargs) def initialise_BoundsFromNodesArray(self, **kwargs): """Return a node bounds array. @@ -3707,7 +3682,8 @@ def squeeze(self, construct, axes=None): Data=Data, BoundsFromNodesArray=BoundsFromNodesArray, GatheredArray=GatheredArray, - NetCDFArray=NetCDFArray, + H5netcdfArray=H5netcdfArray, + NetCDF4Array=NetCDF4Array, PointTopologyArray=PointTopologyArray, RaggedContiguousArray=RaggedContiguousArray, RaggedIndexedArray=RaggedIndexedArray, @@ -3750,7 +3726,8 @@ def implementation(): 'Datum': <class 'cfdm.datum.Datum'>, 'Data': <class 'cfdm.data.data.Data'>, 'GatheredArray': <class 'cfdm.data.gatheredarray.GatheredArray'>, - 'NetCDFArray': <class 'cfdm.data.netcdfarray.NetCDFArray'>, + 'H5netcdfArray': <class 'cfdm.data.h5netcdfarray.H5netcdfArray'>, + 'NetCDF4Array': <class 'cfdm.data.netcdf4array.NetCDF4Array'>, 'PointTopologyArray': <class 'cfdm.data.pointtopologyarray.PointTopologyArray'>, 'RaggedContiguousArray': <class 'cfdm.data.raggedcontiguousarray.RaggedContiguousArray'>, 'RaggedIndexedArray': <class 'cfdm.data.raggedindexedarray.RaggedIndexedArray'>, diff --git a/cfdm/core/__init__.py b/cfdm/core/__init__.py index e6b52d7bbc..5e01c01368 100644 --- a/cfdm/core/__init__.py +++ b/cfdm/core/__init__.py @@ -11,9 +11,9 @@ """ -__date__ = "2024-03-01" +__date__ = "2024-??-??" __cf_version__ = "1.11" -__version__ = "1.11.1.0" +__version__ = "1.11.2.0" from packaging import __version__ as _packaging_ver from packaging import __file__ as _packaging_file @@ -21,20 +21,10 @@ import platform -_requires = ("numpy", "netCDF4", "packaging") +_requires = ("numpy", "packaging") _error0 = f"cfdm.core requires the modules {', '.join(_requires)}. " -try: - import netCDF4 -except ImportError as error1: - raise ImportError(_error0 + str(error1)) - -try: - import numpy as np -except ImportError as error1: - raise ImportError(_error0 + str(error1)) - # Check the version of python _minimum_vn = "3.8.0" if Version(platform.python_version()) < Version(_minimum_vn): raise ValueError( @@ -44,22 +34,24 @@ ) # Check the version of packaging +try: + import packaging +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + _minimum_vn = "20.0" if Version(_packaging_ver) < Version(_minimum_vn): raise ValueError( - f"Bad packaging version: cfdm requires packaging>={_minimum_vn}. " + f"Bad packaging version: cfdm.core requires packaging>={_minimum_vn}. 
" f"Got {_packaging_ver} at {_packaging_file}" ) -# Check the version of netCDF4 -_minimum_vn = "1.5.4" -if Version(netCDF4.__version__) < Version(_minimum_vn): - raise ValueError( - f"Bad netCDF4 version: cfdm.core requires netCDF4>={_minimum_vn}. " - f"Got {netCDF4.__version__} at {netCDF4.__file__}" - ) - # Check the version of numpy +try: + import numpy as np +except ImportError as error1: + raise ImportError(_error0 + str(error1)) + _minimum_vn = "1.15" if Version(np.__version__) < Version(_minimum_vn): raise ValueError( diff --git a/cfdm/core/data/data.py b/cfdm/core/data/data.py index a08b5d8686..3056e3487d 100644 --- a/cfdm/core/data/data.py +++ b/cfdm/core/data/data.py @@ -894,7 +894,7 @@ def source(self, default=ValueError()): >>> f = {{package}}.read('file.nc')[0] >>> d = f.data >>> d.source() - <{{repr}}NetCDFArray(149, 182): file=file.nc variable=latitude> + <{{repr}}NetCDF4Array(149, 182): file=file.nc variable=latitude> """ return self._get_component("array", default=default) diff --git a/cfdm/core/functions.py b/cfdm/core/functions.py index 4038aa2ecf..65f5daa364 100644 --- a/cfdm/core/functions.py +++ b/cfdm/core/functions.py @@ -3,9 +3,6 @@ import sys from pickle import dumps, loads -import netCDF4 -import numpy as np - from . import __cf_version__, __file__, __version__ @@ -34,32 +31,24 @@ def environment(display=True, paths=True): **Examples** - >>> environment() - - Platform: Linux-5.14.0-1048-oem-x86_64-with-glibc2.31 - HDF5 library: 1.12.1 - netcdf library: 4.8.1 - Python: 3.9.12 /home/user/miniconda3/bin/python - netCDF4: 1.6.0 /home/user/miniconda3/lib/python3.9/site-packages/netCDF4/__init__.py - numpy: 1.22.3 /home/user/miniconda3/lib/python3.9/site-packages/numpy/__init__.py - cfdm.core: 1.10.0.0 /home/user/miniconda3/lib/python3.9/site-packages/cfdm/core/__init__.py - - >>> environment(paths=False) - Platform: Linux-5.14.0-1048-oem-x86_64-with-glibc2.31 - HDF5 library: 1.12.1 - netcdf library: 4.8.1 - Python: 3.9.12 - netCDF4: 1.6.0 - numpy: 1.22.3 - cfdm.core: 1.10.0.0 + >>> cfdm.core.environment(paths=False) + Platform: Linux-5.15.0-92-generic-x86_64-with-glibc2.35 + Python: 3.11.4 + packaging: 23.0 + numpy: 1.25.2 + cfdm.core: NEXTVERSION """ + import numpy as np + import packaging + dependency_version_paths_mapping = { "Platform": (platform.platform(), ""), - "HDF5 library": (netCDF4.__hdf5libversion__, ""), - "netcdf library": (netCDF4.__netcdf4libversion__, ""), "Python": (platform.python_version(), sys.executable), - "netCDF4": (netCDF4.__version__, os.path.abspath(netCDF4.__file__)), + "packaging": ( + packaging.__version__, + os.path.abspath(packaging.__file__), + ), "numpy": (np.__version__, os.path.abspath(np.__file__)), "cfdm.core": (__version__, os.path.abspath(__file__)), } diff --git a/cfdm/data/__init__.py b/cfdm/data/__init__.py index 23641aa370..22a7835398 100644 --- a/cfdm/data/__init__.py +++ b/cfdm/data/__init__.py @@ -18,7 +18,10 @@ from .boundsfromnodesarray import BoundsFromNodesArray from .cellconnectivityarray import CellConnectivityArray from .gatheredarray import GatheredArray +from .h5netcdfarray import H5netcdfArray from .netcdfarray import NetCDFArray +from .netcdf4array import NetCDF4Array +from .netcdfindexer import netcdf_indexer from .numpyarray import NumpyArray from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray diff --git a/cfdm/data/abstract/compressedarray.py b/cfdm/data/abstract/compressedarray.py index da5b1e67e3..c8c387c86f 100644 --- 
a/cfdm/data/abstract/compressedarray.py +++ b/cfdm/data/abstract/compressedarray.py @@ -1,5 +1,6 @@ import numpy as np +from ..netcdfindexer import netcdf_indexer from .array import Array @@ -189,10 +190,15 @@ def __getitem__(self, indices): ) u[u_indices] = subarray[...] - if indices is Ellipsis: - return u - - return self.get_subspace(u, indices, copy=True) + u = netcdf_indexer( + u, + mask=False, + unpack=False, + always_masked_array=False, + orthogonal_indexing=True, + copy=False, + ) + return u[indices] def _first_or_last_element(self, indices): """Return the first or last element of the compressed array. diff --git a/cfdm/data/abstract/mesharray.py b/cfdm/data/abstract/mesharray.py index 0599540f80..0c7831c424 100644 --- a/cfdm/data/abstract/mesharray.py +++ b/cfdm/data/abstract/mesharray.py @@ -3,6 +3,7 @@ import numpy as np +from ..netcdfindexer import netcdf_indexer from .compressedarray import CompressedArray @@ -148,10 +149,15 @@ def __getitem__(self, indices): # future reference. self._set_component("shape", u.shape, copy=False) - if indices is Ellipsis: - return u - - return self.get_subspace(u, indices, copy=False) + u = netcdf_indexer( + u, + mask=False, + unpack=False, + always_masked_array=False, + orthogonal_indexing=True, + copy=False, + ) + return u[indices] @property def dtype(self): diff --git a/cfdm/data/data.py b/cfdm/data/data.py index e14f7c6cd8..56f1db18b5 100644 --- a/cfdm/data/data.py +++ b/cfdm/data/data.py @@ -2194,7 +2194,7 @@ def _parse_indices(self, indices): return parsed_indices - def maximum(self, axes=None): + def maximum(self, axes=None, squeeze=False): """Return the maximum of an array or the maximum along axes. Missing data array elements are omitted from the calculation. @@ -2211,6 +2211,14 @@ def maximum(self, axes=None): {{axes int examples}} + squeeze: `bool`, optional + If this is set to False, the default, the axes which + are reduced are left in the result as dimensions with + size one. With this option, the result will broadcast + correctly against the original data. + + .. versionadded:: (cfdm) NEXTVERSION + :Returns: `{{class}}` @@ -2255,7 +2263,7 @@ def maximum(self, axes=None): raise ValueError(f"Can't find maximum of data: {error}") array = self.array - array = np.amax(array, axis=axes, keepdims=True) + array = np.amax(array, axis=axes, keepdims=not squeeze) out = self.copy(array=False) out._set_Array(array, copy=False) @@ -2414,7 +2422,7 @@ def squeeze(self, axes=None, inplace=False): return d - def sum(self, axes=None): + def sum(self, axes=None, squeeze=False): """Return the sum of an array or the sum along axes. Missing data array elements are omitted from the calculation. @@ -2429,6 +2437,14 @@ def sum(self, axes=None): {{axes int examples}} + squeeze: `bool`, optional + If this is set to False, the default, the axes which + are reduced are left in the result as dimensions with + size one. With this option, the result will broadcast + correctly against the original data. + + .. 
versionadded:: (cfdm) NEXTVERSION + :Returns: `{{class}}` @@ -2471,8 +2487,9 @@ def sum(self, axes=None): axes = self._parse_axes(axes) except ValueError as error: raise ValueError(f"Can't sum data: {error}") + array = self.array - array = np.sum(array, axis=axes, keepdims=True) + array = np.sum(array, axis=axes, keepdims=not squeeze) d = self.copy(array=False) d._set_Array(array, copy=False) diff --git a/cfdm/data/h5netcdfarray.py b/cfdm/data/h5netcdfarray.py new file mode 100644 index 0000000000..b2dd4d2dc7 --- /dev/null +++ b/cfdm/data/h5netcdfarray.py @@ -0,0 +1,287 @@ +import logging + +import h5netcdf +import netCDF4 + +from . import abstract +from .mixin import FileArrayMixin, NetCDFFileMixin +from .netcdfindexer import netcdf_indexer + +_safecast = netCDF4.utils._safecast +default_fillvals = netCDF4.default_fillvals.copy() +default_fillvals["O"] = default_fillvals["S1"] + +logger = logging.getLogger(__name__) + + +class H5netcdfArray(NetCDFFileMixin, FileArrayMixin, abstract.Array): + """A netCDF array accessed with `h5netcdf`. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + mask=True, + unpack=True, + attributes=None, + storage_options=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of) `str`, optional + The name of the file(s) containing the array. + + address: (sequence of) `str`, optional + The identity of the variable in each file defined by + *filename*. Must be a netCDF variable name. + + dtype: `numpy.dtype` + The data type of the array in the file. May be `None` + if the numpy data-type is not known (which can be the + case for string types, for example). + + shape: `tuple` + The array dimension sizes in the file. + + {{init mask: `bool`, optional}} + + {{init unpack: `bool`, optional}} + + {{init attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the netCDF variable during + the first `__getitem__` call. + + .. 
versionadded:: (cfdm) NEXTVERSION + + {{init storage_options: `dict` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__(source=source, copy=copy) + + if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + filename = source._get_component("filename", None) + except AttributeError: + filename = None + + try: + address = source._get_component("address", None) + except AttributeError: + address = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + mask = source._get_component("mask", True) + except AttributeError: + mask = True + + try: + unpack = source._get_component("unpack", True) + except AttributeError: + unpack = True + + try: + attributes = source._get_component("attributes", None) + except AttributeError: + attributes = None + + try: + storage_options = source._get_component( + "storage_options", None + ) + except AttributeError: + storage_options = None + + if shape is not None: + self._set_component("shape", shape, copy=False) + + if filename is not None: + if isinstance(filename, str): + filename = (filename,) + else: + filename = tuple(filename) + + self._set_component("filename", filename, copy=False) + + if address is not None: + if isinstance(address, (str, int)): + address = (address,) + else: + address = tuple(address) + + self._set_component("address", address, copy=False) + + self._set_component("dtype", dtype, copy=False) + self._set_component("mask", bool(mask), copy=False) + self._set_component("unpack", bool(unpack), copy=False) + self._set_component("storage_options", storage_options, copy=False) + self._set_component("attributes", attributes, copy=False) + + # By default, close the file after data array access + self._set_component("close", True, copy=False) + + def __getitem__(self, indices): + """Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + .. versionadded:: (cfdm) NEXTVERSION + + """ + dataset, address = self.open() + dataset0 = dataset + + groups, address = self.get_groups(address) + if groups: + dataset = self._group(dataset, groups) + + # Get the variable by netCDF name + variable = dataset.variables[address] + + # Get the data, applying masking and scaling as required. + array = netcdf_indexer( + variable, + mask=self.get_mask(), + unpack=self.get_unpack(), + always_masked_array=False, + orthogonal_indexing=True, + copy=False, + ) + array = array[indices] + + # Set the attributes, if they haven't been set already. + self._set_attributes(variable) + + self.close(dataset0) + del dataset, dataset0 + + return array + + def _set_attributes(self, var): + """Set the netCDF variable attributes. + + These are set from the netCDF variable attributes, but only if + they have not already been defined, either during `{{class}}` + instantiation or by a previous call to `_set_attributes`. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + var: `h5netcdf.Variable` + The netCDF variable. + + :Returns: + + `None` + + """ + if self._get_component("attributes", None) is not None: + return + + self._set_component("attributes", dict(var.attrs), copy=False) + + def close(self, dataset): + """Close the dataset containing the data. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dataset: `h5netcdf.File` + The netCDF dataset to be closed. 
+ + :Returns: + + `None` + + """ + if self._get_component("close"): + dataset.close() + + def get_groups(self, address): + """The netCDF4 group structure of a netCDF variable. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + address: `str` or `int` + The netCDF variable name, or integer varid, from which + to get the groups. + + :Returns: + + (`list`, `str`) or (`list`, `int`) + The group structure and the name within the group. If + *address* is a varid then an empty list and the varid + are returned. + + **Examples** + + >>> n.get_groups('tas') + ([], 'tas') + + >>> n.get_groups('/tas') + ([], 'tas') + + >>> n.get_groups('/data/model/tas') + (['data', 'model'], 'tas') + + >>> n.get_groups(9) + ([], 9) + + """ + try: + if "/" not in address: + return [], address + except TypeError: + return [], address + + out = address.split("/")[1:] + return out[:-1], out[-1] + + def open(self, **kwargs): + """Return a dataset file object and address. + + When multiple files have been provided an attempt is made to + open each one, in the order stored, and a file object is + returned from the first file that exists. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + (`h5netcdf.File`, `str`) + The open file object, and the address of the data + within the file. + + """ + return super().open( + h5netcdf.File, mode="r", decode_vlen_strings=True, **kwargs + ) diff --git a/cfdm/data/mixin/__init__.py b/cfdm/data/mixin/__init__.py index e5dba5957c..fffb784f0a 100644 --- a/cfdm/data/mixin/__init__.py +++ b/cfdm/data/mixin/__init__.py @@ -1,2 +1,3 @@ from .arraymixin import ArrayMixin from .filearraymixin import FileArrayMixin +from .netcdffilemixin import NetCDFFileMixin diff --git a/cfdm/data/mixin/arraymixin.py b/cfdm/data/mixin/arraymixin.py index 9010877444..f3b1f8e4e9 100644 --- a/cfdm/data/mixin/arraymixin.py +++ b/cfdm/data/mixin/arraymixin.py @@ -1,4 +1,4 @@ -import numpy as np +from copy import deepcopy class ArrayMixin: @@ -76,30 +76,35 @@ def __docstring_package_depth__(self): """ return 0 - def _set_units(self): - """The units and calendar properties. + def get_attributes(self, default=ValueError()): + """The attributes of the array. - These are the values set during initialisation, defaulting to - `None` if either was not set at that time. + .. versionadded:: (cfdm) NEXTVERSION - .. versionadded:: (cfdm) 1.10.1.0 + :Parameters: + + default: optional + Return the value of the *default* parameter if the + attributes have not been set. If set to an `Exception` + instance then it will be raised instead. :Returns: - `tuple` - The units and calendar values, either of which may be - `None`. + `dict` + The attributes. """ - units = self.get_units(False) - if units is False: - self._set_component("units", None, copy=False) + attributes = self._get_component("attributes", None) + if attributes is None: + if default is None: + return - calendar = self.get_calendar(False) - if calendar is False: - self._set_component("calendar", None, copy=False) + return self._default( + default, + f"{self.__class__.__name__} attributes have not yet been set", + ) - return units, calendar + return deepcopy(attributes) def get_calendar(self, default=ValueError()): """The calendar of the array. @@ -122,8 +127,8 @@ def get_calendar(self, default=ValueError()): The calendar value. 
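+ + **Examples** + + >>> c = a.get_calendar(None)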
""" - calendar = self._get_component("calendar", False) - if calendar is False: + attributes = self.get_attributes({}) + if "calendar" not in attributes: if default is None: return @@ -132,7 +137,7 @@ def get_calendar(self, default=ValueError()): f"{self.__class__.__name__} 'calendar' has not been set", ) - return calendar + return attributes["calendar"] def get_compression_type(self): """Returns the array's compression type. @@ -162,112 +167,6 @@ def get_compression_type(self): """ return self._get_component("compression_type", "") - @classmethod - def get_subspace(cls, array, indices, copy=True): - """Return a subspace, defined by indices, of a numpy array. - - Only certain type of indices are allowed. See the *indices* - parameter for details. - - Indexing is similar to numpy indexing. Given the restrictions on - the type of indices allowed - see the *indicies* parameter - the - only difference to numpy indexing is - - * When two or more dimension's indices are sequences of integers - then these indices work independently along each dimension - (similar to the way vector subscripts work in Fortran). - - .. versionadded:: (cfdm) 1.8.7.0 - - :Parameters: - - array: `numpy.ndarray` - The array to be subspaced. - - indices: - The indices that define the subspace. - - Must be either `Ellipsis` or a sequence that contains an - index for each dimension. In the latter case, each - dimension's index must either be a `slice` object or a - sequence of two or more integers. - - *Parameter example:* - indices=Ellipsis - - *Parameter example:* - indices=[[5, 7, 8]] - - *Parameter example:* - indices=[slice(4, 7)] - - *Parameter example:* - indices=[slice(None), [5, 7, 8]] - - *Parameter example:* - indices=[[2, 5, 6], slice(15, 4, -2), [8, 7, 5]] - - copy: `bool` - If `False` then the returned subspace may (or may not) be - independent of the input *array*. By default the returned - subspace is independent of the input *array*. - - :Returns: - - `numpy.ndarray` - - """ - if indices is not Ellipsis: - if not isinstance(indices, tuple): - indices = (indices,) - - axes_with_list_indices = [ - i for i, x in enumerate(indices) if not isinstance(x, slice) - ] - n_axes_with_list_indices = len(axes_with_list_indices) - - if n_axes_with_list_indices < 2: - # ---------------------------------------------------- - # At most one axis has a list-of-integers index so we - # can do a normal numpy subspace - # ---------------------------------------------------- - array = array[tuple(indices)] - else: - # ---------------------------------------------------- - # At least two axes have list-of-integers indices so - # we can't do a normal numpy subspace - # ---------------------------------------------------- - n_indices = len(indices) - if n_axes_with_list_indices < n_indices: - # Apply subspace defined by slices - slices = [ - i if isinstance(i, slice) else slice(None) - for i in indices - ] - array = array[tuple(slices)] - - if n_axes_with_list_indices: - # Apply subspaces defined by lists (this - # methodology works for both numpy arrays and - # scipy sparse arrays). - lists = [slice(None)] * n_indices - for axis in axes_with_list_indices: - lists[axis] = indices[axis] - array = array[tuple(lists)] - lists[axis] = slice(None) - - if copy: - if np.ma.isMA(array) and not array.ndim: - # This is because numpy.ma.copy doesn't work for - # scalar arrays (at the moment, at least) - ma_array = np.ma.empty((), dtype=array.dtype) - ma_array[...] 
= array - array = ma_array - else: - array = array.copy() - - return array - def get_units(self, default=ValueError()): """The units of the array. @@ -290,8 +189,8 @@ def get_units(self, default=ValueError()): The units value. """ - units = self._get_component("units", False) - if units is False: + attributes = self.get_attributes({}) + if "units" not in attributes: if default is None: return @@ -300,4 +199,4 @@ def get_units(self, default=ValueError()): f"{self.__class__.__name__} 'units' have not been set", ) - return units + return attributes["units"] diff --git a/cfdm/data/mixin/filearraymixin.py b/cfdm/data/mixin/filearraymixin.py index 1655161651..f90a426730 100644 --- a/cfdm/data/mixin/filearraymixin.py +++ b/cfdm/data/mixin/filearraymixin.py @@ -1,8 +1,17 @@ +from copy import deepcopy from urllib.parse import urlparse +from s3fs import S3FileSystem + from ...functions import abspath +class DeprecationError(Exception): + """Deprecation error.""" + + pass + + class FileArrayMixin: """Mixin class for a file container of an array. @@ -10,6 +19,14 @@ class FileArrayMixin: """ + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + """ + return f"<{self.__class__.__name__}{self.shape}: {self}>" + def __str__(self): """Called by the `str` built-in function. @@ -63,7 +80,8 @@ def get_address(self, default=AttributeError()): """ addresses = self.get_addresses() - if len(addresses) == 1: + n = len(addresses) + if n == 1: return addresses[0] if default is None: @@ -176,11 +194,106 @@ def get_formats(self): """ return (self.get_format(),) * len(self.get_filenames()) + def get_missing_values(self): + """The missing values of the data. + + Deprecated at version NEXTVERSION. Use `get_attributes` instead. + + """ + raise DeprecationError( + f"{self.__class__.__name__}.get_missing_values was deprecated " + "at version NEXTVERSION and is no longer available. " + f"Use {self.__class__.__name__}.get_attributes instead." + ) # pragma: no cover + + def get_storage_options( + self, create_endpoint_url=True, filename=None, parsed_filename=None + ): + """Return `s3fs.S3FileSystem` options for accessing S3 files. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + create_endpoint_url: `bool`, optional + If True, the default, then create an + ``'endpoint_url'`` option if and only if one has not + already been provided. See *filename* and + *parsed_filename* for details. + + filename: `str`, optional + Used to set the ``'endpoint_url'`` option if it has + not been previously defined. Ignored if + *parsed_filename* has been set. + + parsed_filename: `urllib.parse.ParseResult`, optional + Used to set the ``'endpoint_url'`` option if it has + not been previously defined. By default the + ``'endpoint_url'`` option, if required, is set from + the file name returned by `get_filename`. + + :Returns: + + `dict` or `None` + The `s3fs.S3FileSystem` options. + + **Examples** + + >>> f.get_filename() + 's3://store/data/file.nc' + >>> f.get_storage_options(create_endpoint_url=False) + {} + >>> f.get_storage_options() + {'endpoint_url': 'https://store'} + >>> f.get_storage_options(filename='s3://other-store/data/file.nc') + {'endpoint_url': 'https://other-store'} + >>> f.get_storage_options(create_endpoint_url=False, + ... 
filename='s3://other-store/data/file.nc') + {} + + >>> f.get_storage_options() + {'key': 'scaleway-api-key...', + 'secret': 'scaleway-secretkey...', + 'endpoint_url': 'https://s3.fr-par.scw.cloud', + 'client_kwargs': {'region_name': 'fr-par'}} + + """ + storage_options = self._get_component("storage_options", None) + if not storage_options: + storage_options = {} + else: + storage_options = deepcopy(storage_options) + + client_kwargs = storage_options.get("client_kwargs", {}) + if ( + create_endpoint_url + and "endpoint_url" not in storage_options + and "endpoint_url" not in client_kwargs + ): + if parsed_filename is None: + if filename is None: + try: + filename = self.get_filename() + except AttributeError: + pass + else: + parsed_filename = urlparse(filename) + else: + parsed_filename = urlparse(filename) + + if parsed_filename is not None and parsed_filename.scheme == "s3": + # Derive endpoint_url from filename + storage_options["endpoint_url"] = ( + f"https://{parsed_filename.netloc}" + ) + + return storage_options + def open(self, func, *args, **kwargs): - """Return an open file object containing the data array. + """Return a dataset file object and address. When multiple files have been provided an attempt is made to - open each one, in the order stored, and an open file object is + open each one, in the order stored, and a file object is returned from the first file that exists. .. versionadded:: (cfdm) 1.10.1.0 @@ -195,9 +308,9 @@ def open(self, func, *args, **kwargs): :Returns: - `tuple` - The open file object, and the address of the data - within the file. + 2-`tuple` + The file object for the dataset, and the address of + the data within the file. """ # Loop round the files, returning as soon as we find one that @@ -208,17 +321,25 @@ def open(self, func, *args, **kwargs): if url.scheme == "file": # Convert a file URI into an absolute path filename = url.path + elif url.scheme == "s3": + # Create an openable S3 file object + storage_options = self.get_storage_options( + create_endpoint_url=True, parsed_filename=url + ) + fs = S3FileSystem(**storage_options) + filename = fs.open(url.path[1:], "rb") try: - nc = func(filename, *args, **kwargs) + dataset = func(filename, *args, **kwargs) except FileNotFoundError: continue except RuntimeError as error: raise RuntimeError(f"{error}: {filename}") - return nc, address + # Successfully opened a dataset, so return. + return dataset, address if len(filenames) == 1: - raise FileNotFoundError(f"No such netCDF file: {filenames[0]}") + raise FileNotFoundError(f"No such file: {filenames[0]}") - raise FileNotFoundError(f"No such netCDF files: {filenames}") + raise FileNotFoundError(f"No such files: {filenames}") diff --git a/cfdm/data/mixin/netcdffilemixin.py b/cfdm/data/mixin/netcdffilemixin.py new file mode 100644 index 0000000000..0e0f747b91 --- /dev/null +++ b/cfdm/data/mixin/netcdffilemixin.py @@ -0,0 +1,211 @@ +from ..numpyarray import NumpyArray + + +class DeprecationError(Exception): + """Deprecation error.""" + + pass + + +class NetCDFFileMixin: + """Mixin class for netCDF file arrays. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + def _group(self, dataset, groups): + """Return the group object containing a variable. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + dataset: `netCDF4.Dataset` or `h5netcdf.File` + The dataset containing the variable. + + groups: sequence of `str` + The definition of which group the variable is in. 
For + instance, if the variable is in group + ``/forecast/model`` then *groups* would be + ``['forecast', 'model']``. + + :Returns: + + `netCDF4.Dataset` or `netCDF4.Group` + or `h5netcdf.File` or `h5netcdf.Group` + The group object, which might be the root group. + + """ + for g in groups: + dataset = dataset.groups[g] + + return dataset + + def _set_attributes(self, var): + """Set the netCDF variable attributes. + + These are set from the netCDF variable attributes, but only if + they have not already been defined, either during {{class}} + instantiation or by a previous call to `_set_attributes`. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + var: `netCDF4.Variable` or `h5netcdf.Variable` + The netCDF variable. + + :Returns: + + `dict` + The attributes. + + """ + raise NotImplementedError( + f"Must implement {self.__class__.__name__}._set_attributes" + ) # pragma: no cover + + @property + def array(self): + """Return an independent numpy array containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + + **Examples** + + >>> n = numpy.asanyarray(a) + >>> isinstance(n, numpy.ndarray) + True + + """ + return self[...] + + def close(self, dataset): + """Close the dataset containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Parameters: + + dataset: + The dataset to be closed. + + :Returns: + + `None` + + """ + if self._get_component("close"): + dataset.close() + + def get_format(self): + """The format of the files. + + .. versionadded:: (cfdm) 1.10.1.0 + + .. seealso:: `get_address`, `get_filename`, `get_formats` + + :Returns: + + `str` + The file format. Always ``'nc'``, signifying netCDF. + + **Examples** + + >>> a.get_format() + 'nc' + + """ + return "nc" + + def get_mask(self): + """Whether or not to automatically mask the data. + + .. versionadded:: (cfdm) 1.8.2 + + **Examples** + + >>> b = a.get_mask() + + """ + return self._get_component("mask") + + def get_missing_values(self, default=ValueError()): + """The missing value indicators from the netCDF variable. + + Deprecated at version NEXTVERSION. Use `get_attributes` instead. + + .. versionadded:: (cfdm) 1.10.0.3 + + :Parameters: + + default: optional + Return the value of the *default* parameter if no missing + values have yet been defined. + + {{default Exception}} + + :Returns: + + `dict` or `None` + The missing value indicators from the netCDF variable, + keyed by their netCDF attribute names. An empty + dictionary signifies that no missing values are given + in the file. `None` signifies that the missing values + have not been set. + + **Examples** + + >>> a.get_missing_values(None) + None + + >>> b.get_missing_values({}) + {} + + >>> b.get_missing_values() + {} + + >>> c.get_missing_values() + {'missing_value': 1e20, 'valid_range': (-10, 20)} + + >>> d.get_missing_values() + {'valid_min': -999} + + """ + raise DeprecationError( + f"{self.__class__.__name__}.get_missing_values was deprecated " + "at version NEXTVERSION and is no longer available. " + f"Use {self.__class__.__name__}.get_attributes instead." + ) + + def get_unpack(self): + """Whether or not to automatically unpack the data. + + .. versionadded:: (cfdm) NEXTVERSION + + **Examples** + + >>> a.get_unpack() + True + + """ + return self._get_component("unpack") + + def to_memory(self): + """Bring data on disk into memory. + + .. versionadded:: (cfdm) 1.7.0 + + :Returns: + + `NumpyArray` + The new array with all of its data in memory. 
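+ + **Examples** + + >>> n = a.to_memory()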
+ + """ + return NumpyArray(self[...]) diff --git a/cfdm/data/netcdf4array.py b/cfdm/data/netcdf4array.py new file mode 100644 index 0000000000..e76a0d5d5f --- /dev/null +++ b/cfdm/data/netcdf4array.py @@ -0,0 +1,353 @@ +import netCDF4 + +from . import abstract +from .mixin import FileArrayMixin, NetCDFFileMixin +from .netcdfindexer import netcdf_indexer + + +class NetCDF4Array(NetCDFFileMixin, FileArrayMixin, abstract.Array): + """A netCDF array accessed with `netCDF4`. + + .. versionadded:: (cfdm) 1.7.0 + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + mask=True, + unpack=True, + attributes=None, + storage_options=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of) `str`, optional + The name of the netCDF file(s) containing the array. + + address: (sequence of) `str` or `int`, optional + The identity of the netCDF variable in each file + defined by *filename*. Either a netCDF variable name + or an integer netCDF variable ID. + + .. versionadded:: (cfdm) 1.10.1.0 + + dtype: `numpy.dtype` + The data type of the array in the netCDF file. May be + `None` if the numpy data-type is not known (which can be + the case for netCDF string types, for example). + + shape: `tuple` + The array dimension sizes in the netCDF file. + {{init mask: `bool`, optional}} + + .. versionadded:: (cfdm) 1.8.2 + + {{init unpack: `bool`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + + {{init attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the netCDF variable during + the first `__getitem__` call. + + .. versionadded:: (cfdm) NEXTVERSION + + {{init storage_options: `dict` or `None`, optional}} + + .. versionadded:: (cfdm) NEXTVERSION + + {{init source: optional}} + + .. versionadded:: (cfdm) 1.10.0.0 + + {{init copy: `bool`, optional}} + + .. versionadded:: (cfdm) 1.10.0.0 + + missing_values: Deprecated at version NEXTVERSION + The missing value indicators defined by the netCDF + variable attributes. They may now be recorded via the + *attributes* parameter + + ncvar: Deprecated at version 1.10.1.0 + Use the *address* parameter instead. + + varid: Deprecated at version 1.10.1.0 + Use the *address* parameter instead. + + group: Deprecated at version 1.10.1.0 + Use the *address* parameter instead. + + units: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. 
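+ + **Examples** + + >>> import cfdm + >>> a = cfdm.NetCDF4Array('file.nc', 'tas', shape=(12, 64, 128)) + >>> a.shape + (12, 64, 128)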
+ + """ + super().__init__(source=source, copy=copy) + + if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + filename = source._get_component("filename", None) + except AttributeError: + filename = None + + try: + address = source._get_component("address", None) + except AttributeError: + address = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + mask = source._get_component("mask", True) + except AttributeError: + mask = True + + try: + unpack = source._get_component("unpack", True) + except AttributeError: + unpack = True + + try: + attributes = source._get_component("attributes", None) + except AttributeError: + attributes = None + + try: + storage_options = source._get_component( + "storage_options", None + ) + except AttributeError: + storage_options = None + + if shape is not None: + self._set_component("shape", shape, copy=False) + + if filename is not None: + if isinstance(filename, str): + filename = (filename,) + else: + filename = tuple(filename) + + self._set_component("filename", filename, copy=False) + + if address is not None: + if isinstance(address, (str, int)): + address = (address,) + else: + address = tuple(address) + + self._set_component("address", address, copy=False) + + self._set_component("dtype", dtype, copy=False) + self._set_component("mask", bool(mask), copy=False) + self._set_component("unpack", bool(unpack), copy=False) + self._set_component("storage_options", storage_options, copy=False) + self._set_component("attributes", attributes, copy=False) + + # By default, close the netCDF file after data array access + self._set_component("close", True, copy=False) + + def __getitem__(self, indices): + """Returns a subspace of the array as a numpy array. + + x.__getitem__(indices) <==> x[indices] + + The indices that define the subspace must be either `Ellipsis` or + a sequence that contains an index for each dimension. In the + latter case, each dimension's index must either be a `slice` + object or a sequence of two or more integers. + + Indexing is similar to numpy indexing. The only difference to + numpy indexing (given the restrictions on the type of indices + allowed) is: + + * When two or more dimension's indices are sequences of integers + then these indices work independently along each dimension + (similar to the way vector subscripts work in Fortran). + + .. versionadded:: (cfdm) 1.7.0 + + """ + netcdf, address = self.open() + dataset = netcdf + + groups, address = self.get_groups(address) + if groups: + # Traverse the group structure, if there is one (CF>=1.8). + netcdf = self._group(netcdf, groups) + + if isinstance(address, str): + # Get the variable by netCDF name + variable = netcdf.variables[address] + else: + # Get the variable by netCDF integer ID + for variable in netcdf.variables.values(): + if variable._varid == address: + break + + # Get the data, applying masking and scaling as required. + array = netcdf_indexer( + variable, + mask=self.get_mask(), + unpack=self.get_unpack(), + always_masked_array=False, + orthogonal_indexing=True, + copy=False, + ) + array = array[indices] + + # Set the attributes, if they haven't been set already. + self._set_attributes(variable) + + self.close(dataset) + del netcdf, dataset + + if not self.ndim: + # Hmm netCDF4 has a thing for making scalar size 1, 1d + array = array.squeeze() + + return array + + def __repr__(self): + """Called by the `repr` built-in function. 
+ + x.__repr__() <==> repr(x) + + """ + return f"<{self.__class__.__name__}{self.shape}: {self}>" + + def __str__(self): + """Called by the `str` built-in function. + + x.__str__() <==> str(x) + + """ + return f"{self.get_filename(None)}, {self.get_address()}" + + def _set_attributes(self, var): + """Set the netCDF variable attributes. + + These are set from the netCDF variable attributes, but only if + they have not already been defined, either during `{{class}}` + instantiation or by a previous call to `_set_attributes`. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + var: `netCDF4.Variable` + The netCDF variable. + + :Returns: + + `dict` + The attributes. + + """ + attributes = self._get_component("attributes", None) + if attributes is not None: + return + + attributes = {attr: var.getncattr(attr) for attr in var.ncattrs()} + self._set_component("attributes", attributes, copy=False) + + def get_groups(self, address): + """The netCDF4 group structure of a netCDF variable. + + .. versionadded:: (cfdm) 1.8.6.0 + + :Parameters: + + address: `str` or `int` + The netCDF variable name, or integer varid, from which + to get the groups. + + .. versionadded:: (cfdm) 1.10.1.0 + + :Returns: + + (`list`, `str`) or (`list`, `int`) + The group structure and the name within the group. If + *address* is a varid then an empty list and the varid + are returned. + + **Examples** + + >>> n.get_groups('tas') + ([], 'tas') + + >>> n.get_groups('/tas') + ([], 'tas') + + >>> n.get_groups('/data/model/tas') + (['data', 'model'], 'tas') + + >>> n.get_groups(9) + ([], 9) + + """ + try: + if "/" not in address: + return [], address + except TypeError: + return [], address + + out = address.split("/")[1:] + return out[:-1], out[-1] + + def close(self, dataset): + """Close the dataset containing the data. + + .. versionadded:: (cfdm) 1.7.0 + + :Parameters: + + dataset: `netCDF4.Dataset` + The netCDF dataset to be closed. + + :Returns: + + `None` + + """ + if self._get_component("close"): + dataset.close() + + def open(self): + """Return a dataset file object and address. + + When multiple files have been provided an attempt is made to + open each one, in the order stored, and a file object is + returned from the first file that exists. + + :Returns: + + (`netCDF4.Dataset`, `str`) + The file object open in read-only mode, and the + address of the data within the file. + + """ + return super().open(netCDF4.Dataset, mode="r") diff --git a/cfdm/data/netcdfarray.py b/cfdm/data/netcdfarray.py index 247c2747cf..60a4cbfe69 100644 --- a/cfdm/data/netcdfarray.py +++ b/cfdm/data/netcdfarray.py @@ -1,13 +1,14 @@ -import netCDF4 -import numpy as np +class DeprecationError(Exception): + """Deprecation error.""" -from . import abstract -from .mixin import FileArrayMixin -from .numpyarray import NumpyArray + pass -class NetCDFArray(FileArrayMixin, abstract.Array): - """An underlying array stored in a netCDF file. +class NetCDFArray: + """A netCDF array accessed with `netCDF4`. + + Deprecated at version NEXTVERSION and is no longer available. Use + `cfdm.NetCDF4Array` instead. .. versionadded:: (cfdm) 1.7.0 @@ -20,9 +21,11 @@ def __init__( dtype=None, shape=None, mask=True, + unpack=True, units=False, calendar=False, - missing_values=None, + attributes=None, + storage_options=None, source=None, copy=True, ): @@ -54,14 +57,7 @@ def __init__( ndim: `int` The number of array dimensions in the netCDF file. - mask: `bool` - If True (the default) then mask by convention when - reading data from disk. 
- - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. + {{init mask: `bool`, optional}} .. versionadded:: (cfdm) 1.8.2 @@ -80,13 +76,6 @@ def __init__( .. versionadded:: (cfdm) 1.10.0.1 - missing_values: `dict`, optional - The missing value indicators defined by the netCDF - variable attributes. See `get_missing_values` for - details. - - .. versionadded:: (cfdm) 1.10.0.3 - {{init source: optional}} .. versionadded:: (cfdm) 1.10.0.0 @@ -95,6 +84,11 @@ def __init__( .. versionadded:: (cfdm) 1.10.0.0 + missing_values: Deprecated at version NEXTVERSION + The missing value indicators defined by the netCDF + variable attributes. They may now be recorded via the + *attributes* parameter + ncvar: Deprecated at version 1.10.1.0 Use the *address* parameter instead. @@ -105,413 +99,7 @@ def __init__( Use the *address* parameter instead. """ - super().__init__(source=source, copy=copy) - - if source is not None: - try: - shape = source._get_component("shape", None) - except AttributeError: - shape = None - - try: - filename = source._get_component("filename", None) - except AttributeError: - filename = None - - try: - address = source._get_component("address", None) - except AttributeError: - address = None - - try: - dtype = source._get_component("dtype", None) - except AttributeError: - dtype = None - - try: - mask = source._get_component("mask", True) - except AttributeError: - mask = True - - try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", False) - except AttributeError: - calendar = False - - try: - missing_values = source._get_component("missing_values", None) - except AttributeError: - missing_values = None - - if shape is not None: - self._set_component("shape", shape, copy=False) - - if filename is not None: - if isinstance(filename, str): - filename = (filename,) - else: - filename = tuple(filename) - - self._set_component("filename", filename, copy=False) - - if address is not None: - if isinstance(address, (str, int)): - address = (address,) - else: - address = tuple(address) - - self._set_component("address", address, copy=False) - - if missing_values is not None: - self._set_component( - "missing_values", missing_values.copy(), copy=False - ) - - self._set_component("dtype", dtype, copy=False) - self._set_component("mask", mask, copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) - - # By default, close the netCDF file after data array access - self._set_component("close", True, copy=False) - - def __getitem__(self, indices): - """Returns a subspace of the array as a numpy array. - - x.__getitem__(indices) <==> x[indices] - - The indices that define the subspace must be either `Ellipsis` or - a sequence that contains an index for each dimension. In the - latter case, each dimension's index must either be a `slice` - object or a sequence of two or more integers. - - Indexing is similar to numpy indexing. The only difference to - numpy indexing (given the restrictions on the type of indices - allowed) is: - - * When two or more dimension's indices are sequences of integers - then these indices work independently along each dimension - (similar to the way vector subscripts work in Fortran). - - .. 
versionadded:: (cfdm) 1.7.0 - - """ - netcdf, address = self.open() - dataset = netcdf - - mask = self.get_mask() - groups, address = self.get_groups(address) - - if groups: - # Traverse the group structure, if there is one (CF>=1.8). - for g in groups[:-1]: - netcdf = netcdf.groups[g] - - netcdf = netcdf.groups[groups[-1]] - - if isinstance(address, str): - # Get the variable by netCDF name - variable = netcdf.variables[address] - variable.set_auto_mask(mask) - array = variable[indices] - else: - # Get the variable by netCDF integer ID - for variable in netcdf.variables.values(): - if variable._varid == address: - variable.set_auto_mask(mask) - array = variable[indices] - break - - # Set the units, if they haven't been set already. - self._set_units(variable) - - self.close(dataset) - del netcdf, dataset - - string_type = isinstance(array, str) - if string_type: - # -------------------------------------------------------- - # A netCDF string type scalar variable comes out as Python - # str object, so convert it to a numpy array. - # -------------------------------------------------------- - array = np.array(array, dtype=f"U{len(array)}") - - if not self.ndim: - # Hmm netCDF4 has a thing for making scalar size 1, 1d - array = array.squeeze() - - kind = array.dtype.kind - if not string_type and kind in "SU": - # == 'S' and array.ndim > (self.ndim - - # getattr(self, 'gathered', 0) - - # getattr(self, 'ragged', 0)): - # -------------------------------------------------------- - # Collapse (by concatenation) the outermost (fastest - # varying) dimension of char array into - # memory. E.g. [['a','b','c']] becomes ['abc'] - # -------------------------------------------------------- - if kind == "U": - array = array.astype("S", copy=False) - - array = netCDF4.chartostring(array) - shape = array.shape - array = np.array([x.rstrip() for x in array.flat], dtype="U") - array = np.reshape(array, shape) - array = np.ma.masked_where(array == "", array) - elif not string_type and kind == "O": - # -------------------------------------------------------- - # A netCDF string type N-d (N>=1) variable comes out as a - # numpy object array, so convert it to numpy string array. - # -------------------------------------------------------- - array = array.astype("U", copy=False) - - # -------------------------------------------------------- - # netCDF4 does not auto-mask VLEN variable, so do it here. - # -------------------------------------------------------- - array = np.ma.where(array == "", np.ma.masked, array) - - return array - - def __repr__(self): - """Called by the `repr` built-in function. - - x.__repr__() <==> repr(x) - - """ - return f"<{self.__class__.__name__}{self.shape}: {self}>" - - def __str__(self): - """Called by the `str` built-in function. - - x.__str__() <==> str(x) - - """ - return f"{self.get_filename(None)}, {self.get_address()}" - - def _set_units(self, var): - """The units and calendar properties. - - These are set from the netCDF variable attributes, but only if - they have already not been defined, either during {{class}} - instantiation or by a previous call to `_set_units`. - - .. versionadded:: (cfdm) 1.10.0.1 - - :Parameters: - - var: `netCDF4.Variable` - The variable containing the units and calendar - definitions. - - :Returns: - - `tuple` - The units and calendar values, either of which may be - `None`. - - """ - # Note: Can't use None as the default since it is a valid - # `units` or 'calendar' value that indicates that the - # attribute has not been set in the dataset. 
- units = self._get_component("units", False) - if units is False: - try: - units = var.getncattr("units") - except AttributeError: - units = None - - self._set_component("units", units, copy=False) - - calendar = self._get_component("calendar", False) - if calendar is False: - try: - calendar = var.getncattr("calendar") - except AttributeError: - calendar = None - - self._set_component("calendar", calendar, copy=False) - - return units, calendar - - @property - def array(self): - """Return an independent numpy array containing the data. - - .. versionadded:: (cfdm) 1.7.0 - - :Returns: - - `numpy.ndarray` - An independent numpy array of the data. - - **Examples** - - >>> n = numpy.asanyarray(a) - >>> isinstance(n, numpy.ndarray) - True - - """ - return self[...] - - def get_format(self): - """The format of the files. - - .. versionadded:: (cfdm) 1.10.1.0 - - .. seealso:: `get_address`, `get_filename`, `get_formats` - - :Returns: - - `str` - The file format. Always ``'nc'``, signifying netCDF. - - **Examples** - - >>> a.get_format() - 'nc' - - """ - return "nc" - - def get_groups(self, address): - """The netCDF4 group structure of a netCDF variable. - - .. versionadded:: (cfdm) 1.8.6.0 - - :Parameters: - - address: `str` or `int` - The netCDF variable name, or integer varid, from which - to get the groups. - - .. versionadded:: (cfdm) 1.10.1.0 - - :Returns: - - (`list`, `str`) or (`list`, `int`) - The group structure and the name within the group. If - *address* is a varid then an empty list and the varid - are returned. - - **Examples** - - >>> n.get_groups('tas') - ([], 'tas') - - >>> n.get_groups('/tas') - ([], 'tas') - - >>> n.get_groups('/data/model/tas') - (['data', 'model'], 'tas') - - >>> n.get_groups(9) - ([], 9) - - """ - try: - if "/" not in address: - return [], address - except TypeError: - return [], address - - out = address.split("/")[1:] - return out[:-1], out[-1] - - def get_mask(self): - """Whether or not to automatically mask the data. - - .. versionadded:: (cfdm) 1.8.2 - - **Examples** - - >>> b = a.get_mask() - - """ - return self._get_component("mask") - - def get_missing_values(self): - """The missing value indicators from the netCDF variable. - - .. versionadded:: (cfdm) 1.10.0.3 - - :Returns: - - `dict` or `None` - The missing value indicators from the netCDF variable, - keyed by their netCDF attribute names. An empty - dictionary signifies that no missing values are given - in the file. `None` signifies that the missing values - have not been set. - - **Examples** - - >>> a.get_missing_values() - None - - >>> b.get_missing_values() - {} - - >>> c.get_missing_values() - {'missing_value': 1e20, 'valid_range': (-10, 20)} - - >>> d.get_missing_values() - {'valid_min': -999} - - """ - out = self._get_component("missing_values", None) - if out is None: - return - - return out.copy() - - def close(self, dataset): - """Close the dataset containing the data. - - .. versionadded:: (cfdm) 1.7.0 - - :Parameters: - - dataset: `netCDF4.Dataset` - The netCDF dataset to be be closed. - - :Returns: - - `None` - - """ - if self._get_component("close"): - dataset.close() - - def open(self): - """Return an open file object containing the data array. - - When multiple files have been provided an attempt is made to - open each one, in the order stored, and an open file object is - returned from the first file that exists. - - :Returns: - - (`netCDF4.Dataset`, `str`) - The open file object, and the address of the data - within the file. 
-
-        """
-        return super().open(netCDF4.Dataset, mode="r")
-
-    def to_memory(self):
-        """Bring data on disk into memory.
-
-        .. versionadded:: (cfdm) 1.7.0
-
-        :Returns:
-
-            `NumpyArray`
-                The new with all of its data in memory.
-
-        """
-        return NumpyArray(self[...])
+        raise DeprecationError(
+            f"{self.__class__.__name__} was deprecated at version NEXTVERSION "
+            "and is no longer available. Use cfdm.NetCDF4Array instead."
+        )
diff --git a/cfdm/data/netcdfindexer.py b/cfdm/data/netcdfindexer.py
new file mode 100644
index 0000000000..28242d7457
--- /dev/null
+++ b/cfdm/data/netcdfindexer.py
@@ -0,0 +1,922 @@
+"""A data indexer that applies netCDF masking and unpacking.
+
+Portions of this code were adapted from the `netCDF4` Python library,
+which carries the following MIT License:
+
+Copyright 2008 Jeffrey Whitaker
+
+https://opensource.org/license/mit
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+"""
+
+import logging
+from math import prod
+from numbers import Integral
+
+import numpy as np
+from dask.array.slicing import normalize_index
+from netCDF4 import chartostring, default_fillvals
+from netCDF4.utils import _safecast
+
+logger = logging.getLogger(__name__)
+
+
+class netcdf_indexer:
+    """A data indexer that also applies netCDF masking and unpacking.
+
+    Indexing may be orthogonal or non-orthogonal. Orthogonal indexing
+    means that the index for each dimension is applied independently,
+    regardless of how that index was defined. For instance, the
+    indices ``[[0, 1], [1, 3], 0]`` and ``[:2, 1:4:2, 0]`` will give
+    identical results. Orthogonal indexing is different to the
+    indexing behaviour of `numpy`. Non-orthogonal indexing means that
+    normal `numpy` indexing rules are applied.
+
+    In addition, string and character variables are always converted
+    to unicode arrays, the latter with the last dimension
+    concatenated.
+
+    Masking and unpacking operations, either or both of which may be
+    disabled via initialisation options, are defined by the
+    conventions for netCDF attributes, which are either provided as
+    part of the input *variable* object or given with the input
+    *attributes* parameter.
+
+    The relevant netCDF attributes that are considered are:
+
+    * For masking: ``missing_value``, ``valid_max``, ``valid_min``,
+                   ``valid_range``, ``_FillValue``, and
+                   ``_Unsigned``. Note that if ``_FillValue`` is not
+                   present then the netCDF default value for the
+                   appropriate data type will be assumed, as defined
+                   by `netCDF4.default_fillvals`.
+
+    * For unpacking: ``add_offset``, ``scale_factor``, and
+                     ``_Unsigned``
+
+    .. 
versionadded:: (cfdm) NEXTVERSION + + **Examples** + + >>> import netCDF4 + >>> nc = netCDF4.Dataset('file.nc', 'r') + >>> x = cfdm.netcdf_indexer(nc.variables['x']) + >>> x.shape + (12, 64, 128) + >>> print(x[0, 0:4, 0:3]) + [[236.5, 236.2, 236.0], + [240.9, -- , 239.6], + [243.4, 242.4, 241.3], + [243.1, 241.7, 240.4]] + + >>> import h5netcdf + >>> h5 = h5netcdf.File('file.nc', 'r') + >>> x = cfdm.netcdf_indexer(h5.variables['x']) + >>> x.shape + (12, 64, 128) + >>> print(x[0, 0:4, 0:3]) + [[236.5, 236.2, 236.0], + [240.9, -- , 239.6], + [243.4, 242.4, 241.3], + [243.1, 241.7, 240.4]] + + >>> import numpy as np + >>> n = np.arange(7) + >>> x = cfdm.netcdf_indexer(n) + >>> x.shape + (7,) + >>> print(x[...]) + [0 1 2 3 4 5 6] + >>> x = cfdm.netcdf_indexer(n, attributes={'_FillValue': 4}) + >>> print(x[...]) + [0 1 2 3 -- 5 6] + >>> x = cfdm.netcdf_indexer(n, mask=False, attributes={'_FillValue': 4}) + >>> print(x[...]) + [0 1 2 3 4 5 6] + + """ + + def __init__( + self, + variable, + mask=True, + unpack=True, + always_masked_array=False, + orthogonal_indexing=False, + attributes=None, + copy=False, + ): + """**Initialisation** + + :Parameters: + + variable: + The variable to be indexed. May be any variable that + has the same API as one of `numpy.ndarray`, + `netCDF4.Variable`, or `h5py.Variable` (which includes + `h5netcdf.Variable`). Any masking and unpacking that + could be applied by *variable* itself (e.g. by a + `netCDF4.Variable` instance) is disabled, ensuring + that any masking and unpacking is always done by the + `netcdf_indexer` instance. + + mask: `bool`, optional + If True, the default, then an array returned by + indexing is automatically masked. Masking is + determined by the netCDF conventions for the following + attributes: ``_FillValue``, ``missing_value``, + ``_Unsigned``, ``valid_max``, ``valid_min``, and + ``valid_range``. + + unpack: `bool`, optional + If True, the default, then an array returned by + indexing is automatically unpacked. Unpacking is + determined by the netCDF conventions for the following + attributes: ``add_offset``, ``scale_factor``, and + ``_Unsigned``. + + always_masked_array: `bool`, optional + If False, the default, then an array returned by + indexing which has no missing values is created as a + regular `numpy` array. If True then an array returned + by indexing is always a masked `numpy` array, even if + there are no missing values. + + orthogonal_indexing: `bool`, optional + If True then indexing is orthogonal, meaning that the + index for each dimension is applied independently, + regardless of how that index was defined. For + instance, the indices ``[[0, 1], [1, 3], 0]`` and + ``[:2, 1:4:2, 0]`` will give identical results. This + behaviour is different to that of `numpy`. If False, + the default, then normal `numpy` indexing rules are + applied. + + attributes: `dict`, optional + Provide netCDF attributes for the *variable* as a + dictionary of key/value pairs. Only the attributes + relevant to masking and unpacking are considered, with + all other attributes being ignored. If *attributes* is + `None`, the default, then the netCDF attributes stored + by *variable* (if any) are used. If *attributes* is + not `None`, then any netCDF attributes stored by + *variable* are ignored. + + copy: `bool`, optional + If True then return a `numpy` array that is not a view + of part of the original data, i.e. in-place + changes to the returned subspace will not affect the + original *variable*. 
This is done by returning an
+                in-memory copy of the subspace. If False, the
+                default, no in-memory copy is made, and then whether
+                or not in-place changes to the returned subspace
+                affect *variable* will depend on how subspacing is
+                implemented by *variable*.
+
+        """
+        self.variable = variable
+        self.mask = bool(mask)
+        self.unpack = bool(unpack)
+        self.always_masked_array = bool(always_masked_array)
+        self._attributes = attributes
+        self._copy = bool(copy)
+        self._orthogonal_indexing = bool(orthogonal_indexing)
+
+    def __getitem__(self, index):
+        """Return a subspace of the variable as a `numpy` array.
+
+        n.__getitem__(index) <==> n[index]
+
+        If `__orthogonal_indexing__` is True then indexing is
+        orthogonal. If `__orthogonal_indexing__` is False then normal
+        `numpy` indexing rules are applied.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        """
+        variable = self.variable
+        unpack = self.unpack
+        attributes = self.attributes()
+        dtype = variable.dtype
+
+        # Prevent a netCDF4 variable from doing its own masking and
+        # unpacking during the indexing
+        netCDF4_scale = False
+        netCDF4_mask = False
+        try:
+            netCDF4_scale = variable.scale
+        except AttributeError:
+            # Not a netCDF4 variable
+            pass
+        else:
+            netCDF4_mask = variable.mask
+            variable.set_auto_maskandscale(False)
+
+        # ------------------------------------------------------------
+        # Index the variable
+        # ------------------------------------------------------------
+        data = self._index(index)
+
+        # Reset the netCDF4 variable's scale and mask behaviour
+        if netCDF4_scale:
+            variable.set_auto_scale(True)
+
+        if netCDF4_mask:
+            variable.set_auto_mask(True)
+
+        # Convert str, char, and object data to byte strings
+        if isinstance(data, str):
+            data = np.array(data, dtype="S")
+        elif data.dtype.kind in "OSU":
+            kind = data.dtype.kind
+            if kind == "S":
+                data = chartostring(data)
+
+            # Assume that object arrays are arrays of strings
+            data = data.astype("S", copy=False)
+            if kind == "O":
+                dtype = data.dtype
+
+        if dtype is str:
+            dtype = data.dtype
+
+        dtype_unsigned_int = None
+        if unpack:
+            is_unsigned_int = attributes.get("_Unsigned") in ("true", "True")
+            if is_unsigned_int:
+                data_dtype = data.dtype
+                dtype_unsigned_int = (
+                    f"{data_dtype.byteorder}u{data_dtype.itemsize}"
+                )
+                data = data.view(dtype_unsigned_int)
+
+        # ------------------------------------------------------------
+        # Mask the data
+        # ------------------------------------------------------------
+        if self.mask:
+            data = self._mask(data, dtype, attributes, dtype_unsigned_int)
+
+        # ------------------------------------------------------------
+        # Unpack the data
+        # ------------------------------------------------------------
+        if unpack:
+            data = self._unpack(data, attributes)
+
+        # Make sure all strings are unicode
+        if data.dtype.kind == "S":
+            data = data.astype("U", copy=False)
+
+        # ------------------------------------------------------------
+        # Copy the data
+        # ------------------------------------------------------------
+        if self._copy:
+            data = data.copy()
+
+        return data
+
+    @property
+    def __orthogonal_indexing__(self):
+        """Flag to indicate whether indexing is orthogonal.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        """
+        return self._orthogonal_indexing
+
+    def _check_safecast(self, attr, dtype, attributes):
+        """Check an attribute's data type.
+
+        Checks that the attribute exists and that its value can be
+        safely cast to the variable's data type.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            attr: `str`
+                The name of the attribute.
+
+            dtype: `numpy.dtype`
+                The variable data type.
+
+            attributes: `dict`
+                The variable attributes.
+
+        :Returns:
+
+            `bool`, value
+                Whether or not the attribute data type is consistent
+                with the variable data type, and the attribute value.
+
+        """
+        if attr in attributes:
+            attvalue = attributes[attr]
+            att = np.array(attvalue)
+        else:
+            return False, None
+
+        try:
+            atta = np.array(att, dtype)
+        except ValueError:
+            safe = False
+        else:
+            safe = _safecast(att, atta)
+
+        if not safe:
+            logger.info(
+                f"Mask attribute {attr!r} not used since it can't "
+                f"be safely cast to variable data type {dtype!r}"
+            )  # pragma: no cover
+
+        return safe, attvalue
+
+    def _default_FillValue(self, dtype):
+        """Return the default ``_FillValue`` for the given data type.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        .. seealso:: `netCDF4.default_fillvals`
+
+        :Parameters:
+
+            dtype: `numpy.dtype`
+                The data type.
+
+        :Returns:
+
+            The default ``_FillValue``.
+
+        """
+        if dtype.kind in "OS":
+            return default_fillvals["S1"]
+
+        return default_fillvals[dtype.str[1:]]
+
+    def _index(self, index):
+        """Get a subspace of the variable.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        .. seealso:: `__getitem__`
+
+        :Parameters:
+
+            index:
+                The indices that define the subspace.
+
+        :Returns:
+
+            `numpy.ndarray`
+                The subspace of the variable.
+
+        """
+        data = self.variable
+        if index is Ellipsis:
+            return data[...]
+
+        index = normalize_index(index, data.shape)
+
+        # Find the positions of any list/1-d array indices (which by
+        # now will contain only integers)
+        axes_with_list_indices = [
+            n
+            for n, i in enumerate(index)
+            if isinstance(i, list) or getattr(i, "shape", False)
+        ]
+
+        data_orthogonal_indexing = getattr(
+            data, "__orthogonal_indexing__", False
+        )
+        if not self.__orthogonal_indexing__:
+            # --------------------------------------------------------
+            # Do non-orthogonal indexing
+            # --------------------------------------------------------
+            if data_orthogonal_indexing and len(axes_with_list_indices) > 1:
+                raise IndexError(
+                    "Can't non-orthogonally index "
+                    f"{data.__class__.__name__} with index {index!r}"
+                )
+
+            return data[index]
+
+        # ------------------------------------------------------------
+        # Still here? Then do orthogonal indexing.
+        # ------------------------------------------------------------
+
+        # Create an index that replaces integers with size 1 slices,
+        # so that their axes are not dropped yet (they will be dropped
+        # later).
+        index0 = [
+            slice(i, i + 1) if isinstance(i, Integral) else i for i in index
+        ]
+
+        if data_orthogonal_indexing or len(axes_with_list_indices) <= 1:
+            # There is at most one list/1-d array index, and/or the
+            # variable natively supports orthogonal indexing.
+            #
+            # Note: `netCDF4.Variable` natively supports orthogonal
+            #       indexing; but `h5netcdf.File`, `h5py.File`, and
+            #       `numpy.ndarray` do not.
+            data = data[tuple(index0)]
+        else:
+            # There are two or more list/1-d array indices, and the
+            # variable does not natively support orthogonal indexing
+            # => emulate orthogonal indexing with a sequence of
+            # independent subspaces, one for each list/1-d array
+            # index.
+
+            # 1) Apply the slice indices at the same time as the
+            #    list/1-d array index that gives the smallest result.
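+            #
+            #    For example (illustrative): for the index
+            #    ([1, 3], slice(None), [0, 5, 9]) applied to data
+            #    with shape (100, 10, 100), the [1, 3] index is
+            #    applied first, since that gives the smaller
+            #    intermediate subspace ((2, 10, 100) = 2000 elements,
+            #    versus (100, 10, 3) = 3000 elements if [0, 5, 9]
+            #    were applied first).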
+
+            # Create an index that replaces each list/1-d array with
+            # slice(None)
+            index1 = [
+                i if isinstance(i, slice) else slice(None) for i in index0
+            ]
+
+            # Find the position of the list/1-d array index that gives
+            # the smallest result, and apply the subspace of slices
+            # and the chosen list/1-d array index. This will give the
+            # smallest high-water memory mark of the whole operation.
+            shape1 = self.index_shape(index1, data.shape)
+            size1 = prod(shape1)
+            sizes = [
+                size1 * (len(index[i]) // shape1[i])
+                for i in axes_with_list_indices
+            ]
+            n = axes_with_list_indices.pop(np.argmin(sizes))
+            index1[n] = index[n]
+            data = data[tuple(index1)]
+
+            # 2) Apply the rest of the list/1-d array indices, in the
+            #    order that gives the smallest result after each step.
+            ndim = data.ndim
+            while axes_with_list_indices:
+                shape1 = data.shape
+                size1 = data.size
+                sizes = [
+                    len(index[i]) * size1 // shape1[i]
+                    for i in axes_with_list_indices
+                ]
+                n = axes_with_list_indices.pop(np.argmin(sizes))
+                index2 = [slice(None)] * ndim
+                index2[n] = index[n]
+                data = data[tuple(index2)]
+
+        # Apply any integer indices that will drop axes
+        index3 = [0 if isinstance(i, Integral) else slice(None) for i in index]
+        if index3:
+            data = data[tuple(index3)]
+
+        return data
+
+    def _mask(self, data, dtype, attributes, dtype_unsigned_int):
+        """Mask the data.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            data: `numpy.ndarray`
+                The unmasked and (possibly) packed data.
+
+            dtype: `numpy.dtype`
+                The data type of the variable (which may be different
+                to that of *data*).
+
+            attributes: `dict`
+                The variable attributes.
+
+            dtype_unsigned_int: `dtype` or `None`
+                The data type when the data have been cast to unsigned
+                integers, otherwise `None`.
+
+        :Returns:
+
+            `numpy.ndarray`
+                The masked data.
+
+        """
+        # The Boolean mask accounting for all methods of specification
+        totalmask = None
+        # The fill value for the returned numpy array
+        fill_value = None
+
+        safe_missval, missing_value = self._check_safecast(
+            "missing_value", dtype, attributes
+        )
+        if safe_missval:
+            # --------------------------------------------------------
+            # Create mask from missing_value
+            # --------------------------------------------------------
+            mval = np.array(missing_value, dtype)
+            if dtype_unsigned_int is not None:
+                mval = mval.view(dtype_unsigned_int)
+
+            if not mval.ndim:
+                mval = (mval,)
+
+            for m in mval:
+                try:
+                    mvalisnan = np.isnan(m)
+                except TypeError:
+                    # isnan fails on some dtypes
+                    mvalisnan = False
+
+                if mvalisnan:
+                    mask = np.isnan(data)
+                else:
+                    mask = data == m
+
+                if mask.any():
+                    if totalmask is None:
+                        totalmask = mask
+                    else:
+                        totalmask += mask
+
+            if totalmask is not None:
+                fill_value = mval[0]
+
+        # Set mask=True for data == fill value
+        safe_fillval, _FillValue = self._check_safecast(
+            "_FillValue", dtype, attributes
+        )
+        if not safe_fillval:
+            _FillValue = self._default_FillValue(dtype)
+            safe_fillval = True
+
+        if safe_fillval:
+            # --------------------------------------------------------
+            # Create mask from _FillValue
+            # --------------------------------------------------------
+            fval = np.array(_FillValue, dtype)
+            if dtype_unsigned_int is not None:
+                fval = fval.view(dtype_unsigned_int)
+
+            if fval.ndim == 1:
+                # _FillValue must be a scalar
+                fval = fval[0]
+
+            try:
+                fvalisnan = np.isnan(fval)
+            except Exception:
+                # isnan fails on some dtypes
+                fvalisnan = False
+
+            if fvalisnan:
+                mask = np.isnan(data)
+            else:
+                mask = data == fval
+
+            if mask.any():
+                if fill_value is None:
+                    fill_value = fval
+
+                if totalmask is None:
+                    totalmask = mask
+                else:
+                    totalmask += mask
+
+        # Set mask=True for data outside [valid_min, valid_max]
+        #
+        # If valid_range exists use that, otherwise look for
+        # valid_min, valid_max. No special treatment of byte data as
+        # described in the netCDF documentation.
+        validmin = None
+        validmax = None
+        safe_validrange, valid_range = self._check_safecast(
+            "valid_range", dtype, attributes
+        )
+        safe_validmin, valid_min = self._check_safecast(
+            "valid_min", dtype, attributes
+        )
+        safe_validmax, valid_max = self._check_safecast(
+            "valid_max", dtype, attributes
+        )
+        if safe_validrange and valid_range.size == 2:
+            validmin = np.array(valid_range[0], dtype)
+            validmax = np.array(valid_range[1], dtype)
+        else:
+            if safe_validmin:
+                validmin = np.array(valid_min, dtype)
+
+            if safe_validmax:
+                validmax = np.array(valid_max, dtype)
+
+        if dtype_unsigned_int is not None:
+            if validmin is not None:
+                validmin = validmin.view(dtype_unsigned_int)
+
+            if validmax is not None:
+                validmax = validmax.view(dtype_unsigned_int)
+
+        if dtype.kind != "S":
+            # --------------------------------------------------------
+            # Create mask from valid_min, valid_max, valid_range
+            # --------------------------------------------------------
+            # Don't set validmin/validmax mask for character data
+            #
+            # Setting valid_min/valid_max to the _FillValue is too
+            # surprising for many users (despite the netCDF docs
+            # attribute best practices suggesting clients should do
+            # this).
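+            #
+            # For example (illustrative): with valid_range = [10, 20],
+            # data values of 9 or 21 are masked here, in addition to
+            # any masking already applied above from missing_value or
+            # _FillValue.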
+            if validmin is not None:
+                if validmin.ndim == 1:
+                    # valid min must be a scalar
+                    validmin = validmin[0]
+
+                mask = data < validmin
+                if totalmask is None:
+                    totalmask = mask
+                else:
+                    totalmask += mask
+
+            if validmax is not None:
+                if validmax.ndim == 1:
+                    # valid max must be a scalar
+                    validmax = validmax[0]
+
+                mask = data > validmax
+                if totalmask is None:
+                    totalmask = mask
+                else:
+                    totalmask += mask
+
+        # ------------------------------------------------------------
+        # Mask the data
+        # ------------------------------------------------------------
+        if totalmask is not None and totalmask.any():
+            data = np.ma.masked_array(
+                data, mask=totalmask, fill_value=fill_value, copy=False
+            )
+            if not data.ndim:
+                # Return a scalar numpy masked constant not a 0-d
+                # masked array, so that data == np.ma.masked.
+                data = data[()]
+        elif np.ma.isMA(data):
+            if not (self.always_masked_array or np.ma.is_masked(data)):
+                # Return a non-masked array
+                data = np.array(data, copy=False)
+        elif self.always_masked_array:
+            # Return a masked array
+            data = np.ma.masked_array(data, copy=False)
+
+        return data
+
+    def _unpack(self, data, attributes):
+        """Unpack the data.
+
+        If neither of the ``add_offset`` and ``scale_factor``
+        attributes has been set then no unpacking is done and the
+        data are returned unchanged.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            data: `numpy.ndarray`
+                The masked and (possibly) packed data.
+
+            attributes: `dict`
+                The variable attributes.
+
+        :Returns:
+
+            `numpy.ndarray`
+                The unpacked data.
+
+        """
+        scale_factor = attributes.get("scale_factor")
+        add_offset = attributes.get("add_offset")
+
+        try:
+            if scale_factor is not None:
+                scale_factor = np.array(scale_factor)
+                if scale_factor.ndim == 1:
+                    # scale_factor must be a scalar
+                    scale_factor = scale_factor[0]
+
+                float(scale_factor)
+        except ValueError:
+            logger.warning(
+                "No unpacking done: 'scale_factor' attribute "
+                f"{scale_factor!r} can't be converted to a float"
+            )  # pragma: no cover
+            return data
+
+        try:
+            if add_offset is not None:
+                add_offset = np.array(add_offset)
+                if add_offset.ndim == 1:
+                    # add_offset must be a scalar
+                    add_offset = add_offset[0]
+
+                float(add_offset)
+        except ValueError:
+            logger.warning(
+                "No unpacking done: 'add_offset' attribute "
+                f"{add_offset!r} can't be converted to a float"
+            )  # pragma: no cover
+            return data
+
+        if scale_factor is not None:
+            if add_offset is not None:
+                # scale_factor and add_offset
+                if add_offset != 0.0 or scale_factor != 1.0:
+                    data = data * scale_factor + add_offset
+                    self._copy = False
+                else:
+                    data = data.astype(np.array(scale_factor).dtype)
+            else:
+                # scale_factor with no add_offset
+                if scale_factor != 1.0:
+                    data = data * scale_factor
+                    self._copy = False
+                else:
+                    data = data.astype(scale_factor.dtype)
+        elif add_offset is not None:
+            # add_offset with no scale_factor
+            if add_offset != 0.0:
+                data = data + add_offset
+                self._copy = False
+            else:
+                data = data.astype(np.array(add_offset).dtype)
+
+        return data
+
+    @property
+    def dtype(self):
+        """The data type of the array elements.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        """
+        return self.variable.dtype
+
+    @property
+    def ndim(self):
+        """Number of dimensions in the data array.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        """
+        return self.variable.ndim
+
+    @property
+    def shape(self):
+        """Tuple of the data dimension sizes.
+
+        .. 
versionadded:: (cfdm) NEXTVERSION + + """ + return self.variable.shape + + @property + def size(self): + """Number of elements in the data array. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + return self.variable.size + + def attributes(self): + """Return the netCDF attributes for the data. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `dict` + The attributes. + + **Examples** + + >>> n.attributes() + {'standard_name': 'air_temperature', + 'missing_value': -999.0} + + """ + _attributes = self._attributes + if _attributes is not None: + return _attributes.copy() + + variable = self.variable + try: + # h5py API + attrs = dict(variable.attrs) + except AttributeError: + try: + # netCDF4 API + attrs = { + attr: variable.getncattr(attr) + for attr in variable.ncattrs() + } + except AttributeError: + # numpy API + attrs = {} + + self._attributes = attrs + return attrs + + @classmethod + def index_shape(cls, index, shape): + """Return the shape of the array subspace implied by indices. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + index: `tuple` + The indices to be applied to an array with shape + *shape*. + + shape: sequence of `int` + The shape of the array to be subspaced. + + :Returns: + + `list` + The shape of the subspace defined by the *index*. + + **Examples** + + >>> import numpy as np + >>> n.index_shape((slice(2, 5), [4]), (10, 20)) + [3, 1] + >>> n.index_shape((slice(2, 5), 4), (10, 20)) + [3] + >>> n.index_shape(([2, 3, 4], np.arange(1, 6)), (10, 20)) + [3, 5] + + >>> n.index_shape((slice(None), [True, False, True]), (10, 3)) + [10, 2] + + >>> index0 = np.arange(5) + >>> index0 = index0[index0 < 3] + >>> n.index_shape((index0, []), (10, 20)) + [3, 0] + + >>> n.index_shape((slice(1, 5, 3), [3]), (10, 20)) + [2, 1] + >>> n.index_shape((slice(5, 1, -2), 3), (10, 20)) + [2] + >>> n.index_shape((slice(5, 1, 3), [3]), (10, 20)) + [0, 1] + >>> n.index_shape((slice(1, 5, -3), 3), (10, 20)) + [0] + + """ + implied_shape = [] + for ind, full_size in zip(index, shape): + if isinstance(ind, slice): + start, stop, step = ind.indices(full_size) + if (stop - start) * step < 0: + # E.g. 5:1:3 or 1:5:-3 + size = 0 + else: + size = abs((stop - start) / step) + int_size = round(size) + if size > int_size: + size = int_size + 1 + else: + size = int_size + elif isinstance(ind, np.ndarray): + if ind.dtype == bool: + # Size is the number of True values in the array + size = int(ind.sum()) + else: + size = ind.size + + if not ind.ndim: + # Scalar array + continue + elif isinstance(ind, list): + if not ind: + size = 0 + else: + i = ind[0] + if isinstance(i, bool): + # List of bool: Size is the number of True + # values in the list + size = sum(ind) + else: + # List of int + size = len(ind) + else: + # Index is Integral + continue + + implied_shape.append(size) + + return implied_shape diff --git a/cfdm/data/numpyarray.py b/cfdm/data/numpyarray.py index 7c26af684a..114c2b0c45 100644 --- a/cfdm/data/numpyarray.py +++ b/cfdm/data/numpyarray.py @@ -1,5 +1,6 @@ from .. import core from .mixin import ArrayMixin +from .netcdfindexer import netcdf_indexer class NumpyArray(ArrayMixin, core.NumpyArray): @@ -31,9 +32,15 @@ def __getitem__(self, indices): .. 
versionadded:: (cfdm) 1.7.0
 
         """
-        return self.get_subspace(
-            self._get_component("array"), indices, copy=True
+        array = netcdf_indexer(
+            self._get_component("array"),
+            mask=False,
+            unpack=False,
+            always_masked_array=False,
+            orthogonal_indexing=True,
+            copy=True,
         )
+        return array[indices]
 
     def to_memory(self):
         """Bring data on disk into memory.
diff --git a/cfdm/data/subsampledarray.py b/cfdm/data/subsampledarray.py
index 15c68bd0a1..e40ff02495 100644
--- a/cfdm/data/subsampledarray.py
+++ b/cfdm/data/subsampledarray.py
@@ -6,6 +6,7 @@
 from ..core.utils import cached_property
 
 from .abstract import CompressedArray
+from .netcdfindexer import netcdf_indexer
 from .subarray import (
     BiLinearSubarray,
     BiQuadraticLatitudeLongitudeSubarray,
@@ -402,10 +403,15 @@ def __getitem__(self, indices):
             )
             u[u_indices] = subarray[...]
 
-        if indices is Ellipsis:
-            return u
-
-        return self.get_subspace(u, indices, copy=True)
+        u = netcdf_indexer(
+            u,
+            mask=False,
+            unpack=False,
+            always_masked_array=False,
+            orthogonal_indexing=True,
+            copy=False,
+        )
+        return u[indices]
 
     def _conformed_dependent_tie_points(self):
         """Return the dependent tie points.
diff --git a/cfdm/docstring/docstring.py b/cfdm/docstring/docstring.py
index 1e5247ced4..e42d7e2b58 100644
--- a/cfdm/docstring/docstring.py
+++ b/cfdm/docstring/docstring.py
@@ -401,6 +401,59 @@
     "{{init cell_dimension: `int`}}": """cell_dimension: `int`
                 The position of the *data* dimension that indexes
                 the cells, either ``0`` or ``1``.""",
+    # init mask
+    "{{init mask: `bool`, optional}}": """mask: `bool`, optional
+                If True (the default) then mask by convention when
+                reading data from disk.
+
+                A netCDF array is masked depending on the values of
+                any of the netCDF attributes ``_FillValue``,
+                ``missing_value``, ``_Unsigned``, ``valid_min``,
+                ``valid_max``, and ``valid_range``.""",
+    # init unpack
+    "{{init unpack: `bool`, optional}}": """unpack: `bool`, optional
+                If True (the default) then unpack by convention when
+                reading data from disk.
+
+                A netCDF array is unpacked depending on the values of
+                the netCDF attributes ``add_offset`` and
+                ``scale_factor``.""",
+    # init attributes
+    "{{init attributes: `dict` or `None`, optional}}": """attributes: `dict` or `None`, optional
+                Provide netCDF attributes for the data as a dictionary
+                of key/value pairs.""",
+    # init storage_options
+    "{{init storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional
+                Key/value pairs to be passed on to the creation of
+                `s3fs.S3FileSystem` file systems to control the
+                opening of files in S3 object stores. Ignored for
+                files not in an S3 object store, i.e. those whose
+                names do not start with ``s3:``.
+
+                By default, or if `None`, then *storage_options* is
+                taken as ``{}``.
+
+                If the ``'endpoint_url'`` key is not in
+                *storage_options* or is not in a dictionary defined by
+                the ``'client_kwargs'`` key (which is always the case
+                when *storage_options* is `None`), then one will be
+                automatically inserted for accessing an S3 file. For
+                example, for a file name of
+                ``'s3://store/data/file.nc'``, an ``'endpoint_url'``
+                key with value ``'https://store'`` would be created. 
+ + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, + the following are equivalent: ``None``, ``{}``, and + ``{'endpoint_url': 'https://store'}``, + ``{'client_kwargs': {'endpoint_url': + 'https://store'}}`` + + *Parameter example:* + ``{'key': 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}``""", # ---------------------------------------------------------------- # Method description susbstitutions (4 levels of indentataion) # ---------------------------------------------------------------- diff --git a/cfdm/functions.py b/cfdm/functions.py index 874c128677..31f00c8dc4 100644 --- a/cfdm/functions.py +++ b/cfdm/functions.py @@ -4,10 +4,7 @@ from functools import total_ordering from urllib.parse import urlparse -import cftime -import netcdf_flattener import numpy as np -import scipy from . import __cf_version__, __file__, __version__, core from .constants import CONSTANTS, ValidLogLevels @@ -317,46 +314,61 @@ def environment(display=True, paths=True): **Examples** - >>> cfdm.environment() - Platform: Linux-5.14.0-1048-oem-x86_64-with-glibc2.31 - HDF5 library: 1.12.1 - netcdf library: 4.8.1 - Python: 3.9.12 /home/user/miniconda3/bin/python - netCDF4: 1.6.0 /home/user/miniconda3/lib/python3.9/site-packages/netCDF4/__init__.py - numpy: 1.22.3 /home/user/miniconda3/lib/python3.9/site-packages/numpy/__init__.py - cfdm.core: 1.11.0.0 /home/user/miniconda3/lib/python3.9/site-packages/cfdm/core/__init__.py - scipy: 1.11.3 /home/user/miniconda3/lib/python3.11/site-packages/scipy/__init__.py - cftime: 1.6.1 /home/user/miniconda3/lib/python3.9/site-packages/cftime/__init__.py - netcdf_flattener: 1.2.0 /home/user/miniconda3/lib/python3.9/site-packages/netcdf_flattener/__init__.py - cfdm: 1.11.0.0 /home/user/miniconda3/lib/python3.9/site-packages/cfdm/__init__.py - >>> cfdm.environment(paths=False) - HDF5 library: 1.12.1 - netcdf library: 4.8.1 - Python: 3.9.12 - netCDF4: 1.6.0 - numpy: 1.22.3 - cfdm.core: 1.11.0.0 + Platform: Linux-5.15.0-92-generic-x86_64-with-glibc2.35 + Python: 3.11.4 + packaging: 23.0 + numpy: 1.25.2 + cfdm.core: NEXTVERSION + HDF5 library: 1.14.2 + netcdf library: 4.9.2 + netCDF4: 1.6.4 + h5netcdf: 1.3.0 + h5py: 3.10.0 + s3fs: 2023.12.2 + dask: 2024.7.0 scipy: 1.11.3 - cftime: 1.6.1 - netcdf_flattener: 1.2.0 - cfdm: 1.11.0.0 + cftime: 1.6.2 + cfdm: NEXTVERSION + + >>> cfdm.environment() + Platform: Linux-5.15.0-92-generic-x86_64-with-glibc2.35 + Python: 3.11.4 /home/miniconda3/bin/python + packaging: 23.0 /home/miniconda3/lib/python3.11/site-packages/packaging/__init__.py + numpy: 1.25.2 /home/miniconda3/lib/python3.11/site-packages/numpy/__init__.py + cfdm.core: NEXTVERSION /home/cfdm/cfdm/core/__init__.py + HDF5 library: 1.14.2 + netcdf library: 4.9.2 + netCDF4: 1.6.4 /home/miniconda3/lib/python3.11/site-packages/netCDF4/__init__.py + h5netcdf: 1.3.0 /home/miniconda3/lib/python3.11/site-packages/h5netcdf/__init__.py + h5py: 3.10.0 /home/miniconda3/lib/python3.11/site-packages/h5py/__init__.py + s3fs: 2023.12.2 /home/miniconda3/lib/python3.11/site-packages/s3fs/__init__.py + scipy: 1.11.3 /home/miniconda3/lib/python3.11/site-packages/scipy/__init__.py + dask: 2024.7.0 /home/miniconda3/lib/python3.11/site-packages/dask/__init__.py + cftime: 1.6.2 /home/miniconda3/lib/python3.11/site-packages/cftime/__init__.py + cfdm: NEXTVERSION /home/miniconda3/lib/python3.11/site-packages/cfdm/__init__.py """ - out = core.environment(display=False, paths=paths) # 
get all core env + import cftime + import dask + import h5netcdf + import h5py + import netCDF4 + import s3fs + import scipy - try: - netcdf_flattener_version = netcdf_flattener.__version__ - except AttributeError: - netcdf_flattener_version = "unknown version" + out = core.environment(display=False, paths=paths) # get all core env dependency_version_paths_mapping = { + "HDF5 library": (netCDF4.__hdf5libversion__, ""), + "netcdf library": (netCDF4.__netcdf4libversion__, ""), + "netCDF4": (netCDF4.__version__, os.path.abspath(netCDF4.__file__)), + "h5netcdf": (h5netcdf.__version__, os.path.abspath(h5netcdf.__file__)), + "h5py": (h5py.__version__, os.path.abspath(h5py.__file__)), + "s3fs": (s3fs.__version__, os.path.abspath(s3fs.__file__)), "scipy": (scipy.__version__, os.path.abspath(scipy.__file__)), + "dask": (dask.__version__, os.path.abspath(dask.__file__)), "cftime": (cftime.__version__, os.path.abspath(cftime.__file__)), - "netcdf_flattener": ( - netcdf_flattener_version, - os.path.abspath(netcdf_flattener.__file__), - ), "cfdm": (__version__, os.path.abspath(__file__)), } string = "{0}: {1!s}" @@ -982,9 +994,6 @@ def __str__(self): """Called by the `str` built-in function.""" return str(self.value) - # ---------------------------------------------------------------- - # Methods - # ---------------------------------------------------------------- def copy(self): """Return a deep copy. diff --git a/cfdm/mixin/propertiesdata.py b/cfdm/mixin/propertiesdata.py index 625c7d7f8d..dabf3914d3 100644 --- a/cfdm/mixin/propertiesdata.py +++ b/cfdm/mixin/propertiesdata.py @@ -100,7 +100,7 @@ def __str__(self): if units is None: isreftime = bool(self.get_property("calendar", False)) else: - isreftime = "since" in units + isreftime = "since" in str(units) if isreftime: units += " " + self.get_property("calendar", "") diff --git a/cfdm/read_write/netcdf/flatten/__init__.py b/cfdm/read_write/netcdf/flatten/__init__.py new file mode 100644 index 0000000000..82e6a3c9e6 --- /dev/null +++ b/cfdm/read_write/netcdf/flatten/__init__.py @@ -0,0 +1,16 @@ +"""Flatten NetCDF groups. + +Portions of this package were adapted from the `netcdf_flattener` +library, which carries the following Apache 2.0 License: + +Copyright (c) 2020 EUMETSAT + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. The ASF licenses this file to you +under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy +of the License at http://www.apache.org/licenses/LICENSE-2.0. + +""" + +from .flatten import netcdf_flatten diff --git a/cfdm/read_write/netcdf/flatten/config.py b/cfdm/read_write/netcdf/flatten/config.py new file mode 100644 index 0000000000..cb32eb0d81 --- /dev/null +++ b/cfdm/read_write/netcdf/flatten/config.py @@ -0,0 +1,232 @@ +"""Configuration for netCDF group flattening. + +.. versionadded:: (cfdm) NEXTVERSION + +Portions of this code were adapted from the `netcdf_flattener` +library, which carries the following Apache 2.0 License: + +Copyright (c) 2020 EUMETSAT + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. The ASF licenses this file to you +under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy +of the License at http://www.apache.org/licenses/LICENSE-2.0. 
+
+"""
+
+from dataclasses import dataclass
+
+# Maximum length of name after which it is replaced with its hash
+max_name_len = 256
+
+# Separator for groups in the input dataset
+group_separator = "/"
+
+# Replacement for 'group_separator' in flattened names
+flattener_separator = "__"
+
+# Name prefix when a reference can't be resolved. Only used if
+# 'strict=False' in `netcdf_flatten`.
+ref_not_found_error = "REF_NOT_FOUND"
+
+# NetCDF global attribute in the flattened dataset containing the
+# mapping of flattened attribute names to grouped attribute names
+flattener_attribute_map = "_flattener_attribute_map"
+
+# NetCDF global attribute in the flattened dataset containing the
+# mapping of flattened dimension names to grouped attribute names
+flattener_dimension_map = "_flattener_dimension_map"
+
+# NetCDF global attribute in the flattened dataset containing the
+# mapping of flattened variable names to grouped attribute names
+flattener_variable_map = "_flattener_variable_map"
+
+
+@dataclass()
+class FlatteningRules:
+    """Define the flattening rules for a netCDF attribute.
+
+    For a named netCDF attribute, the rules define how the contents
+    of the attribute are flattened. For instance, it has to be
+    defined that the ``ancillary_variables`` attribute contains the
+    names of other netCDF variables.
+
+    .. versionadded:: (cfdm) NEXTVERSION
+
+    """
+
+    # name: The name of the attribute containing the reference to be
+    #       flattened
+    name: str
+    # ref_to_dim: Positive integer if contains references to
+    #             dimensions. If ref_to_dim and ref_to_var are both
+    #             positive then the rule with the greater value is
+    #             tested first.
+    ref_to_dim: int = 0
+    # ref_to_var: Positive integer if contains references to
+    #             variables. If ref_to_dim and ref_to_var are both
+    #             positive then the rule with the greater value is
+    #             tested first.
+    ref_to_var: int = 0
+    # resolve_key: True if 'keys' have to be resolved in 'key1: value1
+    #              key2: value2 value3' or 'key1 key2'
+    resolve_key: bool = False
+    # resolve_value: True if 'values' have to be resolved in 'key1:
+    #                value1 key2: value2 value3'
+    resolve_value: bool = False
+    # stop_at_local_apex: True if the upward search in the hierarchy
+    #                     has to stop at the local apex.
+    stop_at_local_apex: bool = False
+    # accept_standard_names: True if any standard name is valid in
+    #                        place of references (in which case no
+    #                        exception is raised if a reference cannot
+    #                        be resolved, and the standard name is
+    #                        used in place)
+    accept_standard_names: bool = False
+    # limit_to_scalar_coordinates: True if references to variables are
+    #                              only resolved if present as well in
+    #                              the 'coordinates' attributes of the
+    #                              variable, and they are scalar.
+    
+ limit_to_scalar_coordinates: bool = False + + +# -------------------------------------------------------------------- +# Set the flattening rules for named CF attributes +# -------------------------------------------------------------------- +flattening_rules = { + attr.name: attr + for attr in ( + # ------------------------------------------------------------ + # Coordinates + # ------------------------------------------------------------ + FlatteningRules( + name="coordinates", + ref_to_var=1, + resolve_key=True, + stop_at_local_apex=True, + ), + FlatteningRules(name="bounds", ref_to_var=1, resolve_key=True), + FlatteningRules(name="climatology", ref_to_var=1, resolve_key=True), + # ------------------------------------------------------------ + # Cell methods + # ------------------------------------------------------------ + FlatteningRules( + name="cell_methods", + ref_to_dim=2, + ref_to_var=1, + resolve_key=True, + accept_standard_names=True, + limit_to_scalar_coordinates=True, + ), + # ------------------------------------------------------------ + # Cell measures + # ------------------------------------------------------------ + FlatteningRules( + name="cell_measures", ref_to_var=1, resolve_value=True + ), + # ------------------------------------------------------------ + # Coordinate references + # ------------------------------------------------------------ + FlatteningRules( + name="formula_terms", ref_to_var=1, resolve_value=True + ), + FlatteningRules( + name="grid_mapping", + ref_to_var=1, + resolve_key=True, + resolve_value=True, + ), + # ------------------------------------------------------------ + # Ancillary variables + # ------------------------------------------------------------ + FlatteningRules( + name="ancillary_variables", ref_to_var=1, resolve_key=True + ), + # ------------------------------------------------------------ + # Compression by gathering + # ------------------------------------------------------------ + FlatteningRules(name="compress", ref_to_dim=1, resolve_key=True), + # ------------------------------------------------------------ + # Discrete sampling geometries + # ------------------------------------------------------------ + FlatteningRules( + name="instance_dimension", ref_to_dim=1, resolve_key=True + ), + FlatteningRules( + name="sample_dimension", ref_to_dim=1, resolve_key=True + ), + # ------------------------------------------------------------ + # Domain variables + # ------------------------------------------------------------ + FlatteningRules(name="dimensions", ref_to_dim=1, resolve_key=True), + # ------------------------------------------------------------ + # Aggregation variables + # ------------------------------------------------------------ + FlatteningRules( + name="aggregated_dimensions", ref_to_dim=1, resolve_key=True + ), + FlatteningRules( + name="aggregated_data", ref_to_var=1, resolve_value=True + ), + # ------------------------------------------------------------ + # Cell geometries + # ------------------------------------------------------------ + FlatteningRules(name="geometry", ref_to_var=1, resolve_key=True), + FlatteningRules(name="interior_ring", ref_to_var=1, resolve_key=True), + FlatteningRules( + name="node_coordinates", ref_to_var=1, resolve_key=True + ), + FlatteningRules(name="node_count", ref_to_var=1, resolve_key=True), + FlatteningRules(name="nodes", ref_to_var=1, resolve_key=True), + FlatteningRules( + name="part_node_count", ref_to_var=1, resolve_key=True + ), + # 
------------------------------------------------------------ + # UGRID variables + # ------------------------------------------------------------ + FlatteningRules(name="mesh", ref_to_var=1, resolve_key=True), + FlatteningRules( + name="edge_coordinates", ref_to_var=1, resolve_key=True + ), + FlatteningRules( + name="face_coordinates", ref_to_var=1, resolve_key=True + ), + FlatteningRules( + name="edge_node_connectivity", ref_to_var=1, resolve_key=True + ), + FlatteningRules( + name="face_node_connectivity", ref_to_var=1, resolve_key=True + ), + FlatteningRules( + name="face_face_connectivity", ref_to_var=1, resolve_key=True + ), + FlatteningRules( + name="edge_face_connectivity", ref_to_var=1, resolve_key=True + ), + FlatteningRules( + name="face_edge_connectivity", ref_to_var=1, resolve_key=True + ), + FlatteningRules(name="edge_dimension", ref_to_dim=1, resolve_key=True), + FlatteningRules(name="face_dimension", ref_to_dim=1, resolve_key=True), + # ------------------------------------------------------------ + # Compression by coordinate subsampling + # ------------------------------------------------------------ + FlatteningRules( + name="coordinate_interpolation", + ref_to_var=1, + resolve_key=True, + resolve_value=True, + ), + FlatteningRules( + name="tie_point_mapping", + ref_to_dim=2, + ref_to_var=1, + resolve_key=True, + resolve_value=True, + ), + FlatteningRules( + name="interpolation_parameters", ref_to_var=1, resolve_value=True + ), + ) +} diff --git a/cfdm/read_write/netcdf/flatten/flatten.py b/cfdm/read_write/netcdf/flatten/flatten.py new file mode 100644 index 0000000000..1c33a4d236 --- /dev/null +++ b/cfdm/read_write/netcdf/flatten/flatten.py @@ -0,0 +1,1702 @@ +"""Portions of this code were adapted from the `netcdf_flattener` +library, which carries the following Apache 2.0 License: + +Copyright (c) 2020 EUMETSAT + +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. The ASF licenses this file to you +under the Apache License, Version 2.0 (the "License"); you may not use +this file except in compliance with the License. You may obtain a copy +of the License at http://www.apache.org/licenses/LICENSE-2.0. + +""" + +import hashlib +import logging +import re +import warnings + +from .config import ( + flattener_attribute_map, + flattener_dimension_map, + flattener_separator, + flattener_variable_map, + flattening_rules, + group_separator, + max_name_len, + ref_not_found_error, +) + +# Mapping from numpy dtype endian format to that expected by netCDF4 +_dtype_endian_lookup = { + "=": "native", + ">": "big", + "<": "little", + "|": "native", + None: "native", +} + +# Set of netCDF attributes that contain references to dimensions or +# variables +referencing_attributes = set(flattening_rules) + + +def netcdf_flatten( + input_ds, + output_ds, + strict=True, + omit_data=False, + write_chunksize=134217728, +): + """Create a flattened version of a grouped netCDF dataset. + + **CF-netCDF coordinate variables** + + When a CF-netCDF coordinate variable in the input dataset is in a + different group to its corresponding dimension, the same variable + in the output flattened dataset will no longer be a CF-netCDF + coordinate variable, as its name will be prefixed with a different + group identifier than its dimension. 
+
+    In such cases it is up to the user to apply the proximal and
+    lateral search algorithms to the flattened dataset returned by
+    `netcdf_flatten`, in conjunction with the mappings defined in the
+    newly created global attributes ``_flattener_variable_map`` and
+    ``_flattener_dimension_map``, to find which netCDF variables are
+    acting as CF coordinate variables in the flattened dataset. See
+    https://cfconventions.org/cf-conventions/cf-conventions.html#groups
+    for details.
+
+    For example, if an input dataset has dimension ``lat`` in the root
+    group and coordinate variable ``lat(lat)`` in group ``/group1``,
+    then the flattened dataset will contain dimension ``lat`` and
+    variable ``group1__lat(lat)``, both in its root group. In this
+    case, the ``_flattener_variable_map`` global attribute of the
+    flattened dataset will contain the mapping ``'group1__lat:
+    /group1/lat'``, and the ``_flattener_dimension_map`` global
+    attribute will contain the mapping ``'lat: /lat'``.
+
+    .. versionadded:: (cfdm) NEXTVERSION
+
+    :Parameters:
+
+        input_ds:
+            The dataset to be flattened, that has the same API as
+            `netCDF4.Dataset` or `h5netcdf.File`.
+
+        output_ds: `netCDF4.Dataset`
+            A container for the flattened dataset.
+
+        strict: `bool`, optional
+            If True, the default, then failing to resolve a reference
+            raises an exception. If False, a warning is issued and
+            flattening is continued.
+
+        omit_data: `bool`, optional
+            If True then do not copy the data of any variables from
+            *input_ds* to *output_ds*. This does not affect the number
+            of netCDF variables and dimensions that are written to the
+            file, nor the netCDF variables' attributes, but no
+            variable data are created on disk or in memory. The
+            resulting dataset will be smaller than it otherwise would
+            have been, and when the new dataset is accessed the data
+            of these variables will be represented by an array of all
+            missing data. If False, the default, then all data arrays
+            are copied.
+
+        write_chunksize: `int`, optional
+            When *omit_data* is False, the copying of data is done
+            piecewise to keep memory usage down. *write_chunksize* is
+            the size in bytes of how much data is copied from
+            *input_ds* to *output_ds* for each piece. Ignored if
+            *omit_data* is True.
+
+    """
+    _Flattener(
+        input_ds,
+        output_ds,
+        strict,
+        omit_data=omit_data,
+        write_chunksize=write_chunksize,
+    ).flatten()
+
+
+def parse_attribute(name, attribute):
+    """Parse a variable attribute of any form into a dict:
+
+    * 'time' -> {'time': None}
+    * 'lat lon' -> {'lat': None, 'lon': None}
+    * 'area: time volume: lat lon' -> {'area': ['time'], 'volume':
+      ['lat', 'lon']}
+
+    .. versionadded:: (cfdm) NEXTVERSION
+
+    :Parameters:
+
+        name: `str`
+            The attribute name (e.g. ``'cell_methods'``).
+
+        attribute: `str`
+            The attribute value to parse.
+
+    :Returns:
+
+        `dict`
+            The parsed string.
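+
+    **Examples**
+
+    >>> parse_attribute('coordinates', 'lat lon')
+    {'lat': None, 'lon': None}
+
+    >>> parse_attribute('cell_methods', 'area: time volume: lat lon')
+    {'area': ['time'], 'volume': ['lat', 'lon']}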
+
+    """
+
+    def subst(s):
+        """Substitute tokens for WORD and SEP."""
+        return s.replace("WORD", r"[A-Za-z0-9_#/.\(\)]+").replace(
+            "SEP", r"(\s+|$)"
+        )
+
+    # Regex for 'dict form': "k1: v1 v2 k2: v3"
+    pat_value = subst("(?P<value>WORD)SEP")
+    pat_values = "({})*".format(pat_value)
+    pat_mapping = subst(
+        "(?P<mapping_name>WORD):SEP(?P<values>{})".format(pat_values)
+    )
+    pat_mapping_list = "({})+".format(pat_mapping)
+
+    # Regex for 'list form': "v1 v2 v3" (including single-item form)
+    pat_list_item = subst("(?P<list_item>WORD)SEP")
+    pat_list = "({})+".format(pat_list_item)
+
+    # Regex for any form:
+    pat_all = subst(
+        "((?P<list>{})|(?P<mapping_list>{}))$".format(
+            pat_list, pat_mapping_list
+        )
+    )
+
+    m = re.match(pat_all, attribute)
+
+    # Output is always a dict. If the input is in list form, the dict
+    # values are set to None.
+    out = {}
+
+    if m is not None:
+        list_match = m.group("list")
+        # Parse as a list
+        if list_match:
+            for mapping in re.finditer(pat_list_item, list_match):
+                item = mapping.group("list_item")
+                out[item] = None
+
+        # Parse as a dict:
+        else:
+            mapping_list = m.group("mapping_list")
+            for mapping in re.finditer(pat_mapping, mapping_list):
+                term = mapping.group("mapping_name")
+                values = [
+                    value.group("value")
+                    for value in re.finditer(
+                        pat_value, mapping.group("values")
+                    )
+                ]
+                out[term] = values
+    else:
+        raise AttributeParsingException(
+            f"Error parsing {name!r} attribute with value {attribute!r}"
+        )
+
+    return out
+
+
+def generate_var_attr_str(d):
+    """Re-generate the attribute string from a dictionary.
+
+    .. versionadded:: (cfdm) NEXTVERSION
+
+    :Parameters:
+
+        d: `dict`
+            A resolved and parsed attribute.
+
+    :Returns:
+
+        `str`
+            The flattened attribute value.
+
+    """
+    parsed_list = []
+    for k, v in d.items():
+        if v is None:
+            parsed_list.append(k)
+        elif not v:
+            parsed_list.append(f"{k}:")
+        else:
+            parsed_list.append(f"{k}: {' '.join(v)}")
+
+    return " ".join(parsed_list)
+
+
+class _Flattener:
+    """Information and methods needed to flatten a netCDF dataset.
+
+    Contains the input file, the output file being flattened, and all
+    the logic of the flattening process.
+
+    .. versionadded:: (cfdm) NEXTVERSION
+
+    """
+
+    def __init__(
+        self,
+        input_ds,
+        output_ds,
+        strict=True,
+        omit_data=False,
+        write_chunksize=134217728,
+    ):
+        """**Initialisation**
+
+        :Parameters:
+
+            input_ds:
+                The dataset to be flattened, that has the same API as
+                `netCDF4.Dataset` or `h5netcdf.File`.
+
+            output_ds: `netCDF4.Dataset`
+                A container for the flattened dataset.
+
+            strict: `bool`, optional
+                See `netcdf_flatten`.
+
+            omit_data: `bool`, optional
+                See `netcdf_flatten`.
+
+            write_chunksize: `int`, optional
+                See `netcdf_flatten`.
+
+        """
+        self._attr_map_value = []
+        self._dim_map_value = []
+        self._var_map_value = []
+
+        self._dim_map = {}
+        self._var_map = {}
+
+        self._input_ds = input_ds
+        self._output_ds = output_ds
+        self._strict = bool(strict)
+        self._omit_data = bool(omit_data)
+        self._write_chunksize = write_chunksize
+
+        if (
+            output_ds == input_ds
+            or output_ds.filepath() == self.filepath(input_ds)
+            or output_ds.data_model != "NETCDF4"
+        ):
+            raise ValueError(
+                "Invalid inputs. Input and output datasets must be "
+                "different, and the output dataset must be of the "
+                "'NETCDF4' format."
+            )
+
+    def attrs(self, variable):
+        """Return the variable attributes.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            variable:
+                The variable, that has the same API as
+                `netCDF4.Variable` or `h5netcdf.Variable`.
+
+        :Returns:
+
+            `dict`
+                A dictionary of the attribute values keyed by their
+                names.
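+
+        **Examples**
+
+        >>> f.attrs(variable)
+        {'standard_name': 'air_temperature', 'missing_value': -999.0}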
+
+        """
+        try:
+            # h5netcdf
+            return dict(variable.attrs)
+        except AttributeError:
+            # netCDF4
+            return {
+                attr: variable.getncattr(attr) for attr in variable.ncattrs()
+            }
+
+    def chunksizes(self, variable):
+        """Return the variable chunk sizes.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            variable:
+                The variable, that has the same API as
+                `netCDF4.Variable` or `h5netcdf.Variable`.
+
+        :Returns:
+
+            `None` or sequence of `int`
+                The chunksizes, or `None` if the variable is not
+                chunked.
+
+        **Examples**
+
+        >>> f.chunksizes(variable)
+        [1, 324, 432]
+
+        >>> f.chunksizes(variable)
+        None
+
+        """
+        try:
+            # netCDF4
+            chunking = variable.chunking()
+            if chunking == "contiguous":
+                return None
+
+            return chunking
+        except AttributeError:
+            # h5netcdf
+            return variable.chunks
+
+    def contiguous(self, variable):
+        """Whether or not the variable data is contiguous on disk.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            variable:
+                The variable, that has the same API as
+                `netCDF4.Variable` or `h5netcdf.Variable`.
+
+        :Returns:
+
+            `bool`
+                `True` if the variable data is contiguous on disk,
+                otherwise `False`.
+
+        **Examples**
+
+        >>> f.contiguous(variable)
+        False
+
+        """
+        try:
+            # netCDF4
+            return variable.chunking() == "contiguous"
+        except AttributeError:
+            # h5netcdf
+            return variable.chunks is None
+
+    def dtype(self, variable):
+        """Return the data type of a variable.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            variable:
+                The variable, that has the same API as
+                `netCDF4.Variable` or `h5netcdf.Variable`.
+
+        :Returns:
+
+            `numpy.dtype` or `str`
+                The data type.
+
+        **Examples**
+
+        >>> f.dtype(variable)
+        dtype('<f8')
+
+        >>> f.dtype(variable)
+        str
+
+        """
+        out = variable.dtype
+        if out == "O":
+            out = str
+
+        return out
+
+    def endian(self, variable):
+        """Return the endian-ness of a variable.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            variable:
+                The variable, that has the same API as
+                `netCDF4.Variable` or `h5netcdf.Variable`.
+
+        :Returns:
+
+            `str`
+                The endian-ness (``'little'``, ``'big'``, or
+                ``'native'``) of the variable.
+
+        **Examples**
+
+        >>> f.endian(variable)
+        'native'
+
+        """
+        try:
+            # netCDF4
+            return variable.endian()
+        except AttributeError:
+            # h5netcdf
+            dtype = variable.dtype
+            return _dtype_endian_lookup[getattr(dtype, "byteorder", None)]
+
+    def filepath(self, dataset):
+        """Return the file path for the dataset.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            dataset:
+                The dataset, that has the same API as
+                `netCDF4.Dataset` or `h5netcdf.File`.
+
+        :Returns:
+
+            `str`
+                The file system path, or the OPeNDAP URL, for the
+                dataset.
+
+        **Examples**
+
+        >>> f.filepath(dataset)
+        '/home/data/file.nc'
+
+        """
+        try:
+            # netCDF4
+            return dataset.filepath()
+        except AttributeError:
+            # h5netcdf
+            return dataset.filename
+
+    def get_dims(self, variable):
+        """Return the dimensions associated with a variable.
+
+        .. 
versionadded:: (cfdm) NEXTVERSION
+
+        :Returns:
+
+            `list`
+
+        """
+        try:
+            # netCDF4
+            return variable.get_dims()
+        except AttributeError:
+            # h5netcdf
+            dims = {}
+            dimension_names = list(variable.dimensions)
+            group = variable._parent
+            for name, dim in group.dims.items():
+                if name in dimension_names:
+                    dims[name] = dim
+                    dimension_names.remove(name)
+
+            group = group.parent
+            while group is not None and dimension_names:
+                for name, dim in group.dims.items():
+                    if name in dimension_names:
+                        dims[name] = dim
+                        dimension_names.remove(name)
+
+                group = group.parent
+
+            return [dims[name] for name in variable.dimensions]
+
+    def getncattr(self, x, attr):
+        """Retrieve a netCDF attribute.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            x: variable, group, or dataset
+
+            attr: `str`
+                The name of the attribute.
+
+        :Returns:
+
+            The attribute value.
+
+        """
+        try:
+            # netCDF4
+            return getattr(x, attr)
+        except AttributeError:
+            # h5netcdf
+            return x.attrs[attr]
+
+    def group(self, x):
+        """Return the group that a variable belongs to.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Returns:
+
+            `Group`
+
+        """
+        try:
+            # netCDF4
+            return x.group()
+        except AttributeError:
+            # h5netcdf
+            return x._parent
+
+    def name(self, x):
+        """Return the netCDF name, without its groups.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Returns:
+
+            `str`
+
+        """
+        out = x.name
+        if group_separator in out:
+            # h5netcdf
+            out = x.name.split(group_separator)[-1]
+
+        return out
+
+    def ncattrs(self, x):
+        """Return netCDF attribute names.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            x: variable, group, or dataset
+
+        :Returns:
+
+            `list`
+
+        """
+        try:
+            # netCDF4
+            return x.ncattrs()
+        except AttributeError:
+            # h5netcdf
+            return list(x.attrs)
+
+    def parent(self, group):
+        """Return a simulated unix parent group.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Returns:
+
+            The parent group, or `None` if *group* is the root group.
+
+        """
+        try:
+            return group.parent
+        except AttributeError:
+            return
+
+    def path(self, group):
+        """Return a simulated unix directory path to a group.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Returns:
+
+            `str`
+
+        """
+        try:
+            # netCDF4
+            return group.path
+        except AttributeError:
+            # h5netcdf
+            try:
+                return group.name
+            except AttributeError:
+                return group_separator
+
+    def flatten(self):
+        """Flattens the input dataset and writes it to the output file.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Returns:
+
+            `None`
+
+        """
+        input_ds = self._input_ds
+        output_ds = self._output_ds
+
+        logging.info(f"Flattening the groups of {self.filepath(input_ds)}")
+
+        # Flatten product
+        self.process_group(input_ds)
+
+        # Add name mapping attributes
+        output_ds.setncattr(flattener_attribute_map, self._attr_map_value)
+        output_ds.setncattr(flattener_dimension_map, self._dim_map_value)
+        output_ds.setncattr(flattener_variable_map, self._var_map_value)
+
+        # Browse flattened variables to rename references:
+        logging.info(
+            "    Browsing flattened variables to rename references "
+            "in attributes"
+        )
+        for var in output_ds.variables.values():
+            self.adapt_references(var)
+
+    def process_group(self, input_group):
+        """Flattens a given group to the output file.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            input_group:
+                The group to flatten, that has the same API as
+                `netCDF4.Group` or `h5netcdf.Group`.
+
+    def process_group(self, input_group):
+        """Flattens a given group to the output file.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            input_group:
+                The group to flatten, that has the same API as
+                `netCDF4.Group` or `h5netcdf.Group`.
+
+        :Returns:
+
+            `None`
+
+        """
+        logging.info(f"    Browsing group {self.path(input_group)}")
+
+        for attr_name in self.ncattrs(input_group):
+            self.flatten_attribute(input_group, attr_name)
+
+        for dim in input_group.dimensions.values():
+            self.flatten_dimension(dim)
+
+        for var in input_group.variables.values():
+            self.flatten_variable(var)
+
+        for child_group in input_group.groups.values():
+            self.process_group(child_group)
+
+    def flatten_attribute(self, input_group, attr_name):
+        """Flattens a given attribute from a group to the output file.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            input_group:
+                The group containing the attribute to flatten, that
+                has the same API as `netCDF4.Group` or
+                `h5netcdf.Group`.
+
+            attr_name: `str`
+                The name of the attribute.
+
+        :Returns:
+
+            `None`
+
+        """
+        logging.info(
+            f"    Copying attribute {attr_name} from "
+            f"group {self.path(input_group)} to root"
+        )
+
+        # Create new name
+        new_attr_name = self.generate_flattened_name(input_group, attr_name)
+
+        # Write attribute
+        self._output_ds.setncattr(
+            new_attr_name, self.getncattr(input_group, attr_name)
+        )
+
+        # Store new naming for later and in mapping attribute
+        self._attr_map_value.append(
+            self.generate_mapping_str(input_group, attr_name, new_attr_name)
+        )
+
+    def flatten_dimension(self, dim):
+        """Flattens a given dimension to the output file.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            dim:
+                The dimension to flatten, that has the same API as
+                `netCDF4.Dimension` or `h5netcdf.Dimension`.
+
+        :Returns:
+
+            `None`
+
+        """
+        logging.info(
+            f"    Copying dimension {self.name(dim)} from "
+            f"group {self.path(self.group(dim))} to root"
+        )
+
+        # Create new name
+        new_name = self.generate_flattened_name(
+            self.group(dim), self.name(dim)
+        )
+
+        # Write dimension
+        self._output_ds.createDimension(
+            new_name, (len(dim), None)[dim.isunlimited()]
+        )
+
+        # Store new name in dict for resolving references later
+        self._dim_map[self.pathname(self.group(dim), self.name(dim))] = (
+            new_name
+        )
+
+        # Add to name mapping attribute
+        self._dim_map_value.append(
+            self.generate_mapping_str(
+                self.group(dim), self.name(dim), new_name
+            )
+        )
+
+    def flatten_variable(self, var):
+        """Flattens a given variable to the output file.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            var:
+                The variable, that has the same API as
+                `netCDF4.Variable` or `h5netcdf.Variable`.
+ + :Returns: + + `None` + + """ + logging.info( + f" Copying variable {self.name(var)} from " + f"group {self.path(self.group(var))} to root" + ) + + # Create new name + new_name = self.generate_flattened_name( + self.group(var), self.name(var) + ) + + # Replace old by new dimension names + new_dims = list( + map( + lambda x: self._dim_map[ + self.pathname(self.group(x), self.name(x)) + ], + self.get_dims(var), + ) + ) + + # Write variable + fullname = self.pathname(self.group(var), self.name(var)) + logging.info(f" Creating variable {new_name} from {fullname}") + + attributes = self.attrs(var) + + omit_data = self._omit_data + if omit_data: + fill_value = False + else: + fill_value = attributes.pop("_FillValue", None) + + new_var = self._output_ds.createVariable( + new_name, + self.dtype(var), + new_dims, + zlib=False, + complevel=4, + shuffle=True, + fletcher32=False, + contiguous=self.contiguous(var), + chunksizes=self.chunksizes(var), + endian=self.endian(var), + least_significant_digit=None, + fill_value=fill_value, + ) + + if not omit_data: + self.write_data_in_chunks(var, new_var) + + # Copy attributes + new_var.setncatts(attributes) + + # Store new name in dict for resolving references later + self._var_map[self.pathname(self.group(var), self.name(var))] = ( + new_name + ) + + # Add to name mapping attribute + self._var_map_value.append( + self.generate_mapping_str( + self.group(var), self.name(var), new_name + ) + ) + + # Resolve references in variable attributes and replace by + # absolute path + self.resolve_references(new_var, var) + + def increment_pos(self, pos, dim, copy_slice_shape, var_shape): + """Increment position. + + Increment position vector in a variable along a dimension by + the matching slice length along that dimension. If end of the + dimension is reached, recursively increment the next + dimensions until a valid position is found. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + pos: `list` + The current slice position along each dimension of the + array. + + dim: `int` + The position of the array dimension to be incremented. + + copy_slice_shape: `list` + The shape of the copy slice. + + var_shape: `tuple` + The shape of the whole variable. + + :Returns: + + `bool` + `True` if a valid position is found within the + variable, `False` otherwise. + + """ + # Try to increment dimension + pos[dim] += copy_slice_shape[dim] + + # Test new position + dim_end_reached = pos[dim] > var_shape[dim] + var_end_reached = (dim + 1) >= len(copy_slice_shape) + + # End of this dimension not reached yet + if not dim_end_reached: + return True + + # End of this dimension reached. Reset to 0 and try increment + # next one recursively + elif dim_end_reached and not var_end_reached: + pos[: dim + 1] = [0 for j in range(dim + 1)] + return self.increment_pos( + pos, dim + 1, copy_slice_shape, var_shape + ) + + else: + # End of this dimension reached, and no dimension to + # increment. Finish. + return False + + def write_data_in_chunks(self, old_var, new_var): + """Copy the data of a variable to a new one by slice. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + old_var: + The variable where the data should be copied from, + that has the same API as `netCDF4.Variable` or + `h5netcdf.Variable`. + + new_var: + The new variable in which to copy the data, that has the + same API as `netCDF4.Variable` or `h5netcdf.Variable`. 
+
+        :Returns:
+
+            `None`
+
+        """
+        ndim = old_var.ndim
+        shape = old_var.shape
+        chunk_shape = (
+            (self.write_chunksize // (old_var.dtype.itemsize * ndim)),
+        ) * ndim
+
+        logging.info(
+            f"    Copying {self.name(old_var)!r} data in chunks of "
+            f"{chunk_shape}"
+        )
+        # Initial position vector
+        pos = [0] * ndim
+
+        # Copy in slices until end reached
+        var_end_reached = False
+        while not var_end_reached:
+            # Create current slice
+            current_slice = tuple(
+                slice(pos[dim_i], min(shape[dim_i], pos[dim_i] + dim_l))
+                for dim_i, dim_l in enumerate(chunk_shape)
+            )
+
+            # Copy data in slice
+            new_var[current_slice] = old_var[current_slice]
+
+            # Get next position
+            var_end_reached = not self.increment_pos(
+                pos, 0, chunk_shape, shape
+            )
+
+    def resolve_reference(self, orig_ref, orig_var, rules):
+        """Resolve a reference.
+
+        Resolves the absolute path to a coordinate variable within the
+        group structure.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            orig_ref: `str`
+                The reference to resolve.
+
+            orig_var:
+                The original variable containing the reference, that
+                has the same API as `netCDF4.Variable` or
+                `h5netcdf.Variable`.
+
+            rules: `FlatteningRules`
+                The flattening rules that apply to the reference.
+
+        :Returns:
+
+            `str`
+                The absolute path to the reference.
+
+        """
+        ref = orig_ref
+        absolute_ref = None
+        ref_type = ""
+
+        ref_to_dim = rules.ref_to_dim
+        ref_to_var = rules.ref_to_var
+
+        # Resolve first as dim (True), or var (False)
+        resolve_dim_or_var = ref_to_dim > ref_to_var
+
+        # Resolve var (resp. dim) if resolving as dim (resp. var) failed
+        resolve_alt = ref_to_dim and ref_to_var
+
+        # Reference is already given by absolute path
+        if ref.startswith(group_separator):
+            method = "Absolute"
+            absolute_ref = ref
+
+        # Reference is given by relative path
+        elif group_separator in ref:
+            method = "Relative"
+
+            # First attempt: as dim OR var
+            if resolve_dim_or_var:
+                ref_type = "dimension"
+            else:
+                ref_type = "variable"
+
+            absolute_ref = self.search_by_relative_path(
+                orig_ref, self.group(orig_var), resolve_dim_or_var
+            )
+
+            # If that failed and an alternative is possible, try again
+            if absolute_ref is None and resolve_alt:
+                if resolve_dim_or_var:
+                    ref_type = "variable"
+                else:
+                    ref_type = "dimension"
+
+                absolute_ref = self.search_by_relative_path(
+                    orig_ref, self.group(orig_var), not resolve_dim_or_var
+                )
+
+        # Reference is to be searched by proximity
+        else:
+            method = "Proximity"
+            absolute_ref, ref_type = self.resolve_reference_proximity(
+                ref,
+                resolve_dim_or_var,
+                resolve_alt,
+                orig_var,
+                rules,
+            )
+
+        # Post-search checks and return result
+        return self.resolve_reference_post_processing(
+            absolute_ref,
+            orig_ref,
+            orig_var,
+            rules,
+            ref_type,
+            method,
+        )
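`increment_pos` and `write_data_in_chunks` together iterate an N-dimensional array in fixed-shape blocks. The same traversal can be written as a generator; a small self-contained numpy illustration (not cfdm code):

```python
import numpy as np


def block_slices(shape, block):
    """Yield tuples of slices covering 'shape' in blocks of 'block'."""
    pos = [0] * len(shape)
    while True:
        yield tuple(
            slice(p, min(s, p + b)) for p, s, b in zip(pos, shape, block)
        )
        # Increment the position, carrying into later dimensions,
        # exactly as increment_pos does
        for dim in range(len(shape)):
            pos[dim] += block[dim]
            if pos[dim] < shape[dim]:
                break
            pos[dim] = 0
        else:
            return


old = np.arange(12.0).reshape(3, 4)
new = np.empty_like(old)
for s in block_slices(old.shape, (2, 3)):
    new[s] = old[s]

assert (new == old).all()
```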
+
+    def resolve_reference_proximity(
+        self, ref, resolve_dim_or_var, resolve_alt, orig_var, rules
+    ):
+        """Resolve reference: search by proximity.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            ref: `str`
+                The reference to resolve.
+
+            resolve_dim_or_var: `bool`
+                Try to resolve first as dimension (True), or else as
+                variable (False).
+
+            resolve_alt: `bool`
+                Resolve as variable if resolving as dimension failed,
+                and vice versa.
+
+            orig_var:
+                The original variable containing the reference, that
+                has the same API as `netCDF4.Variable` or
+                `h5netcdf.Variable`.
+
+            rules: `FlatteningRules`
+                The flattening rules that apply to the reference.
+
+        :Returns:
+
+            (`str` or `None`, `str`)
+                The resolved reference (or `None` if unresolved), and
+                the type of reference (either ``'dimension'`` or
+                ``'variable'``).
+
+        """
+        # First attempt: as dim OR var
+        if resolve_dim_or_var:
+            ref_type = "dimension"
+        else:
+            ref_type = "variable"
+
+        stop_at_local_apex = rules.stop_at_local_apex
+
+        resolved_var = self.search_by_proximity(
+            ref,
+            self.group(orig_var),
+            resolve_dim_or_var,
+            False,
+            stop_at_local_apex,
+        )
+
+        # If that failed and an alternative is possible, try again
+        if resolved_var is None and resolve_alt:
+            if resolve_dim_or_var:
+                ref_type = "variable"
+            else:
+                ref_type = "dimension"
+
+            resolved_var = self.search_by_proximity(
+                ref,
+                self.group(orig_var),
+                not resolve_dim_or_var,
+                False,
+                stop_at_local_apex,
+            )
+
+        # If found, create ref string
+        if resolved_var is not None:
+            return (
+                self.pathname(
+                    self.group(resolved_var), self.name(resolved_var)
+                ),
+                ref_type,
+            )
+        else:
+            return None, ""
+
+    def resolve_reference_post_processing(
+        self, absolute_ref, orig_ref, orig_var, rules, ref_type, method
+    ):
+        """Post-processing operations after resolving reference.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            absolute_ref: `str`
+                The absolute path of the reference.
+
+            orig_ref: `str`
+                The original reference.
+
+            orig_var:
+                The original variable containing the reference, that
+                has the same API as `netCDF4.Variable` or
+                `h5netcdf.Variable`.
+
+            rules: `FlatteningRules`
+                The flattening rules that apply to the reference.
+
+            ref_type: `str`
+                The type of reference (either ``'dimension'`` or
+                ``'variable'``).
+
+            method: `str`
+                The method of reference resolution (one of
+                ``'Absolute'``, ``'Relative'``, or ``'Proximity'``).
+
+        :Returns:
+
+            `str`
+                The absolute reference.
+
+        """
+        # If not found and accept standard name, assume standard name
+        if absolute_ref is None and rules.accept_standard_names:
+            logging.info(
+                f"    Reference to {orig_ref!r} not "
+                "resolved. Assumed to be a standard name."
+            )
+            ref_type = "standard_name"
+            absolute_ref = orig_ref
+        elif absolute_ref is None:
+            # Not found, so raise exception.
+            absolute_ref = self.handle_reference_error(
+                orig_ref, self.path(self.group(orig_var))
+            )
+        else:
+            # Found
+            logging.info(
+                f"    {method} reference to {ref_type} "
+                f"{orig_ref!r} resolved as {absolute_ref!r}"
+            )
+
+        # If variable references are limited to scalar coordinate
+        # variables then do an additional check
+        if (
+            ref_type == "variable"
+            and rules.limit_to_scalar_coordinates
+            and (
+                (
+                    "coordinates" not in self.ncattrs(orig_var)
+                    or orig_ref not in self.getncattr(orig_var, "coordinates")
+                )
+                or self._input_ds[absolute_ref].ndim > 0
+            )
+        ):
+            logging.info(
+                f"    Reference to {orig_ref!r} is not a "
+                "scalar coordinate variable. Assumed to be a standard name."
+            )
+            absolute_ref = orig_ref
+
+        # Return result
+        return absolute_ref
+
+    def search_by_relative_path(self, ref, current_group, search_dim):
+        """Search by relative path.
+
+        Resolves the absolute path to a reference within the group
+        structure, using search by relative path.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            ref: `str`
+                The reference to resolve.
+
+            current_group:
+                The current group of the reference, that has the same
+                API as `netCDF4.Group` or `h5netcdf.Group`.
+
+            search_dim: `bool`
+                If True then search for a dimension, otherwise a
+                variable.
+
+        :Returns:
+
+            `str`
+                The absolute path to the variable.
+
+        """
+        # Go up parent groups
+        while ref.startswith("../"):
+            if current_group.parent is None:
+                return None
+
+            ref = ref[3:]
+            current_group = current_group.parent
+
+        # Go down child groups
+        ref_split = ref.split(group_separator)
+        for g in ref_split[:-1]:
+            try:
+                current_group = current_group.groups[g]
+            except KeyError:
+                return None
+
+        # Get variable or dimension
+        if search_dim:
+            elt = current_group.dimensions[ref_split[-1]]
+        else:
+            elt = current_group.variables[ref_split[-1]]
+
+        # Get absolute reference
+        return self.pathname(self.group(elt), self.name(elt))
+
+    def search_by_proximity(
+        self,
+        ref,
+        current_group,
+        search_dim,
+        local_apex_reached,
+        is_coordinate_variable,
+    ):
+        """Search by proximity.
+
+        Resolves a reference within the group structure, using search
+        by proximity.
+
+        First search up in the hierarchy for the reference, until the
+        root group is reached. If searching for a coordinate variable,
+        search only until the local apex is reached, then search down
+        in siblings.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            ref: `str`
+                The reference to resolve.
+
+            current_group:
+                The group in which to search.
+
+            search_dim: `bool`
+                If True then search for a dimension, otherwise a
+                variable.
+
+            local_apex_reached: `bool`
+                Whether or not the apex has previously been reached.
+
+            is_coordinate_variable: `bool`
+                Whether the search is for a coordinate variable.
+
+        :Returns:
+
+            The dimension or variable, if found, otherwise `None`.
+
+        """
+        if search_dim:
+            dims_or_vars = current_group.dimensions
+        else:
+            dims_or_vars = current_group.variables
+
+        # Found in current group
+        if ref in dims_or_vars.keys():
+            return dims_or_vars[ref]
+
+        local_apex_reached = (
+            local_apex_reached or ref in current_group.dimensions.keys()
+        )
+
+        # Check if have to continue looking in parent group
+        # - normal search: continue until root is reached
+        # - coordinate variable: continue until local apex is reached
+        if is_coordinate_variable:
+            top_reached = local_apex_reached or current_group.parent is None
+        else:
+            top_reached = current_group.parent is None
+
+        # Search up
+        if not top_reached:
+            return self.search_by_proximity(
+                ref,
+                current_group.parent,
+                search_dim,
+                local_apex_reached,
+                is_coordinate_variable,
+            )
+
+        elif is_coordinate_variable and local_apex_reached:
+            # Coordinate variable and local apex reached, so search
+            # down in siblings
+            found_elt = None
+            for child_group in current_group.groups.values():
+                found_elt = self.search_by_proximity(
+                    ref,
+                    child_group,
+                    search_dim,
+                    local_apex_reached,
+                    is_coordinate_variable,
+                )
+                if found_elt is not None:
+                    break
+
+            return found_elt
+
+        else:
+            # Did not find
+            return None
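To make the upward leg of the proximity search concrete, here is a toy rendering on a hand-built group tree (illustrative only; the real method operates on netCDF4 or h5netcdf groups and also handles the coordinate-variable descent into siblings):

```python
class Group:
    """Minimal stand-in for a netCDF group."""

    def __init__(self, name, dimensions=(), parent=None):
        self.name = name
        self.dimensions = set(dimensions)
        self.parent = parent
        self.groups = {}

    def add(self, name, dimensions=()):
        self.groups[name] = Group(name, dimensions, parent=self)
        return self.groups[name]


def find_dimension(ref, group):
    """Search upwards for 'ref', as search_by_proximity does for a
    plain (non-coordinate) dimension reference."""
    while group is not None:
        if ref in group.dimensions:
            return group
        group = group.parent
    return None


root = Group("/", dimensions={"time"})
g1 = root.add("g1", dimensions={"lat"})
g2 = g1.add("g2")

# A reference to "time" made from within /g1/g2 resolves in the root
assert find_dimension("time", g2) is root
# An unknown reference resolves to None
assert find_dimension("lon", g2) is None
```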
+
+    def resolve_references(self, var, old_var):
+        """Resolve references.
+
+        In a given variable, replace all references to other variables
+        in its attributes by absolute references.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            var:
+                The flattened variable in which references should be
+                renamed with absolute references, that has the same
+                API as `netCDF4.Variable` or `h5netcdf.Variable`.
+
+            old_var:
+                The original variable (in group structure), that has
+                the same API as `netCDF4.Variable` or
+                `h5netcdf.Variable`.
+
+        :Returns:
+
+            `None`
+
+        """
+        var_attrs = self.attrs(var)
+        for name in referencing_attributes.intersection(var_attrs):
+            # Parse attribute value
+            parsed_attribute = parse_attribute(name, var_attrs[name])
+
+            # Resolve the references in the parsed attribute, as
+            # required by the attribute's properties
+            resolved_parsed_attr = {}
+
+            rules = flattening_rules[name]
+            resolve_key = rules.resolve_key
+            resolve_value = rules.resolve_value
+
+            for k, v in parsed_attribute.items():
+                if resolve_key:
+                    k = self.resolve_reference(k, old_var, rules)
+
+                if resolve_value and v is not None:
+                    v = [self.resolve_reference(x, old_var, rules) for x in v]
+
+                resolved_parsed_attr[k] = v
+
+            # Re-generate attribute value string with resolved
+            # references
+            var.setncattr(name, generate_var_attr_str(resolved_parsed_attr))
+
+    def adapt_references(self, var):
+        """Adapt references.
+
+        In a given variable, replace all references to variables in
+        attributes by references to the new names in the flattened
+        netCDF. All references must already have been resolved as
+        absolute references.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            var:
+                The flattened variable in which references should be
+                renamed with new names, that has the same API as
+                `netCDF4.Variable` or `h5netcdf.Variable`.
+
+        :Returns:
+
+            `None`
+
+        """
+        var_attrs = self.attrs(var)
+        for name in referencing_attributes.intersection(var_attrs):
+            # Parse attribute value
+            value = var_attrs[name]
+            parsed_attribute = parse_attribute(name, value)
+
+            adapted_parsed_attr = {}
+
+            rules = flattening_rules[name]
+            resolve_key = rules.resolve_key
+            resolve_value = rules.resolve_value
+
+            for k, v in parsed_attribute.items():
+                if resolve_key:
+                    k = self.adapt_name(k, rules)
+
+                if resolve_value and v is not None:
+                    v = [self.adapt_name(x, rules) for x in v]
+
+                adapted_parsed_attr[k] = v
+
+            new_attr_value = generate_var_attr_str(adapted_parsed_attr)
+            var.setncattr(name, new_attr_value)
+
+            logging.info(
+                f"    Value of {self.name(var)}.{name} changed "
+                f"from {value!r} to {new_attr_value!r}"
+            )
+
+    def adapt_name(self, resolved_ref, rules):
+        """Adapt the name.
+
+        Return the name of a flattened reference. If not found, raise
+        an exception or continue with a warning.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            resolved_ref: `str`
+                The resolved reference.
+
+            rules: `FlatteningRules`
+                The flattening rules that apply to the reference.
+
+        :Returns:
+
+            `str`
+                The adapted reference.
+
+        """
+        # If the reference contains the error message, leave it as is
+        if ref_not_found_error in resolved_ref:
+            return resolved_ref
+
+        ref_to_dim = rules.ref_to_dim
+        ref_to_var = rules.ref_to_var
+
+        # Select highest priority map
+        if ref_to_dim > ref_to_var:
+            name_mapping = self._dim_map
+
+        if ref_to_dim < ref_to_var:
+            name_mapping = self._var_map
+
+        # Try to find mapping
+        try:
+            return name_mapping[resolved_ref]
+
+        # If not found, look in other map if allowed
+        except KeyError:
+            if ref_to_dim and ref_to_var:
+                if ref_to_dim < ref_to_var:
+                    name_mapping = self._dim_map
+                else:
+                    name_mapping = self._var_map
+
+                try:
+                    return name_mapping[resolved_ref]
+                except KeyError:
+                    pass
+
+        # If still not found, check if any standard name is allowed
+        if rules.accept_standard_names:
+            return resolved_ref
+
+        else:
+            # If not found, raise exception
+            return self.handle_reference_error(resolved_ref)
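Concretely, `adapt_references` turns the absolute paths into the flattened names recorded in `_dim_map` and `_var_map`. A toy version for a space-separated attribute, with a hand-written mapping and assuming the double-underscore flattener separator (the real code parses values with `parse_attribute` and regenerates them with `generate_var_attr_str`):

```python
# Flattened-name mapping of the kind built up in self._var_map
var_map = {
    "/g1/lat": "g1__lat",
    "/g1/lon": "g1__lon",
}

# A referencing attribute whose values have already been resolved
# to absolute paths
coordinates = "/g1/lat /g1/lon"

adapted = " ".join(var_map[ref] for ref in coordinates.split())
print(adapted)  # "g1__lat g1__lon"
```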
+
+    def pathname(self, group, name):
+        """Compose full path name to an element in a group structure.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            group:
+                The group containing the dimension or variable.
+
+            name: `str`
+                The name of the dimension or variable.
+
+        :Returns:
+
+            `str`
+                The absolute path to the dimension or variable.
+
+        """
+        if self.parent(group) is None:
+            return group_separator + name
+
+        return group_separator.join((self.path(group), name))
+
+    def generate_mapping_str(self, input_group, name, new_name):
+        """Generate string mapping.
+
+        Generates a string representing the name mapping of an element
+        before and after flattening.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            input_group:
+                The group containing the non-flattened dimension or
+                variable.
+
+            name: `str`
+                The name of the non-flattened dimension or variable.
+
+            new_name: `str`
+                The name of the flattened dimension or variable.
+
+        :Returns:
+
+            `str`
+                A string representing the name mapping for the
+                dimension or variable.
+
+        """
+        original_pathname = self.pathname(input_group, name)
+        return f"{new_name}: {original_pathname}"
+
+    def convert_path_to_valid_name(self, pathname):
+        """Generate a valid name from a path.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            pathname: `str`
+                The non-flattened namepath to a dimension or variable.
+
+        :Returns:
+
+            `str`
+                The valid netCDF name.
+
+        """
+        return pathname.replace(group_separator, "", 1).replace(
+            group_separator, flattener_separator
+        )
+
+    def generate_flattened_name(self, input_group, orig_name):
+        """Convert the full path of an element to a valid netCDF name.
+
+        * The name of an element is the concatenation of its
+          containing group and its name;
+
+        * replaces ``/`` from paths (forbidden as a netCDF name);
+
+        * if the name is longer than 255 characters, replaces the path
+          to the group by a hash;
+
+        * if the name is still too long, replaces the complete name by
+          a hash.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            input_group:
+                The group containing the dimension or variable.
+
+            orig_name: `str`
+                The original name of the dimension or variable.
+
+        :Returns:
+
+            `str`
+                The new valid name of the dimension or variable.
+
+        """
+        # If element is at root: no change
+        if self.parent(input_group) is None:
+            new_name = orig_name
+
+        # If element in child group, concatenate group path and
+        # element name
+        else:
+            full_name = (
+                self.convert_path_to_valid_name(self.path(input_group))
+                + flattener_separator
+                + orig_name
+            )
+            new_name = full_name
+
+            # If resulting name is too long, hash group path
+            if len(new_name) >= max_name_len:
+                group_hash = hashlib.sha1(
+                    self.path(input_group).encode("UTF-8")
+                ).hexdigest()
+                new_name = group_hash + flattener_separator + orig_name
+
+                # If resulting name still too long, hash everything
+                if len(new_name) >= max_name_len:
+                    new_name = hashlib.sha1(
+                        full_name.encode("UTF-8")
+                    ).hexdigest()
+
+        return new_name
+
+    def handle_reference_error(self, ref, context=None):
+        """Handle reference error.
+
+        Depending on the `_strict` mode, either raise an exception or
+        log a warning. If not strict then a reference placeholder is
+        returned.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            ref: `str`
+                The reference.
+
+            context: `str`, optional
+                Additional context information to add to the message.
+
+        :Returns:
+
+            `str`
+                The reference placeholder, or if `_strict` is `True`
+                then an `UnresolvedReferenceException` is raised.
+ + """ + message = f"Reference {ref!r} could not be resolved" + if context is not None: + message = f"{message} from {context}" + + if self._strict: + raise UnresolvedReferenceException(message) + + warnings.warn(message) + return f"{ref_not_found_error}_{ref}" + + +class AttributeParsingException(Exception): + """Exception for unparsable attribute. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + pass + + +class UnresolvedReferenceException(Exception): + """Exception for unresolvable references in attributes. + + .. versionadded:: (cfdm) NEXTVERSION + + """ + + pass diff --git a/cfdm/read_write/netcdf/netcdfread.py b/cfdm/read_write/netcdf/netcdfread.py index f419c603ae..b7bc16be52 100644 --- a/cfdm/read_write/netcdf/netcdfread.py +++ b/cfdm/read_write/netcdf/netcdfread.py @@ -9,26 +9,33 @@ from copy import deepcopy from dataclasses import dataclass, field from functools import reduce -from math import nan +from math import nan, prod from typing import Any from urllib.parse import urlparse from uuid import uuid4 +import h5netcdf import netCDF4 -import netcdf_flattener import numpy as np +from dask.base import tokenize from packaging.version import Version +from s3fs import S3FileSystem from ...decorators import _manage_log_level_via_verbosity -from ...functions import is_log_level_debug +from ...functions import is_log_level_debug, is_log_level_detail from .. import IORead +from .flatten import netcdf_flatten +from .flatten.config import ( + flattener_attribute_map, + flattener_dimension_map, + flattener_separator, + flattener_variable_map, +) logger = logging.getLogger(__name__) _cached_temporary_files = {} -_flattener_separator = netcdf_flattener._Flattener._Flattener__new_separator - @dataclass() class Mesh: @@ -449,22 +456,28 @@ def file_close(self): >>> r.file_close() """ - for nc in self.read_vars["datasets"]: + g = self.read_vars + + for nc in g["datasets"]: nc.close() # Close temporary flattened files - for flat_file in self.read_vars["flat_files"]: + for flat_file in g["flat_files"]: flat_file.close() # Close the original grouped file (v1.8.8.1) - if "nc_grouped" in self.read_vars: - self.read_vars["nc_grouped"].close() + if "nc_grouped" in g: + g["nc_grouped"].close() + + # Close s3fs.File objects + for f in g["s3fs_File_objects"]: + f.close() def file_open(self, filename, flatten=True, verbose=None): - """Open the netCDf file for reading. + """Open the netCDF file for reading. - If the file has hierarchical groups then a flattened version of it - is returned, and the original grouped file remains open. + If the file has hierarchical groups then a flattened version + of it is returned, and the original grouped file remains open. .. versionadded:: (cfdm) 1.7.0 @@ -490,16 +503,74 @@ def file_open(self, filename, flatten=True, verbose=None): >>> r.file_open('file.nc') """ - try: - nc = netCDF4.Dataset(filename, "r") - except RuntimeError as error: - raise RuntimeError(f"{error}: {filename}") + g = self.read_vars + + netcdf = False + hdf = False + netcdf_backend = g["netcdf_backend"] + + # Deal with a file in an S3 object store + u = urlparse(filename) + storage_options = self._get_storage_options(filename, u) + + if u.scheme == "s3": + # Create an openable S3 file object + fs_key = tokenize(("s3", storage_options)) + file_systems = g["file_systems"] + file_system = file_systems.get(fs_key) + if file_system is None: + # An S3 file system with these options does not exist, + # so create one. 
+                file_system = S3FileSystem(**storage_options)
+                file_systems[fs_key] = file_system
+
+            # Reset 'filename' to an s3fs.File object that can be
+            # passed to the netCDF backend
+            filename = file_system.open(u.path[1:], "rb")
+            g["s3fs_File_objects"].append(filename)
+
+            if is_log_level_detail(logger):
+                logger.detail(
+                    f"    S3: s3fs.S3FileSystem options: {storage_options}\n"
+                )  # pragma: no cover
+
+        if netcdf_backend is None:
+            try:
+                # Try opening the file with netCDF4
+                nc = self._open_netCDF4(filename)
+                netcdf = True
+            except Exception:
+                # The file could not be opened with netCDF4, so try
+                # opening it with h5netcdf instead
+                nc = self._open_h5netcdf(filename)
+                hdf = True
+
+        elif netcdf_backend == "netCDF4":
+            nc = self._open_netCDF4(filename)
+            netcdf = True
+
+        elif netcdf_backend == "h5netcdf":
+            nc = self._open_h5netcdf(filename)
+            hdf = True
+
+        else:
+            raise ValueError(f"Unknown netCDF backend: {netcdf_backend!r}")
+
+        g["original_h5netcdf"] = hdf
+        g["original_netCDF4"] = netcdf
 
         # ------------------------------------------------------------
         # If the file has a group structure then flatten it (CF>=1.8)
         # ------------------------------------------------------------
-        g = self.read_vars
-
         if flatten and nc.groups:
             # Create a diskless, non-persistent container for the
             # flattened file
@@ -517,9 +588,7 @@ def file_open(self, filename, flatten=True, verbose=None):
             flat_nc.set_fill_off()
 
             # Flatten the file
-            netcdf_flattener.flatten(
-                nc, flat_nc, lax_mode=True, _copy_data=False
-            )
+            netcdf_flatten(nc, flat_nc, strict=False, omit_data=True)
 
             # Store the original grouped file. This is primarily
             # because the unlimited dimensions in the flattened
@@ -529,12 +598,64 @@ def file_open(self, filename, flatten=True, verbose=None):
 
             nc = flat_nc
 
+            netcdf = True
+            hdf = False
+
             g["has_groups"] = True
             g["flat_files"].append(flat_file)
 
+        g["netCDF4"] = netcdf
+        g["h5netcdf"] = hdf
         g["nc"] = nc
 
         return nc
 
+    def _open_netCDF4(self, filename):
+        """Return an open `netCDF4.Dataset`.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            filename: `str`
+                The file to open.
+
+        :Returns:
+
+            `netCDF4.Dataset`
+
+        """
+        return netCDF4.Dataset(filename, "r")
+
+    def _open_h5netcdf(self, filename):
+        """Return an open `h5netcdf.File`.
+
+        Uses values of the ``rdcc_nbytes``, ``rdcc_nslots``, and
+        ``rdcc_w0`` parameters to `h5netcdf.File` that correspond
+        to the default values of the `netCDF4.set_chunk_cache`
+        parameters ``size``, ``nelems``, and ``preemption``,
+        respectively.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            filename: `str`
+                The file to open.
+
+        :Returns:
+
+            `h5netcdf.File`
+
+        """
+        return h5netcdf.File(
+            filename,
+            "r",
+            decode_vlen_strings=True,
+            rdcc_nbytes=16777216,
+            rdcc_w0=0.75,
+            rdcc_nslots=4133,
+        )
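The `rdcc_*` values passed to `h5netcdf.File` above reproduce netCDF4's default HDF5 chunk cache. For comparison, the equivalent explicit netCDF4 configuration would be (a sketch using the documented `netCDF4` module functions):

```python
import netCDF4

# netCDF4's default HDF5 chunk cache, set explicitly: 16 MiB,
# 4133 slots, preemption 0.75 - the same values passed to
# h5netcdf.File in _open_h5netcdf above
netCDF4.set_chunk_cache(16777216, 4133, 0.75)
print(netCDF4.get_chunk_cache())  # (16777216, 4133, 0.75)
```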
+
+    @classmethod
+    def cdl_to_netcdf(cls, filename):
+        """Create a temporary netCDF-4 file from a CDL text file.
@@ -605,7 +726,11 @@ def is_netcdf_file(cls, filename):
 
         """
         # Assume that URLs are in netCDF format
-        if filename.startswith("https://") or filename.startswith("http://"):
+        if (
+            filename.startswith("https://")
+            or filename.startswith("http://")
+            or filename.startswith("s3://")
+        ):
             return True
 
         # Read the magic number
@@ -720,7 +845,7 @@ def is_file(cls, filename):
         """
         # Assume that URLs are files
         u = urlparse(filename)
-        if u.scheme in ("http", "https"):
+        if u.scheme in ("http", "https", "s3"):
             return True
 
         return os.path.isfile(filename)
@@ -781,9 +906,13 @@ def read(
         _scan_only=False,
         verbose=None,
        mask=True,
+        unpack=True,
         warnings=True,
         warn_valid=False,
         domain=False,
+        storage_options=None,
+        _file_systems=None,
+        netcdf_backend=None,
     ):
         """Reads a netCDF dataset from file or OPenDAP URL.
@@ -818,6 +947,11 @@ def read(
 
             .. versionadded:: (cfdm) 1.8.2
 
+            unpack: `bool`, optional
+                See `cfdm.read` for details
+
+                .. versionadded:: (cfdm) NEXTVERSION
+
             warn_valid: `bool`, optional
                 See `cfdm.read` for details
 
@@ -828,6 +962,21 @@ def read(
 
            .. versionadded:: (cfdm) 1.9.0.0
 
+            storage_options: `dict` or `None`, optional
+                See `cfdm.read` for details
+
+                .. versionadded:: (cfdm) NEXTVERSION
+
+            netcdf_backend: `None` or `str`, optional
+                See `cfdm.read` for details
+
+                .. versionadded:: (cfdm) NEXTVERSION
+
+            _file_systems: `dict`, optional
+                Provide any already-open S3 file systems.
+
+                .. versionadded:: (cfdm) NEXTVERSION
+
         :Returns:
 
             `list`
@@ -838,6 +987,11 @@ def read(
         # Initialise netCDF read parameters
         # ------------------------------------------------------------
         self.read_vars = {
+            # --------------------------------------------------------
+            # Verbosity
+            # --------------------------------------------------------
+            "debug": is_log_level_debug(logger),
+            #
             "new_dimension_sizes": {},
             "formula_terms": {},
             "compression": {},
@@ -888,8 +1042,9 @@ def read(
             "vertical_crs": {},
             #
             "version": {},
-            # Auto mask?
+            # Auto mask and unpack?
             "mask": bool(mask),
+            "unpack": bool(unpack),
             # Warn for the presence of valid_[min|max|range]
             # attributes?
"warn_valid": bool(warn_valid), @@ -927,14 +1082,39 @@ def read( # CFA # -------------------------------------------------------- "cfa": False, + # -------------------------------------------------------- + # NetCDF backend + # -------------------------------------------------------- + "netcdf_backend": netcdf_backend, + # -------------------------------------------------------- + # S3 + # -------------------------------------------------------- + # Input file system storage options + "storage_options": storage_options, + # File system storage options for each file + "file_system_storage_options": {}, + # Cached s3fs.S3FileSystem objects + "file_systems": {}, + # Cache of open s3fs.File objects + "s3fs_File_objects": [], } g = self.read_vars + debug = g["debug"] + # Set versions for version in ("1.6", "1.7", "1.8", "1.9", "1.10", "1.11"): g["version"][version] = Version(version) + if storage_options is None: + g["storage_options"] = {} + + if _file_systems is not None: + # Update S3 file systems with those passed in as keyword + # parameter + g["file_systems"] = _file_systems + # ------------------------------------------------------------ # Add custom read vars # ------------------------------------------------------------ @@ -989,7 +1169,7 @@ def read( # ------------------------------------------------------------ nc = self.file_open(filename, flatten=True, verbose=None) logger.info(f"Reading netCDF file: {filename}\n") # pragma: no cover - if is_log_level_debug(logger): + if debug: logger.debug( f" Input netCDF dataset:\n {nc}\n" ) # pragma: no cover @@ -999,21 +1179,15 @@ def read( # 'global_attributes' dictionary # ---------------------------------------------------------------- global_attributes = {} - for attr in map(str, nc.ncattrs()): - try: - value = nc.getncattr(attr) - if isinstance(value, str): - try: - global_attributes[attr] = str(value) - except UnicodeEncodeError: - global_attributes[attr] = value.encode(errors="ignore") - else: - global_attributes[attr] = value - except UnicodeDecodeError: - pass + for attr, value in self._file_global_attributes(nc).items(): + attr = str(attr) + if isinstance(value, bytes): + value = value.decode(errors="ignore") + + global_attributes[attr] = value g["global_attributes"] = global_attributes - if is_log_level_debug(logger): + if debug: logger.debug( f" Global attributes:\n {g['global_attributes']}" ) # pragma: no cover @@ -1099,7 +1273,7 @@ def read( if has_groups: flattener_name_mapping_variables = getattr( - nc, "__flattener_name_mapping_variables", None + nc, flattener_variable_map, None ) if flattener_name_mapping_variables is not None: if isinstance(flattener_name_mapping_variables, str): @@ -1112,7 +1286,7 @@ def read( ) flattener_name_mapping_dimensions = getattr( - nc, "__flattener_name_mapping_dimensions", None + nc, flattener_dimension_map, None ) if flattener_name_mapping_dimensions is not None: if isinstance(flattener_name_mapping_dimensions, str): @@ -1131,7 +1305,7 @@ def read( flattener_dimensions[key] = value[1:] flattener_name_mapping_attributes = getattr( - nc, "__flattener_name_mapping_attributes", None + nc, flattener_attribute_map, None ) if flattener_name_mapping_attributes is not None: if isinstance(flattener_name_mapping_attributes, str): @@ -1157,22 +1331,22 @@ def read( group_attr = x[-1] flattener_attributes.setdefault(tuple(groups), {})[ group_attr - ] = nc.getncattr(flat_attr) + ] = self._file_global_attribute(nc, flat_attr) # Remove flattener attributes from the global attributes for attr in ( - 
"__flattener_name_mapping_variables", - "__flattener_name_mapping_dimensions", - "__flattener_name_mapping_attributes", + flattener_variable_map, + flattener_dimension_map, + flattener_attribute_map, ): g["global_attributes"].pop(attr, None) - for ncvar in nc.variables: + for ncvar in self._file_variables(nc): ncvar_basename = ncvar groups = () group_attributes = {} - variable = nc.variables[ncvar] + variable = self._file_variable(nc, ncvar) # -------------------------------------------------------- # Specify the group structure for each variable (CF>=1.8) @@ -1194,7 +1368,7 @@ def read( # structure that was prepended to the netCDF # variable name by the netCDF flattener. ncvar_basename = re.sub( - f"^{_flattener_separator.join(groups)}{_flattener_separator}", + f"^{flattener_separator.join(groups)}{flattener_separator}", "", ncvar_flat, ) @@ -1214,32 +1388,26 @@ def read( flattener_attributes[hierarchy] ) else: - # Remove the leading / from the absolute netCDF - # variable path + # Remove the leading / (slash) from the absolute + # netCDF variable path ncvar = ncvar[1:] flattener_variables[ncvar] = ncvar variable_grouped_dataset[ncvar] = g["nc_grouped"] variable_attributes[ncvar] = {} - for attr in map(str, variable.ncattrs()): - try: - variable_attributes[ncvar][attr] = variable.getncattr(attr) - if isinstance(variable_attributes[ncvar][attr], str): - try: - variable_attributes[ncvar][attr] = str( - variable_attributes[ncvar][attr] - ) - except UnicodeEncodeError: - variable_attributes[ncvar][attr] = ( - variable_attributes[ncvar][attr].encode( - errors="ignore" - ) - ) - except UnicodeDecodeError: - pass + for attr, value in self._file_variable_attributes( + variable + ).items(): + attr = str(attr) + if isinstance(value, bytes): + value = value.decode(errors="ignore") + + variable_attributes[ncvar][attr] = value - variable_dimensions[ncvar] = tuple(variable.dimensions) + variable_dimensions[ncvar] = tuple( + self._file_variable_dimensions(variable) + ) variable_dataset[ncvar] = nc variable_filename[ncvar] = g["filename"] variables[ncvar] = variable @@ -1248,9 +1416,9 @@ def read( variable_groups[ncvar] = groups variable_group_attributes[ncvar] = group_attributes - # Populate dimensions_groups abd dimension_basename + # Populate dimensions_groups and dimension_basename # dictionaries - for ncdim in nc.dimensions: + for ncdim in self._file_dimensions(nc): ncdim_org = ncdim ncdim_basename = ncdim groups = () @@ -1267,7 +1435,7 @@ def read( if groups: # This dimension is in a group. 
                ncdim_basename = re.sub(
-                    "^{_flattener_separator.join(groups)}{_flattener_separator}",
+                    f"^{flattener_separator.join(groups)}{flattener_separator}",
                     "",
                     ncdim_flat,
                 )
@@ -1275,9 +1443,9 @@ def read(
             dimension_groups[ncdim] = groups
             dimension_basename[ncdim] = ncdim_basename
 
-            dimension_isunlimited[ncdim] = nc.dimensions[
-                ncdim_org
-            ].isunlimited()
+            dimension_isunlimited[ncdim] = self._file_dimension_isunlimited(
+                nc, ncdim_org
+            )
 
         if has_groups:
             variable_dimensions = {
@@ -1285,7 +1453,7 @@ def read(
                 for name, value in variable_dimensions.items()
             }
 
-        if is_log_level_debug(logger):
+        if debug:
             logger.debug(
                 "    General read variables:\n"
                 "        read_vars['variable_dimensions'] =\n"
@@ -1325,7 +1493,7 @@ def read(
 
         # The netCDF dimensions of the parent file
        internal_dimension_sizes = {}
-        for name, dimension in nc.dimensions.items():
+        for name, dimension in self._file_dimensions(nc).items():
             if (
                 has_groups
                 and dimension_isunlimited[flattener_dimensions[name]]
@@ -1398,7 +1566,7 @@ def read(
             # '/forecasts/model/t': 't'}
             g["dimension_basename"] = dimension_basename
 
-        if is_log_level_debug(logger):
+        if debug:
            logger.debug(
                 "        read_vars['dimension_isunlimited'] =\n"
                 f"            {g['dimension_isunlimited']}\n"
@@ -1554,7 +1722,7 @@ def read(
             # node coordinate variable
             g["do_not_create_field"].add(geometry_ncvar)
 
-        if is_log_level_debug(logger):
+        if debug:
             logger.debug(
                 "    Compression read vars:\n"
                 "        read_vars['compression'] =\n"
@@ -1604,7 +1772,7 @@ def read(
             # location_index_set
             self._ugrid_parse_location_index_set(attributes)
 
-        if is_log_level_debug(logger):
+        if debug:
             logger.debug(f"    UGRID meshes:\n        {g['mesh']}")
 
         if _scan_only:
@@ -1688,7 +1856,7 @@ def read(
                 },
             )
 
-        if is_log_level_debug(logger):
+        if debug:
             logger.debug(
                 "    Reference read vars:\n"
                 "        read_vars['references'] =\n"
@@ -2045,8 +2213,14 @@ def _get_variables_from_external_files(self, netcdf_external_variables):
                 "\nScanning external file:\n-----------------------"
             )  # pragma: no cover
 
+            # Note: We pass in the s3 file system (if any) of the
+            #       parent file in case we can reuse it for the
+            #       external file
             external_read_vars = self.read(
-                external_file, _scan_only=True, verbose=verbose
+                external_file,
+                _scan_only=True,
+                _file_systems=read_vars["file_systems"],
+                verbose=verbose,
             )
 
             logger.info(
@@ -2284,12 +2458,13 @@ def _parse_indexed_contiguous_compression(
 
         """
         g = self.read_vars
+        debug = g["debug"]
 
         profile_dimension = g["compression"][sample_dimension][
             "ragged_contiguous"
         ]["profile_dimension"]
 
-        if is_log_level_debug(logger):
+        if debug:
             logger.debug(
                 "    Pre-processing indexed and contiguous compression "
                 f"for instance dimension: {instance_dimension}\n"
@@ -2311,6 +2486,7 @@ def _parse_indexed_contiguous_compression(
         elements_per_profile = contiguous["count_variable"]
 
         instance_dimension_size = indexed["instance_dimension_size"]
+
         element_dimension_1_size = int(profiles_per_instance.max())
         element_dimension_2_size = int(
             self.implementation.get_data_maximum(elements_per_profile)
@@ -2331,7 +2507,7 @@ def _parse_indexed_contiguous_compression(
 
         del g["compression"][sample_dimension]["ragged_contiguous"]
 
-        if is_log_level_debug(logger):
+        if debug:
             logger.debug(
                 f"    Created read_vars['compression'][{sample_dimension!r}]"
                 "['ragged_indexed_contiguous']\n"
@@ -2479,7 +2655,7 @@ def _parse_geometry(self, parent_ncvar, attributes):
         # variable in this case.
# -------------------------------------------------------- nodes_per_geometry = self.implementation.initialise_Count() - size = g["nc"].dimensions[node_dimension].size + size = self._file_dimension_size(g["nc"], node_dimension) ones = self.implementation.initialise_Data( array=np.ones((size,), dtype="int32"), copy=False ) @@ -2572,7 +2748,11 @@ def _parse_geometry(self, parent_ncvar, attributes): for cell_no in range( self.implementation.get_data_size(nodes_per_geometry) ): - n_nodes_in_this_cell = int(nodes_per_geometry_data[cell_no]) + n_nodes_in_this_cell = int( + self.implementation.get_array( + nodes_per_geometry_data[cell_no] + )[0] + ) # Initialise partial_node_count, a running count of # how many nodes there are in this geometry @@ -2580,7 +2760,9 @@ def _parse_geometry(self, parent_ncvar, attributes): for k in range(i, total_number_of_parts): index.data[k] = instance_index - n_nodes += int(parts_data[k]) + n_nodes += int( + self.implementation.get_array(parts_data[k])[0] + ) if n_nodes >= n_nodes_in_this_cell: instance_index += 1 i += k + 1 @@ -2777,7 +2959,7 @@ def _set_ragged_indexed_parameters( "element_dimension_size": element_dimension_size, } - if is_log_level_debug(logger): + if g["debug"]: logger.debug( " Created " f"read_vars['compression'][{indexed_sample_dimension!r}]['ragged_indexed']" @@ -3245,7 +3427,7 @@ def _create_field_or_domain( field_properties.update(g["variable_attributes"][field_ncvar]) - if is_log_level_debug(logger): + if g["debug"]: logger.debug( " netCDF attributes:\n" f" {field_properties}" @@ -3458,7 +3640,6 @@ def _create_field_or_domain( ) # Set unlimited status of axis - # if nc.dimensions[ncdim].isunlimited(): if g["dimension_isunlimited"][ncdim]: self.implementation.nc_set_unlimited_axis(f, axis) @@ -3484,7 +3665,6 @@ def _create_field_or_domain( # Set unlimited status of axis try: - # if nc.dimensions[ncdim].isunlimited(): if g["dimension_isunlimited"][ncdim]: self.implementation.nc_set_unlimited_axis(f, axis) except KeyError: @@ -4633,7 +4813,7 @@ def _is_char_or_string(self, ncvar): """ datatype = self.read_vars["variables"][ncvar].dtype - return datatype == str or datatype.kind in "SU" + return datatype == str or datatype.kind in "OSU" def _is_char(self, ncvar): """Return True if the netCDf variable has char datatype. @@ -5926,18 +6106,18 @@ def _create_netcdfarray( return_kwargs_only: `bool`, optional Only return the kwargs dictionary, without - instantiating a new `NetCDFArray`. + instantiating a new `NetCDF4Array` or `H5netcdfArray`. .. versionadded:: (cfdm) 1.10.0.1 :Returns: - (`NetCDFArray`, `dict`) or (`None`, `dict`) or `dict` - The new `NetCDFArray` instance and a dictionary of the - kwargs used to create it. If the array could not be - created then `None` is returned in its place. If - *return_kwargs_only* then only the dictionary is - returned. + (array, `dict`) or (`None`, `dict`) or `dict` + The new `NetCDF4Array` or `H5netcdfArray` instance and + a dictionary of the kwargs used to create it. If the + array could not be created then `None` is returned in + its place. If *return_kwargs_only* then only the + dictionary is returned. """ g = self.read_vars @@ -5957,7 +6137,7 @@ def _create_netcdfarray( return None dtype = variable.dtype - if dtype is str: + if dtype is str or dtype.kind == "O": # netCDF string types have a dtype of `str`, which needs # to be reset as a numpy.dtype, but we don't know what # without reading the data, so set it to None for now. 
@@ -5968,7 +6148,7 @@ def _create_netcdfarray( ndim = variable.ndim shape = variable.shape - size = variable.size + size = self._file_variable_size(variable) if size < 2: size = int(size) @@ -5983,40 +6163,21 @@ def _create_netcdfarray( filename = g["variable_filename"][ncvar] - # Get the units and calendar (before we overwrite ncvar) - units = g["variable_attributes"][ncvar].get("units") - calendar = g["variable_attributes"][ncvar].get("calendar") - + attributes = g["variable_attributes"][ncvar].copy() if coord_ncvar is not None: # Get the Units from the parent coordinate variable, if # they've not already been set. - if units is None: + if "units" not in attributes: units = g["variable_attributes"][coord_ncvar].get("units") + if units is not None: + attributes["units"] = units - if calendar is None: + if "calendar" not in attributes: calendar = g["variable_attributes"][coord_ncvar].get( "calendar" ) - - # Store the missing value indicators - missing_values = {} - for attr in ( - "missing_value", - "_FillValue", - "valid_min", - "valid_max", - "valid_range", - ): - value = getattr(variable, attr, None) - if value is not None: - missing_values[attr] = value - - valid_range = missing_values.get("valid_range") - if valid_range is not None: - try: - missing_values["valid_range"] = tuple(valid_range) - except TypeError: - pass + if calendar is not None: + attributes["calendar"] = calendar kwargs = { "filename": filename, @@ -6024,15 +6185,19 @@ def _create_netcdfarray( "shape": shape, "dtype": dtype, "mask": g["mask"], - "units": units, - "calendar": calendar, - "missing_values": missing_values, + "unpack": g["unpack"], + "attributes": attributes, + "storage_options": g["file_system_storage_options"].get(filename), } if return_kwargs_only: return kwargs - array = self.implementation.initialise_NetCDFArray(**kwargs) + if g["original_netCDF4"]: + array = self.implementation.initialise_NetCDF4Array(**kwargs) + else: + # h5netcdf + array = self.implementation.initialise_H5netcdfArray(**kwargs) return array, kwargs @@ -6087,8 +6252,10 @@ def _create_data( return None filename = kwargs["filename"] - units = kwargs["units"] - calendar = kwargs["calendar"] + + attributes = kwargs["attributes"] + units = attributes.get("units") + calendar = attributes.get("calendar") compression = g["compression"] @@ -7131,7 +7298,7 @@ def _create_gathered_array( :Parameters: - gathered_array: `NetCDFArray` + gathered_array: `NetCDF4Array` or `H5netcdfArray` compressed_dimensions: sequence of `int` The position of the compressed dimension in the @@ -7182,14 +7349,9 @@ def _create_ragged_contiguous_array( `RaggedContiguousArray` """ - # uncompressed_ndim = len(uncompressed_shape) - # uncompressed_size = int(reduce(operator.mul, uncompressed_shape, 1)) - return self.implementation.initialise_RaggedContiguousArray( compressed_array=ragged_contiguous_array, - # ndim=uncompressed_ndim, shape=uncompressed_shape, - # size=uncompressed_size, count_variable=count_variable, ) @@ -7208,14 +7370,9 @@ def _create_ragged_indexed_array( `RaggedIndexedArray` """ - # uncompressed_ndim = len(uncompressed_shape) - # uncompressed_size = int(reduce(operator.mul, uncompressed_shape, 1)) - return self.implementation.initialise_RaggedIndexedArray( compressed_array=ragged_indexed_array, - # ndim=uncompressed_ndim, shape=uncompressed_shape, - # size=uncompressed_size, index_variable=index_variable, ) @@ -7235,14 +7392,9 @@ def _create_ragged_indexed_contiguous_array( `RaggedIndexedContiguousArray` """ - # uncompressed_ndim = 
len(uncompressed_shape) - # uncompressed_size = int(reduce(operator.mul, uncompressed_shape, 1)) - return self.implementation.initialise_RaggedIndexedContiguousArray( compressed_array=ragged_indexed_contiguous_array, - # ndim=uncompressed_ndim, shape=uncompressed_shape, - # size=uncompressed_size, count_variable=count_variable, index_variable=index_variable, ) @@ -7748,7 +7900,7 @@ def _check_ancillary_variables(self, field_ncvar, string, parsed_string): # Though an error of sorts, set as debug level message; # read not terminated - if is_log_level_debug(logger): + if g["debug"]: logger.debug( f" Error processing netCDF variable {field_ncvar}: " f"{d['reason']}" @@ -9121,10 +9273,11 @@ def _ugrid_create_domain_topology(self, parent_ncvar, f, mesh, location): copy=False, **{connectivity_attr: indices}, ) + attributes = kwargs["attributes"] data = self._create_Data( array, - units=kwargs["units"], - calendar=kwargs["calendar"], + units=attributes.get("units"), + calendar=attributes.get("calendar"), ncvar=connectivity_ncvar, ) else: @@ -9226,10 +9379,11 @@ def _ugrid_create_cell_connectivities( cell_dimension=cell_dimension, copy=False, ) + attributes = kwargs["attributes"] data = self._create_Data( array, - units=kwargs["units"], - calendar=kwargs["calendar"], + units=attributes.get("units"), + calendar=attributes.get("calendar"), ncvar=connectivity_ncvar, ) @@ -9883,3 +10037,273 @@ def _ugrid_check_connectivity_variable( ok = False return ok + + def _file_global_attribute(self, nc, attr): + """Return a global attribute from a dataset. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + attr: `str` + The global attribute name. + + :Returns: + + The global attribute value. + + """ + try: + # netCDF4 + return nc.getncattr(attr) + except AttributeError: + # h5netcdf + return nc.attrs[attr] + + def _file_global_attributes(self, nc): + """Return the global attributes from a dataset. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + :Returns: + + `dict`-like + A dictionary of the attribute values keyed by their + names. + + """ + try: + # h5netcdf + return nc.attrs + except AttributeError: + # netCDF4 + return {attr: nc.getncattr(attr) for attr in nc.ncattrs()} + + def _file_dimensions(self, nc): + """Return all dimensions in the root group. + + .. versionadded:: (cfdm) NEXTVERSION + + :Returns: + + `dict`-like + A dictionary of the dimensions keyed by their names. + + """ + return nc.dimensions + + def _file_dimension(self, nc, dim_name): + """Return a dimension from the root group of a dataset. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + dim_name: `str` + The dimension name. + + :Returns: + + `netCDF.Dimension` or `h5netcdf.Dimension` + The dimension. + + """ + return self._file_dimensions(nc)[dim_name] + + def _file_dimension_isunlimited(self, nc, dim_name): + """Return whether a dimension is unlimited. + + .. versionadded:: (cfdm) NEXTVERSION + + :Parameters: + + nc: `netCDF4.Dataset` or `h5netcdf.File` + The dataset. + + dim_name: `str` + The dimension name. + + :Returns: + + `bool` + Whether the dimension is unlimited. + + """ + return self._file_dimension(nc, dim_name).isunlimited() + + def _file_dimension_size(self, nc, dim_name): + """Return a dimension's size. + + .. 
versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            nc: `netCDF4.Dataset` or `h5netcdf.File`
+                The dataset.
+
+            dim_name: `str`
+                The dimension name.
+
+        :Returns:
+
+            `int`
+                The dimension size.
+
+        """
+        return self._file_dimension(nc, dim_name).size
+
+    def _file_variables(self, nc):
+        """Return all variables in the root group.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            nc: `netCDF4.Dataset` or `h5netcdf.File`
+                The dataset.
+
+        :Returns:
+
+            `dict`-like
+                A dictionary of the variables keyed by their names.
+
+        """
+        return nc.variables
+
+    def _file_variable(self, nc, var_name):
+        """Return a variable.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            nc: `netCDF4.Dataset` or `h5netcdf.File`
+                The dataset.
+
+            var_name: `str`
+                The variable name.
+
+        :Returns:
+
+            `netCDF4.Variable` or `h5netcdf.Variable`
+                The variable.
+
+        """
+        return self._file_variables(nc)[var_name]
+
+    def _file_variable_attributes(self, var):
+        """Return the variable attributes.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            var: `netCDF4.Variable` or `h5netcdf.Variable`
+                The variable.
+
+        :Returns:
+
+            `dict`
+                A dictionary of the attribute values keyed by their
+                names.
+
+        """
+        try:
+            # h5netcdf
+            return dict(var.attrs)
+        except AttributeError:
+            # netCDF4
+            return {attr: var.getncattr(attr) for attr in var.ncattrs()}
+
+    def _file_variable_dimensions(self, var):
+        """Return the variable dimension names.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            var: `netCDF4.Variable` or `h5netcdf.Variable`
+                The variable.
+
+        :Returns:
+
+            `tuple` of `str`
+                The dimension names.
+
+        """
+        return var.dimensions
+
+    def _file_variable_size(self, var):
+        """Return the size of a variable's array.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            var: `netCDF4.Variable` or `h5netcdf.Variable`
+                The variable.
+
+        :Returns:
+
+            `int`
+                The array size.
+
+        """
+        # Use try/except here because the variable type could differ
+        # from that implied by the value of self.read_vars["netCDF4"]
+        try:
+            # netCDF4
+            return var.size
+        except AttributeError:
+            # h5netcdf
+            return prod(var.shape)
+
+    def _get_storage_options(self, filename, parsed_filename):
+        """Get the storage options for accessing a file.
+
+        The returned storage options will always include an
+        ``'endpoint_url'`` key, either at the top level or within
+        ``'client_kwargs'``.
+
+        .. versionadded:: (cfdm) NEXTVERSION
+
+        :Parameters:
+
+            filename: `str`
+                The name of the file.
+
+            parsed_filename: `urllib.parse.ParseResult`
+                The parsed file name, as returned by
+                ``urllib.parse.urlparse(filename)``.
+
+        :Returns:
+
+            `dict`
+                The storage options for accessing the file.
+
+        """
+        g = self.read_vars
+        storage_options = g["storage_options"].copy()
+
+        client_kwargs = storage_options.get("client_kwargs", {})
+        if (
+            "endpoint_url" not in storage_options
+            and "endpoint_url" not in client_kwargs
+        ):
+            storage_options["endpoint_url"] = (
+                f"https://{parsed_filename.netloc}"
+            )
+
+        g["file_system_storage_options"].setdefault(filename, storage_options)
+
+        return storage_options
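Putting the new keywords together, typical `cfdm.read` calls look like this (a hedged sketch; the bucket and file names are placeholders):

```python
import cfdm

# Force the h5netcdf backend instead of trying netCDF4 first
fields = cfdm.read("file.nc", netcdf_backend="h5netcdf")

# Read from an S3 object store.  With storage_options=None an
# endpoint_url of "https://store" is derived from the file name,
# as described in the read() docstring below
fields = cfdm.read("s3://store/data/file.nc")

# Equivalent, with the endpoint given explicitly
fields = cfdm.read(
    "s3://store/data/file.nc",
    storage_options={"endpoint_url": "https://store"},
)
```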
diff --git a/cfdm/read_write/read.py b/cfdm/read_write/read.py
index b963458789..331b22df60 100644
--- a/cfdm/read_write/read.py
+++ b/cfdm/read_write/read.py
@@ -16,18 +16,22 @@ def read(
     warnings=False,
     warn_valid=False,
     mask=True,
+    unpack=True,
     domain=False,
+    netcdf_backend=None,
+    storage_options=None,
     _implementation=_implementation,
 ):
     """Read field or domain constructs from a dataset.
 
-    The dataset may be a netCDF file on disk or on an OPeNDAP server,
-    or a CDL file on disk (see below).
+    The following file formats are supported: netCDF and CDL.
+
+    NetCDF files may be on local disk, on an OPeNDAP server, or in an
+    S3 object store.
 
     The returned constructs are sorted by the netCDF variable names of
     their corresponding data or domain variables.
 
-
     **CDL files**
 
     A file is considered to be a CDL representation of a netCDF
@@ -229,10 +233,10 @@ def read(
         If True (the default) then mask by convention the data of
         field and metadata constructs.
 
-        The masking by convention of a netCDF array depends on the
-        values of any of the netCDF variable attributes
-        ``_FillValue``, ``missing_value``, ``valid_min``,
-        ``valid_max`` and ``valid_range``.
+        A netCDF array is masked depending on the values of any of
+        the netCDF attributes ``_FillValue``, ``missing_value``,
+        ``_Unsigned``, ``valid_min``, ``valid_max``, and
+        ``valid_range``.
 
         See https://ncas-cms.github.io/cfdm/tutorial.html#data-mask
 
@@ -240,6 +244,16 @@ def read(
 
         .. versionadded:: (cfdm) 1.8.2
 
+        unpack: `bool`
+            If True, the default, then unpack arrays by convention
+            when the data is read from disk.
+
+            Unpacking is determined by netCDF conventions for the
+            following variable attributes: ``add_offset``,
+            ``scale_factor``, and ``_Unsigned``.
+
+            .. versionadded:: (cfdm) NEXTVERSION
+
         domain: `bool`, optional
             If True then return only the domain constructs that are
             explicitly defined by CF-netCDF domain variables, ignoring
@@ -262,6 +276,58 @@ def read(
 
            .. versionadded:: (cfdm) 1.9.0.0
 
+        netcdf_backend: `None` or `str`, optional
+            Specify which library to use for opening and reading
+            netCDF files. By default, or if `None`, then the first one
+            of `netCDF4` and `h5netcdf` to successfully open the
+            netCDF file is used. Setting *netcdf_backend* to either
+            ``'netCDF4'`` or ``'h5netcdf'`` will force the use of
+            that library.
+
+            .. versionadded:: (cfdm) NEXTVERSION
+
+        storage_options: `dict` or `None`, optional
+            Pass parameters to the backend file system driver, such as
+            username, password, server, port, etc. How the storage
+            options are interpreted depends on the location of the
+            file:
+
+            * **Local File System**: Storage options are ignored for
+              local files.
+
+            * **HTTP(S)**: Storage options are ignored for files
+              available across the network via OPeNDAP.
+
+            * **S3-compatible services**: The backend used is `s3fs`,
+              and the storage options are used to initialise an
+              `s3fs.S3FileSystem` file system object. By default, or
+              if `None`, then *storage_options* is taken as ``{}``.
+
+              If the ``'endpoint_url'`` key is not in
+              *storage_options*, nor in a dictionary defined by the
+              ``'client_kwargs'`` key (both of which are true when
+              *storage_options* is `None`), then one will be
+              automatically inserted for accessing an S3 file. For
+              instance, with a file name of
+              ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key
+              with value ``'https://store'`` would be created. To
+              disable this, set the ``'endpoint_url'`` key to `None`.
+
+            *Parameter example:*
+              For a file name of ``'s3://store/data/file.nc'``, the
+              following are equivalent: ``None``, ``{}``,
+              ``{'endpoint_url': 'https://store'}``, and
+              ``{'client_kwargs': {'endpoint_url':
+              'https://store'}}``
+
+            *Parameter example:*
+              ``{'key': 'scaleway-api-key...', 'secret':
+              'scaleway-secretkey...', 'endpoint_url':
+              'https://s3.fr-par.scw.cloud', 'client_kwargs':
+              {'region_name': 'fr-par'}}``
+
+            .. 
versionadded:: (cfdm) NEXTVERSION + _implementation: (subclass of) `CFDMImplementation`, optional Define the CF data model implementation that provides the returned field constructs. @@ -321,9 +387,10 @@ def read( filename = netcdf.cdl_to_netcdf(filename) if netcdf.is_netcdf_file(filename): - # See https://github.com/NCAS-CMS/cfdm/issues/128 for context on the - # try/except here, which acts as a temporary fix pending decisions on - # the best way to handle CDL with only header or coordinate info. + # See https://github.com/NCAS-CMS/cfdm/issues/128 for context + # on the try/except here, which acts as a temporary fix + # pending decisions on the best way to handle CDL with only + # header or coordinate info. try: fields = netcdf.read( filename, @@ -333,7 +400,10 @@ def read( warnings=warnings, warn_valid=warn_valid, mask=mask, + unpack=unpack, domain=domain, + storage_options=storage_options, + netcdf_backend=netcdf_backend, extra_read_vars=None, ) except MaskError: diff --git a/cfdm/read_write/write.py b/cfdm/read_write/write.py index 23ed36c41f..ff5d1eea31 100644 --- a/cfdm/read_write/write.py +++ b/cfdm/read_write/write.py @@ -502,7 +502,7 @@ def write( variables' attributes, but does not create data on disk for the requested variables. The resulting file will be smaller than it otherwise would have been, and when the - new file is read then the data of these variables will be + new file is read the data of these variables will be represented by an array of all missing data. The *omit_data* parameter may be one, or a sequence, of: diff --git a/cfdm/test/test_NetCDF4Array.py b/cfdm/test/test_NetCDF4Array.py new file mode 100644 index 0000000000..bc42e587ff --- /dev/null +++ b/cfdm/test/test_NetCDF4Array.py @@ -0,0 +1,196 @@ +import atexit +import datetime +import faulthandler +import os +import tempfile +import unittest +from urllib.parse import urlparse + +faulthandler.enable() # to debug seg faults and timeouts + +import numpy as np + +import cfdm + +n_tmpfiles = 1 +tmpfiles = [ + tempfile.mkstemp("_test_netCDF.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +(tmpfile,) = tmpfiles + + +def _remove_tmpfiles(): + """Remove temporary files created during tests.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + +atexit.register(_remove_tmpfiles) + + +class NetCDF4ArrayTest(unittest.TestCase): + """Unit test for the NetCDF4Array class.""" + + def setUp(self): + """Preparations called immediately before each test method.""" + # Disable log messages to silence expected warnings + cfdm.log_level("DISABLE") + # Note: to enable all messages for given methods, lines or + # calls (those without a 'verbose' option to do the same) + # e.g. to debug them, wrap them (for methods, start-to-end + # internally) as follows: + # + # cfdm.LOG_LEVEL('DEBUG') + # < ... test code ... 
> + # cfdm.log_level('DISABLE') + + def test_NetCDF4Array_get_addresses(self): + """Test `NetCDF4Array.get_addresses`""" + a = cfdm.NetCDF4Array(address="tas") + self.assertEqual(a.get_addresses(), ("tas",)) + + a = cfdm.NetCDF4Array(address=("tas1", "tas1")) + self.assertEqual(a.get_addresses(), ("tas1", "tas1")) + + a = cfdm.NetCDF4Array() + self.assertEqual(a.get_addresses(), ()) + + def test_NetCDF4Array_get_filenames(self): + """Test `NetCDF4Array.get_filenames`""" + a = cfdm.NetCDF4Array("/data1/file1") + self.assertEqual(a.get_filenames(), ("/data1/file1",)) + + a = cfdm.NetCDF4Array(("/data1/file1",)) + self.assertEqual(a.get_filenames(), ("/data1/file1",)) + + a = cfdm.NetCDF4Array(("/data1/file1", "/data2/file2")) + self.assertEqual(a.get_filenames(), ("/data1/file1", "/data2/file2")) + + a = cfdm.NetCDF4Array() + self.assertEqual(a.get_filenames(), ()) + + def test_NetCDF4Array_mask(self): + """Test NetCDF4Array masking.""" + f = cfdm.example_field(0) + f.data[0] = np.ma.masked + cfdm.write(f, tmpfile) + array = f.array + + n = cfdm.NetCDF4Array(tmpfile, f.nc_get_variable(), shape=f.shape) + self.assertTrue(n.get_mask()) + n = n[...] + self.assertTrue((array.mask == n.mask).all()) + + n = cfdm.NetCDF4Array( + tmpfile, f.nc_get_variable(), shape=f.shape, mask=False + ) + self.assertFalse(n.get_mask()) + n = n[...] + self.assertEqual(np.ma.count(n), n.size) + + def test_NetCDF4Array_unpack(self): + """Test NetCDF4Array unpacking.""" + add_offset = 10.0 + scale_factor = 3.14 + + f = cfdm.example_field(0) + f.data[0] = np.ma.masked + array0 = f.array + array1 = (array0 - add_offset) / scale_factor + + f.set_property("add_offset", add_offset) + f.set_property("scale_factor", scale_factor) + cfdm.write(f, tmpfile) + + n = cfdm.NetCDF4Array(tmpfile, f.nc_get_variable(), shape=f.shape) + self.assertTrue(n.get_unpack()) + n = n[...] + self.assertTrue((n.mask == array0.mask).all()) + self.assertTrue(np.ma.allclose(n, array0)) + + n = cfdm.NetCDF4Array( + tmpfile, f.nc_get_variable(), shape=f.shape, unpack=False + ) + self.assertFalse(n.get_unpack()) + n = n[...] 
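+        # An illustrative note, not part of the original test: netCDF
+        # unpacking computes unpacked = scale_factor * packed + add_offset,
+        # so reading with unpack=False returns the raw packed values,
+        # which here equal array1 = (array0 - add_offset) / scale_factor.
+        # For example, an unpacked value of 16.28 is stored on disk as
+        # (16.28 - 10.0) / 3.14 == 2.0.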
+ self.assertTrue((n.mask == array1.mask).all()) + self.assertTrue((n == array1).all()) + + def test_NetCDF4Array_get_storage_options(self): + """Test NetCDF4Array get_storage_options.""" + n = cfdm.NetCDF4Array(filename="filename.nc") + self.assertEqual(n.get_storage_options(), {}) + + n = cfdm.NetCDF4Array( + filename="filename.nc", storage_options={"anon": True} + ) + self.assertEqual(n.get_storage_options(), {"anon": True}) + + n = cfdm.NetCDF4Array(filename="s3://store/filename.nc") + self.assertEqual( + n.get_storage_options(), {"endpoint_url": "https://store"} + ) + self.assertEqual(n.get_storage_options(create_endpoint_url=False), {}) + + n = cfdm.NetCDF4Array( + filename="s3://store/filename.nc", storage_options={"anon": True} + ) + self.assertEqual( + n.get_storage_options(), + {"anon": True, "endpoint_url": "https://store"}, + ) + self.assertEqual( + n.get_storage_options(create_endpoint_url=False), {"anon": True} + ) + other_file = "s3://other/file.nc" + self.assertEqual( + n.get_storage_options(filename=other_file), + {"anon": True, "endpoint_url": "https://other"}, + ) + self.assertEqual( + n.get_storage_options(parsed_filename=urlparse(other_file)), + {"anon": True, "endpoint_url": "https://other"}, + ) + + n = cfdm.NetCDF4Array( + filename="s3://store/filename.nc", + storage_options={"anon": True, "endpoint_url": None}, + ) + self.assertEqual( + n.get_storage_options(), + {"anon": True, "endpoint_url": None}, + ) + + def test_NetCDF4Array_get_attributes(self): + """Test NetCDF4Array get_attributes.""" + f = cfdm.example_field(0) + cfdm.write(f, tmpfile) + n = cfdm.NetCDF4Array(tmpfile, f.nc_get_variable(), shape=f.shape) + self.assertIsNone(n.get_attributes(None)) + + with self.assertRaises(ValueError): + n.get_attributes() + + # Set attributes via indexing + _ = n[...] + self.assertEqual( + n.get_attributes(), + { + "cell_methods": "area: mean", + "coordinates": "time", + "project": "research", + "standard_name": "specific_humidity", + "units": "1", + }, + ) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cfdm.environment() + print("") + unittest.main(verbosity=2) diff --git a/cfdm/test/test_NetCDFArray.py b/cfdm/test/test_NetCDFArray.py deleted file mode 100644 index 5ff1b69b3f..0000000000 --- a/cfdm/test/test_NetCDFArray.py +++ /dev/null @@ -1,103 +0,0 @@ -import atexit -import datetime -import faulthandler -import os -import tempfile -import unittest - -faulthandler.enable() # to debug seg faults and timeouts - -import cfdm - -n_tmpfiles = 1 -tmpfiles = [ - tempfile.mkstemp("_test_netCDF.nc", dir=os.getcwd())[1] - for i in range(n_tmpfiles) -] -(tmpfile,) = tmpfiles - - -def _remove_tmpfiles(): - """Remove temporary files created during tests.""" - for f in tmpfiles: - try: - os.remove(f) - except OSError: - pass - - -atexit.register(_remove_tmpfiles) - - -class NetCDFTest(unittest.TestCase): - """Unit test for the NetCDF class.""" - - def setUp(self): - """Preparations called immediately before each test method.""" - # Disable log messages to silence expected warnings - cfdm.log_level("DISABLE") - # Note: to enable all messages for given methods, lines or - # calls (those without a 'verbose' option to do the same) - # e.g. to debug them, wrap them (for methods, start-to-end - # internally) as follows: - # - # cfdm.LOG_LEVEL('DEBUG') - # < ... test code ... 
> - # cfdm.log_level('DISABLE') - - def test_NetCDFArray_get_addresses(self): - """Test `NetCDFArray.get_addresses`""" - a = cfdm.NetCDFArray(address="tas") - self.assertEqual(a.get_addresses(), ("tas",)) - - a = cfdm.NetCDFArray(address=("tas1", "tas1")) - self.assertEqual(a.get_addresses(), ("tas1", "tas1")) - - a = cfdm.NetCDFArray() - self.assertEqual(a.get_addresses(), ()) - - def test_NetCDFArray_get_filenames(self): - """Test `NetCDFArray.get_filenames`""" - a = cfdm.NetCDFArray("/data1/file1") - self.assertEqual(a.get_filenames(), ("/data1/file1",)) - - a = cfdm.NetCDFArray(("/data1/file1",)) - self.assertEqual(a.get_filenames(), ("/data1/file1",)) - - a = cfdm.NetCDFArray(("/data1/file1", "/data2/file2")) - self.assertEqual(a.get_filenames(), ("/data1/file1", "/data2/file2")) - - a = cfdm.NetCDFArray() - self.assertEqual(a.get_filenames(), ()) - - def test_NetCDFArray_get_missing_values(self): - """Test NetCDFArray.get_missing_values.""" - f = cfdm.example_field(0) - - f.set_property("missing_value", -999) - f.set_property("_FillValue", -3) - f.set_property("valid_range", [-111, 222]) - cfdm.write(f, tmpfile) - - g = cfdm.read(tmpfile)[0] - self.assertEqual( - g.data.source().get_missing_values(), - { - "missing_value": -999.0, - "_FillValue": -3, - "valid_range": (-111, 222), - }, - ) - - c = g.coordinate("latitude") - self.assertEqual(c.data.source().get_missing_values(), {}) - - a = cfdm.NetCDFArray("file.nc", "ncvar") - self.assertIsNone(a.get_missing_values()) - - -if __name__ == "__main__": - print("Run date:", datetime.datetime.now()) - cfdm.environment() - print("") - unittest.main(verbosity=2) diff --git a/cfdm/test/test_groups.py b/cfdm/test/test_groups.py index c8323bf967..17772c4268 100644 --- a/cfdm/test/test_groups.py +++ b/cfdm/test/test_groups.py @@ -46,6 +46,30 @@ def _remove_tmpfiles(): class GroupsTest(unittest.TestCase): """Test treatment of netCDF4 files with hierarchical groups.""" + def _check_h5netcdf_groups(self, h5, nc): + """Check that an h5netcdf read gives same results as netCDF4. + + :Parameters: + + h5: `Field` + + nc: `Field` + + :Returns: + + `None` + + """ + self.assertTrue(h5.equals(nc, verbose=3)) + self.assertEqual(h5.nc_variable_groups(), nc.nc_variable_groups()) + for key, ch5 in h5.constructs.items(): + if hasattr(ch5, "nc_variable_groups"): + self.assertEqual( + ch5.nc_variable_groups(), + nc.constructs[key].nc_variable_groups(), + key, + ) + def setUp(self): """Preparations called immediately before each test method.""" # Disable log messages to silence expected warnings @@ -65,6 +89,7 @@ def test_groups(self): ungrouped_file = ungrouped_file1 grouped_file = grouped_file1 + # grouped_file = "delme_grouped.nc" # Add a second grid mapping datum = cfdm.Datum(parameters={"earth_radius": 7000000}) @@ -103,9 +128,12 @@ def test_groups(self): ) nc.close() + grouped_file = grouped_file1 + h = cfdm.read(grouped_file, verbose=1) self.assertEqual(len(h), 1, repr(h)) - self.assertTrue(f.equals(h[0], verbose=2)) + h = h[0] + self.assertTrue(f.equals(h)) # ------------------------------------------------------------ # Move constructs one by one to the /forecast group. 
The order @@ -135,7 +163,7 @@ def test_groups(self): # Check that the field construct hasn't changed h = cfdm.read(grouped_file, verbose=1) - self.assertEqual(len(h), 1, repr(h)) + self.assertEqual(len(h), 1) self.assertTrue(f.equals(h[0], verbose=2), name) # ------------------------------------------------------------ @@ -152,9 +180,17 @@ def test_groups(self): ) nc.close() - h = cfdm.read(grouped_file, verbose="WARNING") - self.assertEqual(len(h), 1, repr(h)) - self.assertTrue(f.equals(h[0], verbose=2)) + h = cfdm.read( + grouped_file, netcdf_backend="netCDF4", verbose="WARNING" + ) + self.assertEqual(len(h), 1) + h = h[0] + self.assertTrue(f.equals(h, verbose=2)) + + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) def test_groups_geometry(self): """Test that geometries are considered in the correct groups.""" @@ -281,7 +317,13 @@ def test_groups_geometry(self): # Check that the field construct hasn't changed h = cfdm.read(grouped_file, verbose=1) self.assertEqual(len(h), 1, repr(h)) - self.assertTrue(f.equals(h[0], verbose=2)) + h = h[0] + self.assertTrue(f.equals(h, verbose=2)) + + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) def test_groups_compression(self): """Test the compression of hierarchical groups.""" @@ -294,7 +336,7 @@ def test_groups_compression(self): f.data.get_count().nc_set_variable("count") f.data.get_index().nc_set_variable("index") - cfdm.write(f, ungrouped_file, verbose=1) + cfdm.write(f, ungrouped_file) g = cfdm.read(ungrouped_file)[0] self.assertTrue(f.equals(g, verbose=2)) @@ -348,7 +390,13 @@ def test_groups_compression(self): h = cfdm.read(grouped_file, verbose=1) self.assertEqual(len(h), 1, repr(h)) - self.assertTrue(f.equals(h[0], verbose=2)) + h = h[0] + self.assertTrue(f.equals(h, verbose=2)) + + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) def test_groups_dimension(self): """Test the dimensions of hierarchical groups.""" @@ -418,6 +466,11 @@ def test_groups_dimension(self): h = h[0] self.assertTrue(f.equals(h, verbose=3)) + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) + def test_groups_unlimited_dimension(self): """Test the group behaviour of an unlimited dimension.""" f = cfdm.example_field(0) @@ -448,12 +501,18 @@ def test_groups_unlimited_dimension(self): f.nc_set_variable_groups(["forecast", "model"]) grouped_file = grouped_file5 + cfdm.write(f, grouped_file5, verbose=1) - h = cfdm.read(grouped_file, verbose=1) + h = cfdm.read(grouped_file, netcdf_backend="netCDF4") self.assertEqual(len(h), 1) h = h[0] - self.assertTrue(f.equals(h, verbose=3)) + self.assertTrue(f.equals(h)) + + # Check that h5netcdf reads the file correctly + h5 = cfdm.read(grouped_file, netcdf_backend="h5netcdf") + self.assertEqual(len(h5), 1) + self._check_h5netcdf_groups(h5[0], h) def test_groups_identical_coordinates(self): """Test for identical coordinates in different groups.""" diff --git a/cfdm/test/test_netcdf_indexer.py b/cfdm/test/test_netcdf_indexer.py new file mode 100644 index 0000000000..571f8cdc57 --- /dev/null +++ b/cfdm/test/test_netcdf_indexer.py @@ -0,0 
+1,228 @@ +import atexit +import datetime +import faulthandler +import os +import tempfile +import unittest + +import netCDF4 +import numpy as np + +import cfdm + +faulthandler.enable() # to debug seg faults and timeouts + +n_tmpfiles = 1 +tmpfiles = [ + tempfile.mkstemp("_test_netcdf_indexer.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +(tmpfile,) = tmpfiles + + +def _remove_tmpfiles(): + """Remove temporary files created during tests.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + +atexit.register(_remove_tmpfiles) + +netcdf_backends = ("netCDF4", "h5netcdf") + + +class netcdf_indexerTest(unittest.TestCase): + """Test the masking and scaling of netCDF data.""" + + def test_netcdf_indexer_shape(self): + """Test netcdf_indexer shape.""" + n = np.ma.arange(9) + x = cfdm.netcdf_indexer(n) + self.assertEqual(x.shape, n.shape) + self.assertEqual(x.size, n.size) + self.assertEqual(x.ndim, n.ndim) + self.assertEqual(x.dtype, n.dtype) + + def test_netcdf_indexer_mask(self): + """Test netcdf_indexer for masking.""" + f0 = cfdm.example_field(0) + f0.del_property("missing_value", None) + f0.del_property("_FillValue", None) + fields = [f0.copy()] + + f0.data[1, :] = np.ma.masked + fields.append(f0) + + f = f0.copy() + f.set_property("missing_value", 999) + fields.append(f) + + f = f0.copy() + f.set_property("_FillValue", 999) + fields.append(f) + + f = f0.copy() + valid_min = f.array.min() * 1.1 + f.set_property("valid_min", valid_min) + fields.append(f) + + f = f0.copy() + valid_max = f.array.max() * 0.9 + f.set_property("valid_max", valid_max) + fields.append(f) + + f = f0.copy() + f.set_property("valid_range", [valid_min, valid_max]) + fields.append(f) + + cfdm.write(fields, tmpfile, warn_valid=False) + + # Check against netCDF4 with set_auto_maskandscale(True) + nc = netCDF4.Dataset(tmpfile, "r") + nc.set_auto_maskandscale(True) + nc.set_always_mask(True) + for backend in netcdf_backends: + f = cfdm.read(tmpfile, netcdf_backend=backend) + for g in f: + ncvar = g.nc_get_variable() + n = nc.variables[ncvar] + na = n[...] + self.assertTrue((g.array == na).all()) + self.assertTrue((g.data.mask.array == na.mask).all()) + + nc.close() + + def test_netcdf_indexer_unpack(self): + """Test netcdf_indexer for unpacking.""" + f = cfdm.example_field(0) + + array = np.ma.arange(40, dtype="int32").reshape(f.shape) + array[1, :] = np.ma.masked + + data = cfdm.Data(array, units=f.get_property("units")) + f.set_data(data) + scale_factor = 0.5 + add_offset = 10.0 + f.set_property("scale_factor", scale_factor) + f.set_property("add_offset", add_offset) + f.set_property("missing_value", 999) + + cfdm.write(f, tmpfile) + + # Check against netCDF4 with set_auto_maskandscale(True) + nc = netCDF4.Dataset(tmpfile, "r") + nc.set_auto_maskandscale(True) + nc.set_always_mask(True) + for backend in netcdf_backends: + f = cfdm.read(tmpfile, netcdf_backend=backend) + for g in f: + ncvar = g.nc_get_variable() + n = nc.variables[ncvar] + na = n[...] + self.assertTrue((g.array == na).all()) + self.assertTrue((g.data.mask.array == na.mask).all()) + + nc.close() + + def test_netcdf_indexer_numpy(self): + """Test netcdf_indexer for numpy.""" + array = np.ma.arange(9) + x = cfdm.netcdf_indexer(array) + x = x[...] + self.assertTrue((x == array).all()) + + x = cfdm.netcdf_indexer( + array.copy(), attributes={"_FillValue": 4, "missing_value": (0, 8)} + ) + x = x[...] 
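+        # An illustrative note, not part of the original test: with
+        # attributes={"_FillValue": 4, "missing_value": (0, 8)}, masking
+        # by convention masks every element equal to 0, 4, or 8. In
+        # np.ma.arange(9) each value equals its index, so positions
+        # [0, 4, 8] are masked below.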
+ array[[0, 4, 8]] = np.ma.masked + self.assertTrue((x.mask == array.mask).all()) + self.assertTrue((x == array).all()) + + def test_netcdf_indexer_orthogonal_indexing(self): + """Test netcdf_indexer for numpy orthogonal indexing.""" + array = np.ma.arange(120).reshape(2, 3, 4, 5) + x = cfdm.netcdf_indexer( + array, mask=False, unpack=False, orthogonal_indexing=True + ) + + y = x[..., [0, 2], :] + a = array[..., [0, 2], :] + self.assertTrue((y == a).all()) + + y = x[1, ..., [0, 2], [0, 2, 3]] + a = array[:, :, [0, 2], :] + a = a[..., [0, 2, 3]] + a = a[1, ...] + self.assertTrue((y == a).all()) + + def test_netcdf_indexer_non_orthogonal_indexing(self): + """Test netcdf_indexer for numpy non-orthogonal indexing.""" + array = np.ma.arange(120).reshape(2, 3, 4, 5) + x = cfdm.netcdf_indexer(array, mask=False, unpack=False) + + y = x[..., [0, 2], :] + a = array[..., [0, 2], :] + self.assertTrue((y == a).all()) + + index = (Ellipsis, [0, 2], [2, 3]) + y = x[index] + a = array[index] + self.assertEqual(y.shape, a.shape) + self.assertTrue((y == a).all()) + + index = (1, Ellipsis, [0, 2], [2, 3]) + y = x[index] + a = array[index] + self.assertEqual(y.shape, a.shape) + self.assertTrue((y == a).all()) + + def test_netcdf_always_masked_array(self): + """Test netcdf_indexer for numpy masked output.""" + array = np.ma.arange(9) + x = cfdm.netcdf_indexer(array) + self.assertFalse(np.ma.isMA(x[...])) + x = cfdm.netcdf_indexer(array, always_masked_array=True) + self.assertTrue(np.ma.isMA(x[...])) + + def test_netcdf_indexer_Ellipsis(self): + """Test netcdf_indexer with Ellipsis.""" + n = np.arange(9) + x = cfdm.netcdf_indexer(n) + self.assertTrue((x[...] == n).all()) + + def test_netcdf_indexer_index_shape(self): + """Test netcdf_indexer shape.""" + x = cfdm.netcdf_indexer + self.assertEqual(x.index_shape((slice(2, 5), [4]), (10, 20)), [3, 1]) + self.assertEqual(x.index_shape((slice(2, 5), 4), (10, 20)), [3]) + self.assertEqual( + x.index_shape(([2, 3, 4], np.arange(1, 6)), (10, 20)), [3, 5] + ) + + self.assertEqual( + x.index_shape((slice(None), [True, False, True]), (10, 3)), [10, 2] + ) + + index0 = np.arange(5) + index0 = index0[index0 < 3] + self.assertEqual(x.index_shape((index0, []), (10, 20)), [3, 0]) + + self.assertEqual( + x.index_shape((slice(1, 5, 3), [3]), (10, 20)), [2, 1] + ) + self.assertEqual(x.index_shape((slice(5, 1, -2), 3), (10, 20)), [2]) + self.assertEqual( + x.index_shape((slice(5, 1, 3), [3]), (10, 20)), [0, 1] + ) + self.assertEqual(x.index_shape((slice(1, 5, -3), 3), (10, 20)), [0]) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cfdm.environment() + print() + unittest.main(verbosity=2) diff --git a/cfdm/test/test_read_write.py b/cfdm/test/test_read_write.py index 260a52e579..040ad1a70c 100644 --- a/cfdm/test/test_read_write.py +++ b/cfdm/test/test_read_write.py @@ -671,18 +671,19 @@ def test_read_CDL(self): def test_read_write_string(self): """Test the `string` keyword argument to `read` and `write`.""" - f = cfdm.read(self.string_filename) + fN = cfdm.read(self.string_filename, netcdf_backend="netCDF4") + fH = cfdm.read(self.string_filename, netcdf_backend="h5netcdf") - n = int(len(f) / 2) + n = int(len(fN) / 2) for i in range(0, n): j = i + n - self.assertTrue( - f[i].data.equals(f[j].data, verbose=3), f"{f[i]!r} {f[j]!r}" - ) - self.assertTrue( - f[j].data.equals(f[i].data, verbose=3), f"{f[j]!r} {f[i]!r}" - ) + self.assertTrue(fN[i].data.equals(fN[j].data, verbose=3)) + self.assertTrue(fN[j].data.equals(fN[i].data, verbose=3)) + + # Check that 
netCDF4 and h5netcdf give the same results + for i, j in zip(fN, fH): + self.assertTrue(i.data.equals(j.data)) # Note: Don't loop round all netCDF formats for better # performance. Just one netCDF3 and one netCDF4 format @@ -926,8 +927,8 @@ def test_write_omit_data(self): g = g[0] # Check that the data are missing - self.assertFalse(g.array.count()) - self.assertFalse(g.construct("grid_latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertFalse(np.ma.count(g.construct("grid_latitude").array)) # Check that a dump works g.dump(display=False) @@ -937,16 +938,16 @@ def test_write_omit_data(self): # Check that only the field and dimension coordinate data are # missing - self.assertFalse(g.array.count()) - self.assertFalse(g.construct("grid_latitude").array.count()) - self.assertTrue(g.construct("latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertFalse(np.ma.count(g.construct("grid_latitude").array)) + self.assertTrue(np.ma.count(g.construct("latitude").array)) cfdm.write(f, tmpfile, omit_data="field") g = cfdm.read(tmpfile)[0] # Check that only the field data are missing - self.assertFalse(g.array.count()) - self.assertTrue(g.construct("grid_latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertTrue(np.ma.count(g.construct("grid_latitude").array)) def test_read_write_domain_ancillary(self): """Test when domain ancillary equals dimension coordinate.""" diff --git a/docs/source/class.rst b/docs/source/class.rst index f08e8e6c99..e2536513fa 100644 --- a/docs/source/class.rst +++ b/docs/source/class.rst @@ -74,7 +74,8 @@ Data classes :toctree: class/ cfdm.Data - cfdm.NetCDFArray + cfdm.NetCDF4Array + cfdm.H5netcdfArray cfdm.NumpyArray cfdm.Array diff --git a/docs/source/class/cfdm.H5netcdfArray.rst b/docs/source/class/cfdm.H5netcdfArray.rst new file mode 100644 index 0000000000..18b3d07a09 --- /dev/null +++ b/docs/source/class/cfdm.H5netcdfArray.rst @@ -0,0 +1,110 @@ +.. currentmodule:: cfdm +.. default-role:: obj + +cfdm.H5netcdfArray +================== + +---- + +.. autoclass:: cfdm.H5netcdfArray + :no-members: + :no-inherited-members: + +Inspection +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.get_compression_type + ~cfdm.H5netcdfArray.get_subspace + ~cfdm.H5netcdfArray.get_attributes + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cfdm.H5netcdfArray.array + ~cfdm.H5netcdfArray.dtype + ~cfdm.H5netcdfArray.ndim + ~cfdm.H5netcdfArray.shape + ~cfdm.H5netcdfArray.size + +Units +----- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.get_calendar + ~cfdm.H5netcdfArray.get_units + +File +---- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.get_address + ~cfdm.H5netcdfArray.get_addresses + ~cfdm.H5netcdfArray.close + ~cfdm.H5netcdfArray.open + ~cfdm.H5netcdfArray.get_filename + ~cfdm.H5netcdfArray.get_filenames + ~cfdm.H5netcdfArray.get_format + ~cfdm.H5netcdfArray.get_formats + ~cfdm.H5netcdfArray.get_groups + ~cfdm.H5netcdfArray.get_mask + ~cfdm.H5netcdfArray.get_unpack + ~cfdm.H5netcdfArray.get_storage_options + +Miscellaneous +------------- + +.. 
autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.copy + ~cfdm.H5netcdfArray.to_memory + +Special +------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray.__getitem__ + +Docstring substitutions +----------------------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.H5netcdfArray._docstring_special_substitutions + ~cfdm.H5netcdfArray._docstring_substitutions + ~cfdm.H5netcdfArray._docstring_package_depth + ~cfdm.H5netcdfArray._docstring_method_exclusions diff --git a/docs/source/class/cfdm.NetCDF4Array.rst b/docs/source/class/cfdm.NetCDF4Array.rst new file mode 100644 index 0000000000..6ef0c047b5 --- /dev/null +++ b/docs/source/class/cfdm.NetCDF4Array.rst @@ -0,0 +1,110 @@ +.. currentmodule:: cfdm +.. default-role:: obj + +cfdm.NetCDF4Array +================= + +---- + +.. autoclass:: cfdm.NetCDF4Array + :no-members: + :no-inherited-members: + +Inspection +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.get_compression_type + ~cfdm.NetCDF4Array.get_subspace + ~cfdm.NetCDF4Array.get_attributes + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cfdm.NetCDF4Array.array + ~cfdm.NetCDF4Array.dtype + ~cfdm.NetCDF4Array.ndim + ~cfdm.NetCDF4Array.shape + ~cfdm.NetCDF4Array.size + +Units +----- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.get_calendar + ~cfdm.NetCDF4Array.get_units + +File +---- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.get_address + ~cfdm.NetCDF4Array.get_addresses + ~cfdm.NetCDF4Array.close + ~cfdm.NetCDF4Array.open + ~cfdm.NetCDF4Array.get_filename + ~cfdm.NetCDF4Array.get_filenames + ~cfdm.NetCDF4Array.get_format + ~cfdm.NetCDF4Array.get_formats + ~cfdm.NetCDF4Array.get_groups + ~cfdm.NetCDF4Array.get_mask + ~cfdm.NetCDF4Array.get_unpack + ~cfdm.NetCDF4Array.get_storage_options + +Miscellaneous +------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.copy + ~cfdm.NetCDF4Array.to_memory + +Special +------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array.__getitem__ + +Docstring substitutions +----------------------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cfdm.NetCDF4Array._docstring_special_substitutions + ~cfdm.NetCDF4Array._docstring_substitutions + ~cfdm.NetCDF4Array._docstring_package_depth + ~cfdm.NetCDF4Array._docstring_method_exclusions diff --git a/docs/source/class/cfdm.NetCDFArray.rst b/docs/source/class/cfdm.NetCDFArray.rst deleted file mode 100644 index 989e90cae1..0000000000 --- a/docs/source/class/cfdm.NetCDFArray.rst +++ /dev/null @@ -1,108 +0,0 @@ -.. currentmodule:: cfdm -.. default-role:: obj - -cfdm.NetCDFArray -================ - ----- - -.. autoclass:: cfdm.NetCDFArray - :no-members: - :no-inherited-members: - -Inspection ----------- - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cfdm.NetCDFArray.get_compression_type - ~cfdm.NetCDFArray.get_subspace - ~cfdm.NetCDFArray.get_missing_values - -.. 
rubric:: Attributes
-
-.. autosummary::
-   :nosignatures:
-   :toctree: ../attribute/
-   :template: attribute.rst
-
-   ~cfdm.NetCDFArray.array
-   ~cfdm.NetCDFArray.dtype
-   ~cfdm.NetCDFArray.ndim
-   ~cfdm.NetCDFArray.shape
-   ~cfdm.NetCDFArray.size
-
-Units
------
-
-.. rubric:: Methods
-
-.. autosummary::
-   :nosignatures:
-   :toctree: ../method/
-   :template: method.rst
-
-   ~cfdm.NetCDFArray.get_calendar
-   ~cfdm.NetCDFArray.get_units
-
-File
-----
-
-.. rubric:: Methods
-
-.. autosummary::
-   :nosignatures:
-   :toctree: ../method/
-   :template: method.rst
-
-   ~cfdm.NetCDFArray.get_address
-   ~cfdm.NetCDFArray.get_addresses
-   ~cfdm.NetCDFArray.close
-   ~cfdm.NetCDFArray.open
-   ~cfdm.NetCDFArray.get_filename
-   ~cfdm.NetCDFArray.get_filenames
-   ~cfdm.NetCDFArray.get_format
-   ~cfdm.NetCDFArray.get_formats
-   ~cfdm.NetCDFArray.get_groups
-   ~cfdm.NetCDFArray.get_mask
-
-Miscellaneous
--------------
-
-.. autosummary::
-   :nosignatures:
-   :toctree: ../method/
-   :template: method.rst
-
-   ~cfdm.NetCDFArray.copy
-   ~cfdm.NetCDFArray.to_memory
-
-Special
--------
-
-.. autosummary::
-   :nosignatures:
-   :toctree: ../method/
-   :template: method.rst
-
-   ~cfdm.NetCDFArray.__getitem__
-
-Docstring substitutions
------------------------
-
-.. rubric:: Methods
-
-.. autosummary::
-   :nosignatures:
-   :toctree: ../method/
-   :template: method.rst
-
-   ~cfdm.NetCDFArray._docstring_special_substitutions
-   ~cfdm.NetCDFArray._docstring_substitutions
-   ~cfdm.NetCDFArray._docstring_package_depth
-   ~cfdm.NetCDFArray._docstring_method_exclusions
diff --git a/docs/source/class/cfdm.netcdf_indexer.rst b/docs/source/class/cfdm.netcdf_indexer.rst
new file mode 100644
index 0000000000..6dd64d263f
--- /dev/null
+++ b/docs/source/class/cfdm.netcdf_indexer.rst
@@ -0,0 +1,42 @@
+.. currentmodule:: cfdm
+.. default-role:: obj
+
+cfdm.netcdf_indexer
+===================
+
+----
+
+.. autoclass:: cfdm.netcdf_indexer
+   :no-members:
+   :no-inherited-members:
+
+Inspection
+----------
+
+.. rubric:: Methods
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cfdm.netcdf_indexer.attributes
+
+.. rubric:: Attributes
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../attribute/
+   :template: attribute.rst
+
+   ~cfdm.netcdf_indexer.shape
+
+Special
+-------
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../method/
+   :template: method.rst
+
+   ~cfdm.netcdf_indexer.__getitem__
diff --git a/docs/source/conf.py b/docs/source/conf.py
index eecf98b9f4..d01b557778 100755
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -131,6 +131,7 @@ def _get_date():
     "numpy": ("https://numpy.org/doc/stable", None),
     # 'netCDF4': ('https://unidata.github.io/netcdf4-python/', None),
     "cftime": ("https://unidata.github.io/cftime", None),
+    "h5netcdf": ("https://h5netcdf.org", None),
 }
 
 # This extension is meant to help with the common pattern of having
diff --git a/docs/source/extensions.rst b/docs/source/extensions.rst
index 3b18480071..aaac4d3b26 100644
--- a/docs/source/extensions.rst
+++ b/docs/source/extensions.rst
@@ -150,7 +150,7 @@ in overridden methods.
Data=cfdm.Data,
     GatheredArray=cfdm.GatheredArray,
-    NetCDFArray=cfdm.NetCDFArray,
+    NetCDF4Array=cfdm.NetCDF4Array,
     RaggedContiguousArray=cfdm.RaggedContiguousArray,
     RaggedIndexedArray=cfdm.RaggedIndexedArray,
     RaggedIndexedContiguousArray=cfdm.RaggedIndexedContiguousArray,
diff --git a/docs/source/functions.rst b/docs/source/functions.rst
index 5919fe972b..fc7b5a9018 100644
--- a/docs/source/functions.rst
+++ b/docs/source/functions.rst
@@ -20,6 +20,7 @@ Reading and writing
 
    cfdm.read
    cfdm.write
+   cfdm.netcdf_flatten
 
 Constants
 ---------
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index ad36e5beed..bbed790317 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -152,20 +152,26 @@ Tests are run from within the ``cfdm/test`` directory:
 
 The cfdm package requires:
 
-* `Python `_, version 3.8 or newer.
+* `Python `_, version 3.8 or newer.
 
-* `numpy `_, version 1.15 or newer.
+* `numpy `_, version 1.15 or newer.
 
-* `netCDF4 `_, version 1.5.4 or
+* `netCDF4 `_, version 1.5.4 or
   newer.
 
-* `cftime `_, version 1.6.0 or
+* `cftime `_, version 1.6.0 or
   newer.
 
-* `netcdf_flattener `_,
-  version 1.2.0 or newer.
+* `h5netcdf `_, version 1.3.0 or
+  newer.
+
+* `h5py `_, version 3.10.0 or newer.
+
+* `s3fs `_, version 2024.6.0 or newer.
+
+* `dask `_, version 2024.6.0 or newer.
 
-* `packaging `_, version 20.0 or
+* `packaging `_, version 20.0 or
   newer.
 
 * `scipy `_, version 1.10.0 or newer.
diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
index 8cb2242ceb..8ddbe417e7 100644
--- a/docs/source/introduction.rst
+++ b/docs/source/introduction.rst
@@ -76,12 +76,17 @@ to add more sophisticated methods.
 
 The cfdm package can
 
 * read :term:`field constructs ` and :term:`domain
-  constructs ` from netCDF and CDL datasets,
+  constructs ` from netCDF and CDL datasets with a
+  choice of netCDF backends,
 
+* read files from OPeNDAP servers and S3 object stores,
+
 * create new field and domain constructs in memory,
 
 * write field and domain constructs to netCDF datasets on disk,
 
+* read, write, and manipulate UGRID mesh topologies,
+
 * read, write, and create coordinates defined by geometry cells,
 
 * read and write netCDF4 string data-type variables,
 
@@ -144,9 +149,10 @@ If you use cfdm, either as a stand-alone application or to provide a
 CF data model implementation to another software library, please
 consider including the reference:
 
-Hassell, D., and Bartholomew, S. L. (2020). cfdm: A Python reference
-  implementation of the CF data model. Journal of Open Source
-  Software, 5(54), 2717, https://doi.org/10.21105/joss.02717
+Hassell, D., and Bartholomew, S. L. (2020).
+  cfdm: A Python reference implementation of the CF data
+  model. Journal of Open Source Software, 5(54), 2717,
+  https://doi.org/10.21105/joss.02717
 
 .. code-block:: bibtex
 
@@ -168,28 +174,30 @@ Hassell, D., and Bartholomew, S. L. (2020). cfdm: A Python reference
 
 **References**
 --------------
 
-Eaton, B., Gregory, J., Drach, B., Taylor, K., Hankin, S., Caron, J.,
-  Signell, R., et al. (2020). NetCDF Climate and Forecast (CF)
+Eaton, B., Gregory, J., Drach, B., Taylor, K., Hankin, S., Caron, J., Signell, R., et al. (2020).
+  NetCDF Climate and Forecast (CF)
   Metadata Conventions. CF Conventions Committee. Retrieved from
   https://cfconventions.org/cf-conventions/cf-conventions.html
 
-Hassell, D., and Bartholomew, S. L. (2020). cfdm: A Python reference
-  implementation of the CF data model.
Journal of Open Source - Software, 5(54), 2717, https://doi.org/10.21105/joss.02717 +Hassell, D., and Bartholomew, S. L. (2020). + cfdm: A Python reference implementation of the CF data + model. Journal of Open Source Software, 5(54), 2717, + https://doi.org/10.21105/joss.02717 -Hassell, D., Gregory, J., Blower, J., Lawrence, B. N., and - Taylor, K. E. (2017). A data model of the Climate and Forecast - metadata conventions (CF-1.6) with a software implementation - (cf-python v2.1), Geosci. Model Dev., 10, 4619-4646, +Hassell, D., Gregory, J., Blower, J., Lawrence, B. N., and Taylor, K. E. (2017). + A data model of the Climate and Forecast metadata conventions + (CF-1.6) with a software implementation (cf-python v2.1), + Geosci. Model Dev., 10, 4619-4646, https://doi.org/10.5194/gmd-10-4619-2017 -Rew, R., and Davis, G. (1990). NetCDF: An Interface for Scientific - Data Access. IEEE Computer Graphics and Applications, 10(4), +Rew, R., and Davis, G. (1990). + NetCDF: An Interface for Scientific Data Access. IEEE Computer + Graphics and Applications, 10(4), 76–82. https://doi.org/10.1109/38.56302 -Rew, R., Hartnett, E., and Caron, J. (2006). NetCDF-4: Software - Implementing an Enhanced Data Model for the Geosciences. In 22nd - International Conference on Interactive Information Processing - Systems for Meteorology, Oceanography, and Hydrology. AMS. Retrieved - from +Rew, R., Hartnett, E., and Caron, J. (2006). + NetCDF-4: Software Implementing an Enhanced Data Model for the + Geosciences. In 22nd International Conference on Interactive + Information Processing Systems for Meteorology, Oceanography, and + Hydrology. AMS. Retrieved from https://www.unidata.ucar.edu/software/netcdf/papers/2006-ams.pdf diff --git a/docs/source/spelling_false_positives.txt b/docs/source/spelling_false_positives.txt index 013dff0ae5..4525de3c61 100644 --- a/docs/source/spelling_false_positives.txt +++ b/docs/source/spelling_false_positives.txt @@ -8,6 +8,8 @@ atol ATOL AuxiliaryCoordinate auxiliarycoordinate +backend +backends basenames Booleans bool @@ -77,6 +79,7 @@ hashable Hassell hdf indexable +init initio inplace instantiation @@ -105,9 +108,11 @@ ncvars nd ndim ness -netcdf netCDF +netcdf NetCDFArray +netcdfArray +NetCDFIndexer Nino nonzero numpy @@ -168,6 +173,7 @@ uncompresses uncompressing unicode unfilter +url varid verboseness versionadded diff --git a/docs/source/tutorial.py b/docs/source/tutorial.py index 50c2f209f7..923e6e9186 100644 --- a/docs/source/tutorial.py +++ b/docs/source/tutorial.py @@ -538,8 +538,8 @@ import netCDF4 nc = netCDF4.Dataset('file.nc', 'r') v = nc.variables['ta'] -netcdf_array = cfdm.NetCDFArray(filename='file.nc', address='ta', - dtype=v.dtype, shape=v.shape) +netcdf_array = cfdm.NetCDF4Array(filename='file.nc', address='ta', + dtype=v.dtype, shape=v.shape) data_disk = cfdm.Data(netcdf_array) numpy_array = v[...] data_memory = cfdm.Data(numpy_array) diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst index 0642d7fec0..edc0636272 100644 --- a/docs/source/tutorial.rst +++ b/docs/source/tutorial.rst @@ -113,11 +113,11 @@ instance or `cfdm.Domain` instance respectively. Henceforth the phrase ---------------------------------------------------- The `cfdm.read` function reads a `netCDF -`_ file from disk, or -from an `OPeNDAP `_ URL [#dap]_, and by -default returns the contents as a Python list of zero or more field -constructs. This list contains a field construct to represent each of -the CF-netCDF data variables in the file. 
+`_ file from disk, from
+an `OPeNDAP `_ URL [#dap]_, or from an S3
+object store, and by default returns the contents as a Python list of
+zero or more field constructs. This list contains a field construct to
+represent each of the CF-netCDF data variables in the file.
 
 Datasets of any version of CF up to and including CF-|version| can be
 read.
 
@@ -184,7 +184,10 @@ The `cfdm.read` function has optional parameters to
 
   attributes are present (see :ref:`data masking `); and
 
 * display information and issue warnings about the mapping of the
-  netCDF file contents to CF data model constructs.
+  netCDF file contents to CF data model constructs; and
+
+* choose either `netCDF4` or `h5netcdf` backends for accessing netCDF
+  files.
 
 .. _CF-compliance:
 
@@ -2853,19 +2856,20 @@
 All the of above examples use arrays in memory to construct the data
 instances for the field and metadata constructs. It is, however,
 possible to create data from arrays that reside on disk. The
 `cfdm.read` function creates data in this manner. A pointer to an
-array in a netCDF file can be stored in a `~cfdm.NetCDFArray`
-instance, which is is used to initialise a `~cfdm.Data` instance.
+array in a netCDF file can be stored in a `~cfdm.NetCDF4Array` or
+`~cfdm.H5netcdfArray` instance, which is used to initialise a
+`~cfdm.Data` instance.
 
 .. code-block:: python
    :caption: *Define a variable from a dataset with the netCDF package
-             and use it to create a NetCDFArray instance with which to
+             and use it to create a NetCDF4Array instance with which to
              initialise a Data instance.*
 
   >>> import netCDF4
  >>> nc = netCDF4.Dataset('file.nc', 'r')
  >>> v = nc.variables['ta']
- >>> netcdf_array = cfdm.NetCDFArray(filename='file.nc', address='ta',
- ...                                 dtype=v.dtype, shape=v.shape)
+ >>> netcdf_array = cfdm.NetCDF4Array(filename='file.nc', address='ta',
+ ...                                  dtype=v.dtype, shape=v.shape)
  >>> data_disk = cfdm.Data(netcdf_array)
 
@@ -2881,7 +2885,7 @@ instance, which is is used to initialise a `~cfdm.Data` instance.
 
 Note that data type, number of dimensions, dimension sizes and number
 of elements of the array on disk that are used to initialise the
-`~cfdm.NetCDFArray` instance are those expected by the CF data model,
+`~cfdm.NetCDF4Array` instance are those expected by the CF data model,
 which may be different to those of the netCDF variable in the file
 (although they are the same in the above example). For example, a
 netCDF character array of shape ``(12, 9)`` is viewed in cfdm as a
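An `~cfdm.H5netcdfArray` can be initialised in the same way as the
`~cfdm.NetCDF4Array` in the tutorial example above. The following
sketch assumes the same ``file.nc`` and ``ta`` variable, and that the
keyword signature mirrors that of `cfdm.NetCDF4Array`:

.. code-block:: python

   >>> import h5netcdf
   >>> f = h5netcdf.File('file.nc', 'r')
   >>> v = f.variables['ta']
   >>> h5_array = cfdm.H5netcdfArray(filename='file.nc', address='ta',
   ...                               dtype=v.dtype, shape=v.shape)
   >>> data_disk = cfdm.Data(h5_array)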
diff --git a/release_docs b/release_docs
index 8367d81f14..b4ea153096 100755
--- a/release_docs
+++ b/release_docs
@@ -11,7 +11,7 @@
 version=`python -c "import cfdm; print(cfdm.__version__)"`
 sphinx_version=`python -c "import sphinx; print(sphinx.__version__)"`
 if [[ $sphinx_version != "2.4.5" ]] ; then
-    echo "ERROR: Must use sphinx version 2.4.5. Got $sphinx_version"
+    echo "ERROR: Must (sadly) use sphinx version 2.4.5. Got $sphinx_version"
     exit 3
 fi
diff --git a/requirements.txt b/requirements.txt
index c5f4955b2d..586e45931d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,10 @@
 netCDF4>=1.5.4
 cftime>=1.6.0
 numpy>=1.15
-netcdf-flattener>=1.2.0
 packaging>=20.0
 scipy>=1.10.0
+h5netcdf>=1.3.0
+h5py>=3.10.0
+s3fs>=2024.6.0
+dask>=2024.6.0
+
diff --git a/setup.py b/setup.py
index 9a2172da9d..19e3c96db2 100755
--- a/setup.py
+++ b/setup.py
@@ -67,7 +67,7 @@ def _get_version():
 
 The **cfdm** package can
 
-* read field and domain constructs from netCDF and CDL datasets,
+* read field and domain constructs from netCDF and CDL datasets with a choice of netCDF backends,
 * create new field and domain constructs in memory,
 * write and append field and domain constructs to netCDF datasets on disk,
 * read, write, and manipulate UGRID mesh topologies,
@@ -198,6 +198,7 @@ def _get_version():
         "cfdm.read_write",
         "cfdm.read_write.abstract",
         "cfdm.read_write.netcdf",
+        "cfdm.read_write.netcdf.flatten",
         "cfdm.test",
     ],
     scripts=["scripts/cfdump"],
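Taken together, the new read options documented above can be exercised
as follows (an illustrative sketch; the file names and S3 bucket are
placeholders):

.. code-block:: python

   >>> import cfdm
   >>> # Force the h5netcdf backend and leave the data packed
   >>> fields = cfdm.read('file.nc', netcdf_backend='h5netcdf',
   ...                    unpack=False)
   >>> # Read from an S3 object store: unless given explicitly, an
   >>> # 'endpoint_url' of 'https://store' is inferred from the URL
   >>> fields = cfdm.read('s3://store/data/file.nc',
   ...                    storage_options={'anon': True})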