diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 85207528..855b6b0c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -95,6 +95,11 @@ New features and enhancements * Conservative regridding now supports oblique mercator projections. (:pull:`467`). * The automatic name for the weight file in ``regrid_dataset`` is now more explicit to avoid errors, but now requires `cat:id` and `cat:domain` arguments for both the source and target datasets. (:pull:`467`). +Breaking changes +^^^^^^^^^^^^^^^^ +* Version facet is now optional in default filepath schemas for non-simulations with a "source_version" level. (:issue:`500`, :pull:`501`). +* Catalog attributes are removed by default in ``save_to_zarr`` and ``save_to_netcdf``. Catalog attributes are those added from the catalog columns by ``to_dataset``, ``to_dataset_dict`` and ``extract_dataset``, which have names prefixed with ``cat:``. (:issue:`499`, :pull:`501`). + Bug fixes ^^^^^^^^^ * Fixed bug with reusing weights. (:issue:`411`, :pull:`414`). diff --git a/src/xscen/catutils.py b/src/xscen/catutils.py index 7737a5bc..3941ceeb 100644 --- a/src/xscen/catutils.py +++ b/src/xscen/catutils.py @@ -1051,7 +1051,8 @@ def _get_needed_fields(schema: dict): needed.add(level) elif isinstance(level, list): for lvl in level: - needed.add(lvl) + if not (lvl.startswith("(") and lvl.endswith(")")): + needed.add(lvl) elif not (isinstance(level, dict) and list(level.keys()) == ["text"]): raise ValueError( f"Invalid schema with unknown {level} of type {type(level)}." diff --git a/src/xscen/data/file_schema.yml b/src/xscen/data/file_schema.yml index 36330e5b..29b4af13 100644 --- a/src/xscen/data/file_schema.yml +++ b/src/xscen/data/file_schema.yml @@ -13,7 +13,7 @@ # # There are four ways to specify a folder name to use: # - < facet > # The value of the facet. # - (< facet >) # Same, but if the facet is missing, this level is skipped, resulting in a tree of a different depth. 
-# - [< facet >, < facet >, ...]: # The folder name consists in more than one facet, concatenated with a "_" by default. They can't be optional. +# - [< facet >, < facet >, ...]: # The folder name consists in more than one facet, concatenated with a "_" by default. They can be optional. # - text: < value > # A fixed string # filename: # The file name schema, a list of facet names. If a facet is empty, it will be skipped. Elements will be separated by "_". # # The special "DATES" facet will be replaced by the most concise way found to define the temporal range covered by the file. @@ -33,7 +33,7 @@ original-non-sims: - type - domain - institution - - [ source, version ] + - [ source, (version) ] - (member) - frequency - variable @@ -92,7 +92,7 @@ original-hydro-reconstruction: - hydrology_source - (hydrology_member) - institution - - [ source, version ] + - [ source, (version) ] - (member) - frequency - variable @@ -199,7 +199,7 @@ derived-reconstruction: folders: - type - institution - - [ source, version ] + - [ source, (version) ] - (member) - domain - processing_level @@ -261,7 +261,7 @@ derived-hydro-reconstruction: - hydrology_source - (hydrology_member) - institution - - [ source, version ] + - [ source, (version) ] - (member) - domain - processing_level diff --git a/src/xscen/io.py b/src/xscen/io.py index e27f0c30..c7935f3e 100644 --- a/src/xscen/io.py +++ b/src/xscen/io.py @@ -24,7 +24,7 @@ from .config import parse_config from .scripting import TimeoutException -from .utils import TRANSLATOR, season_sort_key, translate_time_chunk +from .utils import TRANSLATOR, season_sort_key, strip_cat_attrs, translate_time_chunk logger = logging.getLogger(__name__) KEEPBITS = defaultdict(lambda: 12) @@ -374,6 +374,7 @@ def save_to_netcdf( bitround: bool | int | dict = False, compute: bool = True, netcdf_kwargs: dict | None = None, + strip_cat_metadata: bool = True, ): """Save a Dataset to NetCDF, rechunking or compressing if requested. 
@@ -399,6 +400,8 @@ def save_to_netcdf( Whether to start the computation or return a delayed object. netcdf_kwargs : dict, optional Additional arguments to send to_netcdf() + strip_cat_metadata : bool + If True (default), strips all catalog-added attributes before saving the dataset. Returns ------- @@ -425,6 +428,9 @@ def save_to_netcdf( # Remove original_shape from encoding, since it can cause issues with some engines. ds[var].encoding.pop("original_shape", None) + if strip_cat_metadata: + ds = strip_cat_attrs(ds) + _coerce_attrs(ds.attrs) for var in ds.variables.values(): _coerce_attrs(var.attrs) @@ -445,6 +451,7 @@ def save_to_zarr( # noqa: C901 mode: str = "f", itervar: bool = False, timeout_cleanup: bool = True, + strip_cat_metadata: bool = True, ): """ Save a Dataset to Zarr format, rechunking and compressing if requested. @@ -487,6 +494,8 @@ def save_to_zarr( # noqa: C901 If True (default) and a :py:class:`xscen.scripting.TimeoutException` is raised during the writing, the variable being written is removed from the dataset as it is incomplete. This does nothing if `compute` is False. + strip_cat_metadata : bool + If True (default), strips all catalog-added attributes before saving the dataset. 
Returns ------- @@ -561,6 +570,9 @@ def _skip(var): if len(ds.data_vars) == 0: return None + if strip_cat_metadata: + ds = strip_cat_attrs(ds) + _coerce_attrs(ds.attrs) for var in ds.variables.values(): _coerce_attrs(var.attrs) diff --git a/src/xscen/utils.py b/src/xscen/utils.py index c9e86701..f6b69475 100644 --- a/src/xscen/utils.py +++ b/src/xscen/utils.py @@ -633,6 +633,15 @@ def get_cat_attrs( return facets +def strip_cat_attrs(ds: xr.Dataset, prefix: str = "cat:"): + """Remove attributes added from the catalog by `to_dataset` or `extract_dataset`.""" + dsc = ds.copy() + for k in list(dsc.attrs): + if k.startswith(prefix): + del dsc.attrs[k] + return dsc + + @parse_config def maybe_unstack( ds: xr.Dataset, @@ -923,12 +932,10 @@ def clean_up( # noqa: C901 msg = f"Converting units: {variables_and_units}" logger.info(msg) ds = change_units(ds=ds, variables_and_units=variables_and_units) - # convert calendar if convert_calendar_kwargs: # create mask of grid point that should always be nan ocean = ds.isnull().all("time") - # if missing_by_var exist make sure missing data are added to time axis if missing_by_var: if not all(k in missing_by_var.keys() for k in ds.data_vars): diff --git a/tests/test_catutils.py b/tests/test_catutils.py index 40c7c857..f3e561f7 100644 --- a/tests/test_catutils.py +++ b/tests/test_catutils.py @@ -273,20 +273,28 @@ def test_pattern_from_schema(samplecat): assert any(res) -def test_build_path_ds(): +@pytest.mark.parametrize("hasver", [True, False]) +def test_build_path_ds(hasver): ds = xr.tutorial.open_dataset("air_temperature") ds = ds.assign(time=xr.cftime_range("0001-01-01", freq="6h", periods=ds.time.size)) ds.attrs.update(source="source", institution="institution") + if hasver: + ds.attrs["version"] = "v1" new_path = cu.build_path( ds, schemas={ - "folders": ["source", "institution", ["variable", "xrfreq"]], + "folders": [["source", "(version)"], "institution", ["variable", "xrfreq"]], "filename": ["source", "institution", 
"variable", "frequency", "DATES"], }, ) - assert new_path == Path( - "source/institution/air_6h/source_institution_air_6hr_0001-0002" - ) + if hasver: + assert new_path == Path( + "source_v1/institution/air_6h/source_institution_air_6hr_0001-0002" + ) + else: + assert new_path == Path( + "source/institution/air_6h/source_institution_air_6hr_0001-0002" + ) def test_build_path_multivar(samplecat): diff --git a/tests/test_utils.py b/tests/test_utils.py index 9b49b6c3..647ac33f 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -241,6 +241,10 @@ def test_get_cat_attrs(self, ds, prefix, var_as_str): elif prefix == "dog:": assert out == {"source": "CanESM5"} + def test_strip_cat_attrs(self): + out = xs.utils.strip_cat_attrs(self.ds) + assert list(out.attrs.keys()) == ["dog:source"] + class TestStack: def test_no_nan(self):