From e9891d1909b8efbc42889bc0b56b34d45b34d9e1 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 16 Dec 2024 18:40:50 -0500 Subject: [PATCH 1/4] Strip cat attrs --- src/xscen/io.py | 14 +++++++++++++- src/xscen/utils.py | 12 +++++++++--- tests/test_utils.py | 4 ++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/src/xscen/io.py b/src/xscen/io.py index 5d9a7763..009d9658 100644 --- a/src/xscen/io.py +++ b/src/xscen/io.py @@ -24,7 +24,7 @@ from .config import parse_config from .scripting import TimeoutException -from .utils import TRANSLATOR, season_sort_key, translate_time_chunk +from .utils import TRANSLATOR, season_sort_key, strip_cat_attrs, translate_time_chunk logger = logging.getLogger(__name__) KEEPBITS = defaultdict(lambda: 12) @@ -356,6 +356,7 @@ def save_to_netcdf( bitround: Union[bool, int, dict] = False, compute: bool = True, netcdf_kwargs: Optional[dict] = None, + strip_cat_metadata: bool = True, ): """Save a Dataset to NetCDF, rechunking or compressing if requested. @@ -381,6 +382,8 @@ def save_to_netcdf( Whether to start the computation or return a delayed object. netcdf_kwargs : dict, optional Additional arguments to send to_netcdf() + strip_cat_metadata : bool + If True (default), strips all catalog-added attributes before saving the dataset. Returns ------- @@ -407,6 +410,9 @@ def save_to_netcdf( # Remove original_shape from encoding, since it can cause issues with some engines. ds[var].encoding.pop("original_shape", None) + if strip_cat_metadata: + ds = strip_cat_attrs(ds) + _coerce_attrs(ds.attrs) for var in ds.variables.values(): _coerce_attrs(var.attrs) @@ -427,6 +433,7 @@ def save_to_zarr( # noqa: C901 mode: str = "f", itervar: bool = False, timeout_cleanup: bool = True, + strip_cat_metadata: bool = True, ): """Save a Dataset to Zarr format, rechunking and compressing if requested. 
@@ -467,6 +474,8 @@ def save_to_zarr( # noqa: C901 If True (default) and a :py:class:`xscen.scripting.TimeoutException` is raised during the writing, the variable being written is removed from the dataset as it is incomplete. This does nothing if `compute` is False. + strip_cat_metadata : bool + If True (default), strips all catalog-added attributes before saving the dataset. Returns ------- @@ -530,6 +539,9 @@ def _skip(var): if len(ds.data_vars) == 0: return None + if strip_cat_metadata: + ds = strip_cat_attrs(ds) + _coerce_attrs(ds.attrs) for var in ds.variables.values(): _coerce_attrs(var.attrs) diff --git a/src/xscen/utils.py b/src/xscen/utils.py index 6eb44b6e..4bf0a710 100644 --- a/src/xscen/utils.py +++ b/src/xscen/utils.py @@ -541,6 +541,15 @@ def get_cat_attrs( return facets +def strip_cat_attrs(ds: xr.Dataset, prefix: str = "cat:"): + """Remove attributes added from the catalog by `to_dataset` or `extract_dataset`.""" + dsc = ds.copy() + for k in list(dsc.attrs): + if k.startswith(prefix): + del dsc.attrs[k] + return dsc + + @parse_config def maybe_unstack( ds: xr.Dataset, @@ -825,13 +834,11 @@ def clean_up( # noqa: C901 if variables_and_units: logger.info(f"Converting units: {variables_and_units}") ds = change_units(ds=ds, variables_and_units=variables_and_units) - # convert calendar if convert_calendar_kwargs: ds_copy = ds.copy() # create mask of grid point that should always be nan ocean = ds_copy.isnull().all("time") - # if missing_by_var exist make sure missing data are added to time axis if missing_by_var: if not all(k in missing_by_var.keys() for k in ds.data_vars): @@ -846,7 +853,6 @@ def clean_up( # noqa: C901 logger.info(f"Converting calendar with {convert_calendar_kwargs} ") ds = convert_calendar(ds, **convert_calendar_kwargs).where(~ocean) - # convert each variable individually if missing_by_var: # remove 'missing' argument to be replace by `missing_by_var` diff --git a/tests/test_utils.py b/tests/test_utils.py index adcae76d..ac8a5c88 100644 
--- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -86,6 +86,10 @@ def test_get_cat_attrs(self, prefix, var_as_str): elif prefix == "dog:": assert out == {"source": "CanESM5"} + def test_strip_cat_attrs(self): + out = xs.utils.strip_cat_attrs(self.ds) + assert list(out.attrs.keys()) == ["dog:source"] + class TestStackNan: From c97b93b5b2338dee4a29d2bc0065ad97fec1c88a Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 16 Dec 2024 18:48:36 -0500 Subject: [PATCH 2/4] Opt in agg folder level - version opt in non-sim --- src/xscen/catutils.py | 3 ++- src/xscen/data/file_schema.yml | 10 +++++----- tests/test_catutils.py | 18 +++++++++++++----- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/xscen/catutils.py b/src/xscen/catutils.py index 9be103fd..2238a798 100644 --- a/src/xscen/catutils.py +++ b/src/xscen/catutils.py @@ -1010,7 +1010,8 @@ def _get_needed_fields(schema: dict): needed.add(level) elif isinstance(level, list): for lvl in level: - needed.add(lvl) + if not (lvl.startswith("(") and lvl.endswith(")")): + needed.add(lvl) elif not (isinstance(level, dict) and list(level.keys()) == ["text"]): raise ValueError( f"Invalid schema with unknown {level} of type {type(level)}." diff --git a/src/xscen/data/file_schema.yml b/src/xscen/data/file_schema.yml index 62b29660..4f40591a 100644 --- a/src/xscen/data/file_schema.yml +++ b/src/xscen/data/file_schema.yml @@ -13,7 +13,7 @@ # # There are four ways to specify a folder name to use: # - < facet > # The value of the facet. # - (< facet >) # Same, but if the facet is missing, this level is skipped, resulting in a tree of a different depth. -# - [< facet >, < facet >, ...]: # The folder name consists in more than one facet, concatenated with a "_" by default. They can't be optional. +# - [< facet >, < facet >, ...]: # The folder name consists in more than one facet, concatenated with a "_" by default. They can be optional. 
# - text: < value > # A fixed string # filename: # The file name schema, a list of facet names. If a facet is empty, it will be skipped. Elements will be separated by "_". # # The special "DATES" facet will be replaced by the most concise way found to define the temporal range covered by the file. @@ -32,7 +32,7 @@ original-non-sims: - type - domain - institution - - [source, version] + - [source, (version)] - (member) - frequency - variable @@ -91,7 +91,7 @@ original-hydro-reconstruction: - hydrology_source - (hydrology_member) - institution - - [source, version] + - [source, (version)] - (member) - frequency - variable @@ -198,7 +198,7 @@ derived-reconstruction: folders: - type - institution - - [source, version] + - [source, (version)] - (member) - domain - processing_level @@ -260,7 +260,7 @@ derived-hydro-reconstruction: - hydrology_source - (hydrology_member) - institution - - [source, version] + - [source, (version)] - (member) - domain - processing_level diff --git a/tests/test_catutils.py b/tests/test_catutils.py index 6748c3c2..639171d6 100644 --- a/tests/test_catutils.py +++ b/tests/test_catutils.py @@ -228,20 +228,28 @@ def test_build_path(samplecat): ) in df.new_path.values -def test_build_path_ds(): +@pytest.mark.parametrize("hasver", [True, False]) +def test_build_path_ds(hasver): ds = xr.tutorial.open_dataset("air_temperature") ds = ds.assign(time=xr.cftime_range("0001-01-01", freq="6h", periods=ds.time.size)) ds.attrs.update(source="source", institution="institution") + if hasver: + ds.attrs["version"] = "v1" new_path = cu.build_path( ds, schemas={ - "folders": ["source", "institution", ["variable", "xrfreq"]], + "folders": [["source", "(version)"], "institution", ["variable", "xrfreq"]], "filename": ["source", "institution", "variable", "frequency", "DATES"], }, ) - assert new_path == Path( - "source/institution/air_6h/source_institution_air_6hr_0001-0002" - ) + if hasver: + assert new_path == Path( + 
"source_v1/institution/air_6h/source_institution_air_6hr_0001-0002" + ) + else: + assert new_path == Path( + "source/institution/air_6h/source_institution_air_6hr_0001-0002" + ) def test_build_path_multivar(samplecat): From a0f791431690a60425c5a3882552611d8a39a295 Mon Sep 17 00:00:00 2001 From: Pascal Bourgault Date: Mon, 16 Dec 2024 18:53:44 -0500 Subject: [PATCH 3/4] upd changes --- CHANGELOG.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index edd16644..db282979 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -6,6 +6,11 @@ v0.9.2 (unreleased) ------------------- Contributors to this version: Juliette Lavoie (:user:`juliettelavoie`), Pascal Bourgault (:user:`aulemahal`). +Breaking changes +^^^^^^^^^^^^^^^^ +* Version facet is now optional in default filepath schemas for non-simulations with a "source_version" level. (:issue:`500`, :pull:`501`). +* Catalog attributes are removed by default in ``save_to_zarr`` and ``save_to_netcdf``. Catalog attributes are those added from the catalog columns by ``to_dataset``, ``to_dataset_dict`` and ``extract_dataset``, which have names prefixed with ``cat:``. (:issue:`499`, :pull:`501`). + Bug fixes ^^^^^^^^^ * Fixed bug with reusing weights. (:issue:`411`, :pull:`414`). 
From d9204dead0f4875ce791923c3be78e1a5355ce39 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 17 Dec 2024 00:02:12 +0000 Subject: [PATCH 4/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/test_catutils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_catutils.py b/tests/test_catutils.py index a41edfe7..f3e561f7 100644 --- a/tests/test_catutils.py +++ b/tests/test_catutils.py @@ -272,7 +272,7 @@ def test_pattern_from_schema(samplecat): res = [cu._compile_pattern(patt).parse(p) for patt in patts] assert any(res) - + @pytest.mark.parametrize("hasver", [True, False]) def test_build_path_ds(hasver): ds = xr.tutorial.open_dataset("air_temperature")