diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..74bdac0 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,6 @@ +# CODEOWNERS file + +# Protect workflow files +/.github/ @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry +/.pre-commit-config.yaml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry +/pyproject.toml @theissenhelen @jesperdramsch @gmertes @b8raoult @floriankrb @anaprietonem @HCookie @JPXKQX @mchantry diff --git a/CHANGELOG.md b/CHANGELOG.md index e06be14..ac62f51 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ Keep it human-readable, your future self will thank you! - Support for "anemoi-datasets publish" - Added set from file (python only) - Added 'update' command +- Force full paths when registering +- Added naming conventions ### Changed diff --git a/docs/index.rst b/docs/index.rst index 1760c9b..c74c591 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -24,6 +24,7 @@ of the *Anemoi* packages. - :doc:`installing` - :doc:`configuring` +- :doc:`naming-conventions` .. toctree:: :maxdepth: 1 @@ -31,6 +32,7 @@ of the *Anemoi* packages. installing configuring + naming-conventions **Command line tool** diff --git a/docs/naming-conventions.rst b/docs/naming-conventions.rst new file mode 100644 index 0000000..ffd605f --- /dev/null +++ b/docs/naming-conventions.rst @@ -0,0 +1,100 @@ +.. _naming-conventions: + +############################ + Dataset naming conventions +############################ + +A dataset name is a string used to identify a dataset. It is designed to +be human readable and is *not* designed to be parsed and split into +parts. + +To ensure consistency, a dataset name should follow the following rules: + + - All lower case. + - Only letters and numbers and dashes ``-`` are allowed. 
+ - No underscore ``_`` and no dot ``.`` and no upper case letter and + no other special character (``@``, ``#``, ``*`` etc.). + +Additionally, a dataset name is built from different parts joined with +``-`` as follows (each part can contain additional ``-``): + +.. code:: + + purpose-content-source-resolution-start-year-end-year-frequency-version[-extra-str] + +.. note:: + + These are the current naming conventions for datasets in the Anemoi + registry. They will need to be updated and adapted as more datasets are + added. The part **purpose** is especially difficult to define for + some datasets and may be revisited. + +The tables below provide more details and some examples. + +.. list-table:: Dataset naming conventions + :widths: 20 80 + :header-rows: 1 + + - - Component + - Description + + - - **purpose** + + - Can be `aifs` because the data is used to train the AIFS model. + It is also sometimes `metno` for data from the Norwegian + Meteorological Institute. This definition may need to be + revisited. + + - - **content** + + - The content of the dataset CAN have four parts, such as: + *class-type-stream-expver* + + - **class**: od Operational archive (*class* is a MARS + keyword) + - **type**: an Analysis (*type* is a MARS keyword) + - **stream**: oper Atmospheric model (*stream* is a MARS + keyword) + - **expver**: 0001 (operational model) + + - - **source** + - mars (when data is from MARS), could be *opendap* or other. + + - - **resolution** + - o96 (could be : n320, 0p2 for 0.2 degree) + + - - **start-year** + - 1979 if the first validity time is in 1979. + + - - **end-year** + + - 2020 if the last validity time is in 2020. Notice that if the + dataset is from 18.04.2020 to 19.07.2020, the start-year and + end-year are both 2020. For instance in + aifs-od-an-oper-0001-mars-o96-2020-2020-6h-v5 + + - - **frequency** + - 1h (could be : 6h) + + - - **version** + + - This is the version of the content of the dataset, e.g. which + variables, levels, etc. 
This is not the version of the format. + There must be a "v" before the version number. The "v" is not + part of the version number. For instance ...-v5 is the fifth + version of the content of the dataset. + + - - **extra-str** + + - Experimental datasets can have additional text in the name. + This extra string can contain additional `-`. It provides + additional information about the content of dataset. + +.. list-table:: Examples + :widths: 100 + + - - aifs-od-an-oper-0001-mars-o96-1979-2022-1h-v5 + - - aifs-ea-an-oper-0001-mars-o96-1979-2022-6h-v6 + - - aifs-ea-an-enda-0001-mars-o96-1979-2022-6h-v6-recentered-on-oper + - - aifs-ea-an-oper-0001-mars-n320-1979-2022-6h-v4 + - - inca-an-oper-0001-gridefix-1km-2023-2024-10m-v1 diff --git a/src/anemoi/registry/entry/dataset.py b/src/anemoi/registry/entry/dataset.py index 6d1a2b6..cb945c4 100644 --- a/src/anemoi/registry/entry/dataset.py +++ b/src/anemoi/registry/entry/dataset.py @@ -12,6 +12,7 @@ import yaml from anemoi.datasets import open_dataset from anemoi.utils.humanize import when +from anemoi.utils.sanitise import sanitise from anemoi.registry import config from anemoi.registry.rest import RestItemList @@ -68,6 +69,10 @@ def build_location_path(self, platform, uri_pattern=None): return uri_pattern.format(name=self.key) def add_location(self, platform, path): + if not path.startswith("s3://"): + path = os.path.abspath(path) + path = os.path.normpath(path) + LOG.debug(f"Adding location to {platform}: {path}") self.patch([{"op": "add", "path": f"/locations/{platform}", "value": {"path": path}}]) return path @@ -146,27 +151,28 @@ def transfer(self, task, source_path, target, resume, threads): raise task.unregister() - def set_recipe(self, recipe): - # only for backward compatibility - # to support old datasets where the recipe was not stored in the metadata - # this is not needed for new datasets and will be removed in the future - if not isinstance(recipe, dict): - if not os.path.exists(recipe): - raise 
FileNotFoundError(f"Recipe file not found: {recipe}") - if not recipe.endswith(".yaml"): - LOG.warning("Recipe file extension is not .yaml") - with open(recipe) as f: - recipe = yaml.safe_load(f) - assert isinstance(recipe, dict), f"Recipe must be a dictionary, got {type(recipe)}" - # end of backward compatibility - - self.patch([{"op": "add", "path": "/recipe", "value": recipe}]) + def _file_or_dict(self, file): + if isinstance(file, dict): + return file + if not file.endswith(".yaml"): + LOG.warning("Recipe file extension is not .yaml") + with open(file) as f: + return yaml.safe_load(f) + + def set_recipe(self, file): + recipe = self._file_or_dict(file) + self.patch([{"op": "add", "path": "/metadata/recipe", "value": sanitise(recipe)}]) + + def set_variables_metadata(self, file): + variables_metadata = self._file_or_dict(file) + self.patch([{"op": "add", "path": "/metadata/variables_metadata", "value": variables_metadata}]) def load_from_path(self, path): import zarr if not path.startswith("/") and not path.startswith("s3://"): LOG.warning(f"Dataset path is not absolute: {path}") + path = os.path.abspath(path) if not os.path.exists(path) and not path.startswith("s3://"): raise ValueError(f"Dataset path does not exist: {path}") if not path.endswith(".zarr") or path.endswith(".zip"): diff --git a/src/anemoi/registry/rest.py b/src/anemoi/registry/rest.py index e1d6ff0..00a4aa2 100644 --- a/src/anemoi/registry/rest.py +++ b/src/anemoi/registry/rest.py @@ -143,7 +143,7 @@ def raise_for_status(self, r, errors={}): exception_handler = errors.get(e.response.status_code) errcode = e.response.status_code - LOG.debug("HTTP error: ", errcode, exception_handler) + LOG.debug("HTTP error: %s %s", errcode, exception_handler) if exception_handler: raise exception_handler(e) else: