From 4e04d26f398780d097eab4920176df96ee0295c6 Mon Sep 17 00:00:00 2001 From: Florian Pinault Date: Thu, 26 Sep 2024 17:08:15 +0000 Subject: [PATCH 1/3] support for "anemoi-datasets publish" --- src/anemoi/registry/__init__.py | 4 +++ src/anemoi/registry/entry/__init__.py | 5 ++- src/anemoi/registry/entry/dataset.py | 44 +++++++++++++++++++++------ 3 files changed, 43 insertions(+), 10 deletions(-) diff --git a/src/anemoi/registry/__init__.py b/src/anemoi/registry/__init__.py index 763ed82..5e73073 100644 --- a/src/anemoi/registry/__init__.py +++ b/src/anemoi/registry/__init__.py @@ -22,6 +22,10 @@ def config(): return config.get("registry") +def publish_dataset(*args, **kwargs): + return Dataset.publish(*args, **kwargs) + + from .entry.dataset import DatasetCatalogueEntry as Dataset from .entry.dataset import DatasetCatalogueEntryList as DatasetsList from .entry.experiment import ExperimentCatalogueEntry as Experiment diff --git a/src/anemoi/registry/entry/__init__.py b/src/anemoi/registry/entry/__init__.py index 7c2b17d..5dc7782 100644 --- a/src/anemoi/registry/entry/__init__.py +++ b/src/anemoi/registry/entry/__init__.py @@ -36,6 +36,9 @@ class CatalogueEntry: def url(self): return f"{config()['web_url']}/{self.collection}/{self.key}" + def load_from_path(self, path): + raise NotImplementedError("Subclasses must implement this method") + def __init__(self, key=None, path=None, must_exist=True): assert key is not None or path is not None, "key or path must be provided" assert key is None or path is None, "key and path are mutually exclusive" @@ -88,7 +91,7 @@ def register(self, overwrite=False, ignore_existing=True): return self.rest_collection.post(self.record) except AlreadyExists: if overwrite is True: - LOG.warning(f"{self.key} already exists. Deleting existing one to overwrite it.") + LOG.warning(f"{self.key} already exists. Overwriting existing one.") return self.rest_item.put(self.record) if ignore_existing: LOG.info(f"{self.key} already exists. Ok.") diff --git a/src/anemoi/registry/entry/dataset.py b/src/anemoi/registry/entry/dataset.py index ae4b7e3..9a5628d 100644 --- a/src/anemoi/registry/entry/dataset.py +++ b/src/anemoi/registry/entry/dataset.py @@ -36,6 +36,25 @@ class DatasetCatalogueEntry(CatalogueEntry): collection = COLLECTION main_key = "name" + @classmethod + def publish(cls, path): + PLATFORM = config()["datasets_platform"] + STATUS = "experimental" + + entry = DatasetCatalogueEntry(path=path) + entry.register() + entry.set_status(STATUS) + + recipe = entry.record["metadata"].get("recipe", {}) + if recipe: + entry.set_recipe(recipe) + else: + LOG.warning("No recipe found in metadata.") + + target = entry.build_location_path(PLATFORM) + entry.upload(path, target, platform=PLATFORM) + entry.add_location(PLATFORM, target) + def set_status(self, status): self.rest_item.patch([{"op": "add", "path": "/status", "value": status}]) @@ -127,13 +146,20 @@ def transfer(self, task, source_path, target, resume, threads): raise task.unregister() - def set_recipe(self, file): - if not os.path.exists(file): - raise FileNotFoundError(f"Recipe file not found: {file}") - if not file.endswith(".yaml"): - LOG.warning("Recipe file extension is not .yaml") - with open(file) as f: - recipe = yaml.safe_load(f) + def set_recipe(self, recipe): + # only for backward compatibility + # to support old datasets where the recipe was not stored in the metadata + # this is not needed for new datasets and will be removed in the future + if not isinstance(recipe, dict): + if not os.path.exists(recipe): + raise FileNotFoundError(f"Recipe file not found: {recipe}") + if not recipe.endswith(".yaml"): + LOG.warning("Recipe file extension is not .yaml") + with open(recipe) as f: + recipe = yaml.safe_load(f) + assert isinstance(recipe, dict), f"Recipe must be a dictionary, got {type(recipe)}" + # end of backward compatibility + self.rest_item.patch([{"op": "add", "path": "/recipe", "value": recipe}]) def load_from_path(self, path): @@ -142,9 +168,9 @@ def load_from_path(self, path): if not path.startswith("/") and not path.startswith("s3://"): LOG.warning(f"Dataset path is not absolute: {path}") if not os.path.exists(path) and not path.startswith("s3://"): - LOG.warning(f"Dataset path does not exist: {path}") + raise ValueError(f"Dataset path does not exist: {path}") if not path.endswith(".zarr") or path.endswith(".zip"): - LOG.warning("Dataset path extension is neither .zarr nor .zip") + raise ValueError(f"Dataset path extension is not supported ({path})") name, _ = os.path.splitext(os.path.basename(path)) From cdd50bf4ff0735a7239a3fb64a45c88b1890820d Mon Sep 17 00:00:00 2001 From: Florian Pinault Date: Fri, 27 Sep 2024 15:16:37 +0000 Subject: [PATCH 2/3] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c0c2cd4..bd9233f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,7 @@ Keep it human-readable, your future self will thank you! ### Added - CI workflows to check for updates in the changelog and the documentation. +- Support for "anemoi-datasets publish" ### Changed - Replaces the deploy workflow with cd-pypi From be24673a9df6320f4ab8a3bc918ea309313682b8 Mon Sep 17 00:00:00 2001 From: Florian Pinault Date: Fri, 11 Oct 2024 15:50:58 +0200 Subject: [PATCH 3/3] Update dataset.py --- src/anemoi/registry/entry/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/anemoi/registry/entry/dataset.py b/src/anemoi/registry/entry/dataset.py index 103fbf9..6d1a2b6 100644 --- a/src/anemoi/registry/entry/dataset.py +++ b/src/anemoi/registry/entry/dataset.py @@ -160,7 +160,7 @@ def set_recipe(self, recipe): assert isinstance(recipe, dict), f"Recipe must be a dictionary, got {type(recipe)}" # end of backward compatibility - self..patch([{"op": "add", "path": "/recipe", "value": recipe}]) + self.patch([{"op": "add", "path": "/recipe", "value": recipe}]) def load_from_path(self, path): import zarr