Merge pull request #7 from ecmwf/feature/datasets-publish
support for "anemoi-datasets publish"
floriankrb authored Oct 11, 2024
2 parents 4ff8b9a + be24673 commit dacd9e1
Showing 4 changed files with 45 additions and 10 deletions.
CHANGELOG.md (2 changes: 2 additions & 0 deletions)
@@ -14,8 +14,10 @@ Keep it human-readable, your future self will thank you!

- Add anemoi-transform link to documentation
- CI workflows to check for updates in the changelog and the documentation.
- Support for "anemoi-datasets publish"
- Added set from file (python only)


### Changed
- Replaces the deploy workflow with cd-pypi

src/anemoi/registry/__init__.py (4 changes: 4 additions & 0 deletions)
@@ -22,6 +22,10 @@ def config():
    return config.get("registry")


def publish_dataset(*args, **kwargs):
    return Dataset.publish(*args, **kwargs)


from .entry.dataset import DatasetCatalogueEntry as Dataset
from .entry.dataset import DatasetCatalogueEntryList as DatasetsList
from .entry.experiment import ExperimentCatalogueEntry as Experiment
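For orientation, a minimal usage sketch of the new module-level helper (the path is illustrative and not taken from this change; a configured registry is assumed):

# Usage sketch: publish a local dataset through the new helper.
from anemoi.registry import publish_dataset

# Forwards to DatasetCatalogueEntry.publish(): registers the catalogue entry,
# sets its status, attaches the recipe if present, uploads the data and
# records the new location on the configured platform.
publish_dataset("/data/example-dataset.zarr")  # illustrative path
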
src/anemoi/registry/entry/__init__.py (5 changes: 4 additions & 1 deletion)
@@ -37,6 +37,9 @@ class CatalogueEntry:
    def url(self):
        return f"{config()['web_url']}/{self.collection}/{self.key}"

    def load_from_path(self, path):
        raise NotImplementedError("Subclasses must implement this method")

    def __init__(self, key=None, path=None, must_exist=True):
        assert key is not None or path is not None, "key or path must be provided"
        assert key is None or path is None, "key and path are mutually exclusive"
@@ -89,7 +92,7 @@ def register(self, overwrite=False, ignore_existing=True):
            return self.rest_collection.post(self.record)
        except AlreadyExists:
            if overwrite is True:
                LOG.warning(f"{self.key} already exists. Deleting existing one to overwrite it.")
                LOG.warning(f"{self.key} already exists. Overwriting existing one.")
                return self.rest_item.put(self.record)
            if ignore_existing:
                LOG.info(f"{self.key} already exists. Ok.")
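As a side note, a small sketch of how the register() flags above behave when the entry already exists in the catalogue (the construction below is illustrative; Dataset is the alias exported from anemoi.registry):

# Illustrative only: register() behaviour for an already-existing record.
from anemoi.registry import Dataset

entry = Dataset(path="/data/example-dataset.zarr")  # illustrative path

entry.register()                # existing record: AlreadyExists is caught and logged ("... already exists. Ok.")
entry.register(overwrite=True)  # existing record: overwritten with a PUT, nothing is deleted first
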
src/anemoi/registry/entry/dataset.py (44 changes: 35 additions & 9 deletions)
@@ -36,6 +36,25 @@ class DatasetCatalogueEntry(CatalogueEntry):
    collection = COLLECTION
    main_key = "name"

    @classmethod
    def publish(cls, path):
        PLATFORM = config()["datasets_platform"]
        STATUS = "experimental"

        entry = DatasetCatalogueEntry(path=path)
        entry.register()
        entry.set_status(STATUS)

        recipe = entry.record["metadata"].get("recipe", {})
        if recipe:
            entry.set_recipe(recipe)
        else:
            LOG.warning("No recipe found in metadata.")

        target = entry.build_location_path(PLATFORM)
        entry.upload(path, target, platform=PLATFORM)
        entry.add_location(PLATFORM, target)

    def set_status(self, status):
        self.patch([{"op": "add", "path": "/status", "value": status}])

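For readers who need to reproduce one of these steps by hand, the new classmethod amounts to the sequence below (the path is illustrative; "datasets_platform" must be set in the registry configuration):

# Step-by-step equivalent of DatasetCatalogueEntry.publish(path).
from anemoi.registry import config
from anemoi.registry.entry.dataset import DatasetCatalogueEntry

path = "/data/example-dataset.zarr"  # illustrative path
platform = config()["datasets_platform"]

entry = DatasetCatalogueEntry(path=path)
entry.register()                  # create (or reuse) the catalogue record
entry.set_status("experimental")  # JSON Patch: add /status

recipe = entry.record["metadata"].get("recipe", {})
if recipe:
    entry.set_recipe(recipe)      # JSON Patch: add /recipe

target = entry.build_location_path(platform)
entry.upload(path, target, platform=platform)
entry.add_location(platform, target)
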
@@ -127,13 +146,20 @@ def transfer(self, task, source_path, target, resume, threads):
            raise
        task.unregister()

    def set_recipe(self, file):
        if not os.path.exists(file):
            raise FileNotFoundError(f"Recipe file not found: {file}")
        if not file.endswith(".yaml"):
            LOG.warning("Recipe file extension is not .yaml")
        with open(file) as f:
            recipe = yaml.safe_load(f)
    def set_recipe(self, recipe):
        # only for backward compatibility
        # to support old datasets where the recipe was not stored in the metadata
        # this is not needed for new datasets and will be removed in the future
        if not isinstance(recipe, dict):
            if not os.path.exists(recipe):
                raise FileNotFoundError(f"Recipe file not found: {recipe}")
            if not recipe.endswith(".yaml"):
                LOG.warning("Recipe file extension is not .yaml")
            with open(recipe) as f:
                recipe = yaml.safe_load(f)
        assert isinstance(recipe, dict), f"Recipe must be a dictionary, got {type(recipe)}"
        # end of backward compatibility

        self.patch([{"op": "add", "path": "/recipe", "value": recipe}])

    def load_from_path(self, path):
@@ -142,9 +168,9 @@ def load_from_path(self, path):
        if not path.startswith("/") and not path.startswith("s3://"):
            LOG.warning(f"Dataset path is not absolute: {path}")
        if not os.path.exists(path) and not path.startswith("s3://"):
            LOG.warning(f"Dataset path does not exist: {path}")
            raise ValueError(f"Dataset path does not exist: {path}")
        if not path.endswith(".zarr") or path.endswith(".zip"):
            LOG.warning("Dataset path extension is neither .zarr nor .zip")
            raise ValueError(f"Dataset path extension is not supported ({path})")

        name, _ = os.path.splitext(os.path.basename(path))

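Finally, a short sketch of the two forms set_recipe() now accepts: a recipe dictionary (the normal case, taken from the dataset metadata) and, for backward compatibility, a path to a YAML file (both values are illustrative; entry is a registered DatasetCatalogueEntry as in the sketch above):

# Illustrative values only.
entry.set_recipe({"dates": {"start": "2020-01-01", "end": "2020-12-31"}})  # dict: patched onto /recipe as-is

entry.set_recipe("recipe.yaml")  # str: the YAML file is read and its content patched onto /recipe

Note that the tightened checks in load_from_path() mean a missing dataset path or an unsupported extension now raises ValueError instead of only logging a warning.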
