Commit f2f3ffb

update metadata ingestion with README

earthpulse committed Mar 14, 2024
1 parent 57af761 commit f2f3ffb
Showing 21 changed files with 234 additions and 92 deletions.
3 changes: 3 additions & 0 deletions api/api/routers/datasets/update_dataset.py
@@ -33,6 +33,7 @@ class UpdateBody(BaseModel):
authors: Optional[List[str]] = None
source: Optional[str] = None
license: Optional[str] = None
thumbnail: Optional[str] = None


@router.put(
@@ -51,6 +52,7 @@ def update(
- authors: the author or authors of the dataset.
- license: the license of the dataset.
- source: the source of the dataset.
- thumbnail: the thumbnail of the dataset.
"""
try:
return update_dataset(
@@ -62,6 +64,7 @@ def update(
body.license,
body.tags,
body.description,
body.thumbnail,
)
except Exception as e:
logger.exception("datasets:ingest")
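The router change is additive: thumbnail is optional on the request body, so existing clients are unaffected. A minimal sketch of exercising the updated endpoint; the base URL, the /datasets/{dataset_id} path, and bearer-token auth are assumptions not shown in this hunk:

import requests

API_URL = "http://localhost:8000"  # assumed base URL
TOKEN = "<your-api-token>"         # assumed auth scheme

# Only the fields to change need to be sent; omitted fields keep their values.
body = {"thumbnail": "https://example.com/preview.png"}
resp = requests.put(
    f"{API_URL}/datasets/<dataset-id>",  # hypothetical dataset id
    json=body,
    headers={"Authorization": f"Bearer {TOKEN}"},
)
resp.raise_for_status()
print(resp.json())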
72 changes: 14 additions & 58 deletions api/api/routers/migrate.py
@@ -21,67 +21,23 @@
@router.get("", include_in_schema=False)
def migrate_db(isAdmin: bool = Depends(admin_key_auth)):
# return "Done"

db = get_db()
collections = db.list_collection_names()
# s3 = get_client()
# boto = get_boto3_client() # Boto3Repo()
# create a backup of the changed collections
collection_name = "users-bck"
if collection_name not in collections:
db[collection_name].insert_many(db["users"].find())
# update users
for user in db["users"].find():
if "id" not in user:
db["users"].update_one(
{"_id": user["_id"]}, {"$set": {"id": str(user["_id"])}}
)

# update datasets
# - create files
# - create version
# for dataset in db["datasets"].find():
# if (
# "size" not in dataset
# ): # copy large files takes a while and api timesout, make sure this can be run multiple times
# continue
# size = dataset["size"]
# dataset_id = dataset["id"]
# if dataset["quality"] == 0:
# files = []
# for f in dataset["files"]:
# files.append(
# File(
# name=f["name"],
# size=f["size"],
# checksum=f["checksum"],
# version=1,
# versions=[1],
# )
# )
# new_object_name = f"{dataset_id}/{f['name']}_1"
# current_name = f"{dataset_id}/{f['name']}"
# if size < 1024 * 1024 * 5:
# # minio errors when copying files larger than 5GB
# s3.copy_object(
# bucket, new_object_name, CopySource(bucket, current_name)
# )
# else:
# config = TransferConfig(multipart_threshold=5 * 1024 * 1024) # 5Mb
# copy_source = {"Bucket": bucket, "Key": current_name}
# boto.copy(copy_source, bucket, new_object_name, Config=config)
# files_id = ObjectId()
# data = Files(id=str(files_id), dataset=dataset_id, files=files).model_dump()
# data["_id"] = files_id
# db["files"].insert_one(data)
# dataset["files"] = str(files_id)
# version = Version(version_id=1, size=size).model_dump()
# dataset["versions"] = [version]
# del dataset["size"]
# updated_dataset = (
# Dataset(**dataset).model_dump()
# if dataset["quality"] == 0
# else STACDataset(**dataset).model_dump()
# )
# updated_dataset["_id"] = dataset["_id"]
# db["datasets"].delete_one({"_id": dataset["_id"]})
# db["datasets"].insert_one(updated_dataset)
collection_name = "datasets-bck"
if collection_name not in collections:
db[collection_name].insert_many(db["datasets"].find())
for dataset in db["datasets"].find():
db["datasets"].update_one({"_id": dataset["_id"]}, {"$set": {"thumbnail": ""}})
# update models
collection_name = "models-bck"
if collection_name not in collections:
db[collection_name].insert_many(db["models"].find())
for model in db["models"].find():
db["models"].update_one({"_id": model["_id"]}, {"$set": {"thumbnail": ""}})

return "Done"
3 changes: 3 additions & 0 deletions api/api/routers/models/update_model.py
@@ -32,6 +32,7 @@ class UpdateBody(BaseModel):
authors: Optional[List[str]] = None
source: Optional[str] = None
license: Optional[str] = None
thumbnail: Optional[str] = None


@router.put("/{model_id}", summary="Update a model", responses=update_model_responses)
@@ -48,6 +49,7 @@ def update(
- authors: the author or authors of the model.
- license: the license of the model.
- source: the source of the model.
- thumbnail: the thumbnail of the model.
"""
try:
return update_model(
@@ -59,6 +61,7 @@
body.license,
body.tags,
body.description,
body.thumbnail,
)
except Exception as e:
logger.exception("models:ingest")
1 change: 1 addition & 0 deletions api/api/src/models/model.py
@@ -16,6 +16,7 @@ class Model(BaseModel):
files: str
versions: List[Version] = []
description: str = ""
thumbnail: str = ""
tags: List[str] = []
createdAt: datetime = datetime.now()
updatedAt: datetime = datetime.now()
1 change: 1 addition & 0 deletions api/api/src/usecases/datasets/delete_dataset.py
@@ -5,6 +5,7 @@
def delete_dataset(name):
db_repo, files_repo, os_repo = DatasetsDBRepo(), FilesDBRepo(), OSRepo()
dataset = retrieve_dataset_by_name(name)
# BUG: this throws an error if dataset has no files
for file in files_repo.retrieve_files(dataset.files)[0]["files"]:
os_repo.delete(dataset.id, f"{file['name']}_{file['version']}")
db_repo.delete_files(dataset.files)
5 changes: 4 additions & 1 deletion api/api/src/usecases/datasets/update_dataset.py
@@ -26,7 +26,9 @@ def toggle_like_dataset(dataset_id, user):
return "done"


def update_dataset(dataset_id, user, name, authors, source, license, tags, description):
def update_dataset(
dataset_id, user, name, authors, source, license, tags, description, thumbnail
):
dataset = retrieve_owned_dataset(dataset_id, user.uid)
# validate name
if name:
@@ -57,6 +59,7 @@ def update_dataset(dataset_id, user, name, authors, source, license, tags, descr
authors=authors if authors is not None else dataset.authors,
source=source if source is not None else dataset.source,
license=license if license is not None else dataset.license,
thumbnail=thumbnail if thumbnail is not None else dataset.thumbnail,
)
updated_dataset = Dataset(**data) if data["quality"] == 0 else STACDataset(**data)
# update dataset in db
5 changes: 4 additions & 1 deletion api/api/src/usecases/models/update_model.py
@@ -19,7 +19,9 @@ def toggle_like_model(model_id, user):
return "done"


def update_model(model_id, user, name, authors, source, license, tags, description):
def update_model(
model_id, user, name, authors, source, license, tags, description, thumbnail
):
model = retrieve_owned_model(model_id, user.uid)
# validate name
if name:
@@ -50,6 +52,7 @@ def update_model(model_id, user, name, authors, source, license, tags, descripti
authors=authors if authors is not None else model.authors,
source=source if source is not None else model.source,
license=license if license is not None else model.license,
thumbnail=thumbnail if thumbnail is not None else model.thumbnail,
)
updated_model = Model(**data)
# update model in db
13 changes: 13 additions & 0 deletions eotdl/eotdl/cli.py
@@ -1,5 +1,8 @@
import typer
import os

from .commands import auth, datasets, models
from .repos import APIRepo
from . import __version__

app = typer.Typer(help="Welcome to EOTDL. Learn more at https://www.eotdl.com/")
@@ -17,5 +20,15 @@ def version():
typer.echo(f"EOTDL Version: {__version__}")


@app.command()
def api():
"""
Get EOTDL API URL and info.
"""
repo = APIRepo()
typer.echo(f"EOTDL API URL: {repo.url}")
typer.echo(repo.get_info())


if __name__ == "__main__":
app()
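A usage sketch for the new command; the printed URL depends on the configured APIRepo, and the shape of repo.get_info() is not shown in this diff:

$ eotdl api
EOTDL API URL: https://api.eotdl.com/
{...}  # whatever get_info() returns for your deployment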
15 changes: 14 additions & 1 deletion eotdl/eotdl/commands/models.py
@@ -45,6 +45,18 @@ def ingest(
"--verbose",
help="Verbose output. This will print the progress of the ingestion",
),
force_metadata_update: bool = typer.Option(
False,
"--force-metadata-update",
"-f",
help="Force metadata update even if it already exists. Will overwrite the current metadata in EOTDL",
),
sync_metadata: bool = typer.Option(
False,
"--sync-metadata",
"-s",
help="Sync local metadata with the EOTDL. Will overwrite the local metadata",
),
):
"""
Ingest a model to the EOTDL.
@@ -63,6 +75,7 @@
- authors: the author or authors of the model\n
- license: the license of the model\n
- source: the source of the model\n
- thumbnail: an image to use as the thumbnail of the model on the website\n
\n
If using --verbose, it will print the progress of the ingestion.
\n\n
@@ -71,7 +84,7 @@
$ eotdl models ingest --path /path/to/folder-with-model --verbose True
"""
try:
ingest_model(path, verbose, typer.echo)
ingest_model(path, verbose, typer.echo, force_metadata_update, sync_metadata)
except Exception as e:
typer.echo(e)

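Following the docstring's example style, the new flags are exercised like this (paths hypothetical):

$ eotdl models ingest --path /path/to/folder-with-model --force-metadata-update
$ eotdl models ingest --path /path/to/folder-with-model --sync-metadata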
2 changes: 1 addition & 1 deletion eotdl/eotdl/datasets/download.py
@@ -96,7 +96,7 @@ def download_dataset(
logger("To download assets, set assets=True or -a in the CLI.")
if verbose:
logger("Generating README.md ...")
generate_metadata(download_path, dataset)
generate_metadata(download_path, dataset)
if verbose:
logger("Done")
return download_path
26 changes: 13 additions & 13 deletions eotdl/eotdl/datasets/ingest.py
@@ -57,30 +57,30 @@ def ingest_folder(
):
repo = DatasetsAPIRepo()
try:
readme = frontmatter.load(folder.joinpath("README.md"))
metadata, content = readme.metadata, readme.content
metadata = Metadata(**metadata)
except FileNotFoundError:
# load metadata (legacy)
metadata = (
yaml.safe_load(open(folder.joinpath("metadata.yml"), "r").read()) or {}
)
metadata = Metadata(**metadata)
content = None
except FileNotFoundError:
readme = frontmatter.load(folder.joinpath("README.md"))
metadata, content = readme.metadata, readme.content
metadata = Metadata(**metadata)
except Exception as e:
raise Exception("Error loading metadata: " + str(e))
# retrieve dataset (create if doesn't exist)
dataset = retrieve_dataset(metadata, user)
if content:
content = markdown.markdown(content)
print(content)
return
update_metadata = check_metadata(
dataset, metadata, content, force_metadata_update, sync_metadata, folder
)
if content and update_metadata:
update_dataset(dataset["id"], content, user)
# ingest files
update_metadata = True
if "description" in dataset:
# do not do this if the dataset is new, only if it already exists
update_metadata = check_metadata(
dataset, metadata, content, force_metadata_update, sync_metadata, folder
)
if update_metadata:
update_dataset(dataset["id"], metadata, content, user)
return ingest_files(
repo, dataset["id"], folder, verbose, logger, user, endpoint="datasets"
)
@@ -99,7 +99,7 @@ def check_metadata(
):
if not force_metadata_update and not sync_metadata:
raise Exception(
"The provided metadata is not consistent with the current metadata. Use -m to force metadata update or -s to sync your local metadata."
"The provided metadata is not consistent with the current metadata. Use -f to force metadata update or -s to sync your local metadata."
)
if force_metadata_update:
return True
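The decision logic behind those flags, condensed: if local and remote metadata disagree and neither flag is set, ingestion aborts; --force-metadata-update pushes the local README over the remote record, while --sync-metadata regenerates the local README from the remote record. A minimal sketch of that branch; the full body of check_metadata is cut off above, so the sync path shown here is an assumption:

def resolve_metadata_conflict(force_metadata_update, sync_metadata):
    """Return True if the remote metadata should be overwritten."""
    if not force_metadata_update and not sync_metadata:
        raise Exception(
            "The provided metadata is not consistent with the current metadata. "
            "Use -f to force metadata update or -s to sync your local metadata."
        )
    if force_metadata_update:
        return True  # push local README metadata to EOTDL
    # sync_metadata: regenerate the local README from EOTDL instead (assumed)
    return False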
4 changes: 4 additions & 0 deletions eotdl/eotdl/datasets/metadata.py
@@ -1,6 +1,7 @@
from pydantic import BaseModel, validator
from typing import List
import markdownify
from pathlib import Path


class Metadata(BaseModel):
@@ -37,4 +38,7 @@ def generate_metadata(download_path, dataset):
f.write(f" - {author}\n")
f.write("---\n")
f.write(markdownify.markdownify(dataset["description"], heading_style="ATX"))
# remove metadata.yml if exists
if Path(download_path + "/metadata.yml").exists():
Path(download_path + "/metadata.yml").unlink()
return download_path + "/README.md"
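generate_metadata now leaves a single source of truth: a README.md whose YAML frontmatter carries the metadata and whose body is the description converted back from HTML, while any legacy metadata.yml is removed. An illustrative output; the values are hypothetical and the exact field set is inferred from the writes above and the Metadata model:

---
name: my-dataset
license: free
source: https://example.com/my-dataset
thumbnail: https://example.com/preview.png
authors:
  - Jane Doe
---
# My Dataset

The description stored in EOTDL, converted to Markdown by markdownify.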
12 changes: 10 additions & 2 deletions eotdl/eotdl/datasets/update.py
@@ -1,9 +1,17 @@
from ..repos import DatasetsAPIRepo


def update_dataset(dataset_id, content, user):
def update_dataset(dataset_id, metadata, content, user):
repo = DatasetsAPIRepo()
data, error = repo.update_dataset(dataset_id, content, user)
data, error = repo.update_dataset(
dataset_id,
metadata.authors,
metadata.source,
metadata.license,
metadata.thumbnail,
content,
user,
)
if error:
raise Exception(error)
return data
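Client-side, the new signature is driven from a local README's frontmatter, mirroring the loading code in datasets/ingest.py. A hedged sketch, assuming the package exposes these modules as eotdl.datasets.* and that the user object from the auth layer is passed through unchanged:

import frontmatter

from eotdl.datasets.metadata import Metadata
from eotdl.datasets.update import update_dataset


def push_readme(dataset_id, user, path="README.md"):
    # parse frontmatter and body; Metadata validates authors/source/license/thumbnail
    readme = frontmatter.load(path)
    metadata = Metadata(**readme.metadata)
    return update_dataset(dataset_id, metadata, readme.content, user)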
2 changes: 1 addition & 1 deletion eotdl/eotdl/files/ingest.py
@@ -134,7 +134,7 @@ def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpo
parts,
endpoint,
)
files_repo.complete_upload(user, upload_id, version, endpoint)
data, error = files_repo.complete_upload(user, upload_id, version, endpoint)
# ingest new small files in batches
if len(upload_files) > 0:
logger("generating batches...")
10 changes: 7 additions & 3 deletions eotdl/eotdl/models/download.py
@@ -6,6 +6,7 @@
from .retrieve import retrieve_model, retrieve_model_files
from ..shared import calculate_checksum
from ..repos import FilesAPIRepo
from .metadata import generate_metadata


@with_auth
@@ -75,9 +76,6 @@ def download_model(
)
# if calculate_checksum(dst_path) != checksum:
# logger(f"Checksum for {file} does not match")
if verbose:
logger("Done")
return "/".join(dst_path.split("/")[:-1])
else:
raise NotImplementedError("Downloading a STAC model is not implemented")
# logger("Downloading STAC metadata...")
@@ -108,3 +106,9 @@
# else:
# logger("To download assets, set assets=True or -a in the CLI.")
# return Outputs(dst_path=path)
if verbose:
logger("Generating README.md ...")
generate_metadata(download_path, model)
if verbose:
logger("Done")
return download_path