Commit f2f3ffb

update metadata ingestion with README

earthpulse committed Mar 14, 2024
1 parent 57af761 commit f2f3ffb
Showing 21 changed files with 234 additions and 92 deletions.
3 changes: 3 additions & 0 deletions api/api/routers/datasets/update_dataset.py
@@ -33,6 +33,7 @@ class UpdateBody(BaseModel):
authors: Optional[List[str]] = None
source: Optional[str] = None
license: Optional[str] = None
thumbnail: Optional[str] = None


@router.put(
@@ -51,6 +52,7 @@ def update(
- authors: the author or authors of the dataset.
- license: the license of the dataset.
- source: the source of the dataset.
- thumbnail: the thumbnail of the dataset.
"""
try:
return update_dataset(
@@ -62,6 +64,7 @@ def update(
body.license,
body.tags,
body.description,
body.thumbnail,
)
except Exception as e:
logger.exception("datasets:ingest")
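The router change is additive: thumbnail is optional on the request body, so existing clients are unaffected. A minimal sketch of exercising the updated endpoint; the base URL, the /datasets/{dataset_id} path, and bearer-token auth are assumptions not shown in this hunk:

import requests

API_URL = "http://localhost:8000"  # assumed base URL
TOKEN = "<your-api-token>"         # assumed auth scheme

# Only the fields to change need to be sent; omitted fields keep their values.
body = {"thumbnail": "https://example.com/preview.png"}
resp = requests.put(
    f"{API_URL}/datasets/<dataset-id>",  # hypothetical dataset id
    json=body,
    headers={"Authorization": f"Bearer {TOKEN}"},
)
resp.raise_for_status()
print(resp.json())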
72 changes: 14 additions & 58 deletions api/api/routers/migrate.py
@@ -21,67 +21,23 @@
@router.get("", include_in_schema=False)
def migrate_db(isAdmin: bool = Depends(admin_key_auth)):
# return "Done"

db = get_db()
collections = db.list_collection_names()
# s3 = get_client()
# boto = get_boto3_client() # Boto3Repo()
# create a backup of the changed collections
collection_name = "users-bck"
if collection_name not in collections:
db[collection_name].insert_many(db["users"].find())
# update users
for user in db["users"].find():
if "id" not in user:
db["users"].update_one(
{"_id": user["_id"]}, {"$set": {"id": str(user["_id"])}}
)

# update datasets
# - create files
# - create version
# for dataset in db["datasets"].find():
# if (
# "size" not in dataset
# ): # copy large files takes a while and api timesout, make sure this can be run multiple times
# continue
# size = dataset["size"]
# dataset_id = dataset["id"]
# if dataset["quality"] == 0:
# files = []
# for f in dataset["files"]:
# files.append(
# File(
# name=f["name"],
# size=f["size"],
# checksum=f["checksum"],
# version=1,
# versions=[1],
# )
# )
# new_object_name = f"{dataset_id}/{f['name']}_1"
# current_name = f"{dataset_id}/{f['name']}"
# if size < 1024 * 1024 * 5:
# # minio errors when copying files larger than 5GB
# s3.copy_object(
# bucket, new_object_name, CopySource(bucket, current_name)
# )
# else:
# config = TransferConfig(multipart_threshold=5 * 1024 * 1024) # 5Mb
# copy_source = {"Bucket": bucket, "Key": current_name}
# boto.copy(copy_source, bucket, new_object_name, Config=config)
# files_id = ObjectId()
# data = Files(id=str(files_id), dataset=dataset_id, files=files).model_dump()
# data["_id"] = files_id
# db["files"].insert_one(data)
# dataset["files"] = str(files_id)
# version = Version(version_id=1, size=size).model_dump()
# dataset["versions"] = [version]
# del dataset["size"]
# updated_dataset = (
# Dataset(**dataset).model_dump()
# if dataset["quality"] == 0
# else STACDataset(**dataset).model_dump()
# )
# updated_dataset["_id"] = dataset["_id"]
# db["datasets"].delete_one({"_id": dataset["_id"]})
# db["datasets"].insert_one(updated_dataset)
collection_name = "datasets-bck"
if collection_name not in collections:
db[collection_name].insert_many(db["datasets"].find())
for dataset in db["datasets"].find():
db["datasets"].update_one({"_id": dataset["_id"]}, {"$set": {"thumbnail": ""}})
# update models
collection_name = "models-bck"
if collection_name not in collections:
db[collection_name].insert_many(db["models"].find())
for model in db["models"].find():
db["models"].update_one({"_id": model["_id"]}, {"$set": {"thumbnail": ""}})

return "Done"
3 changes: 3 additions & 0 deletions api/api/routers/models/update_model.py
@@ -32,6 +32,7 @@ class UpdateBody(BaseModel):
authors: Optional[List[str]] = None
source: Optional[str] = None
license: Optional[str] = None
thumbnail: Optional[str] = None


@router.put("/{model_id}", summary="Update a model", responses=update_model_responses)
@@ -48,6 +49,7 @@ def update(
- authors: the author or authors of the model.
- license: the license of the model.
- source: the source of the model.
- thumbnail: the thumbnail of the model.
"""
try:
return update_model(
@@ -59,6 +61,7 @@
body.license,
body.tags,
body.description,
body.thumbnail,
)
except Exception as e:
logger.exception("models:ingest")
1 change: 1 addition & 0 deletions api/api/src/models/model.py
@@ -16,6 +16,7 @@ class Model(BaseModel):
files: str
versions: List[Version] = []
description: str = ""
thumbnail: str = ""
tags: List[str] = []
createdAt: datetime = datetime.now()
updatedAt: datetime = datetime.now()
1 change: 1 addition & 0 deletions api/api/src/usecases/datasets/delete_dataset.py
@@ -5,6 +5,7 @@
def delete_dataset(name):
db_repo, files_repo, os_repo = DatasetsDBRepo(), FilesDBRepo(), OSRepo()
dataset = retrieve_dataset_by_name(name)
# BUG: this throws an error if dataset has no files
for file in files_repo.retrieve_files(dataset.files)[0]["files"]:
os_repo.delete(dataset.id, f"{file['name']}_{file['version']}")
db_repo.delete_files(dataset.files)
5 changes: 4 additions & 1 deletion api/api/src/usecases/datasets/update_dataset.py
@@ -26,7 +26,9 @@ def toggle_like_dataset(dataset_id, user):
return "done"


def update_dataset(dataset_id, user, name, authors, source, license, tags, description):
def update_dataset(
dataset_id, user, name, authors, source, license, tags, description, thumbnail
):
dataset = retrieve_owned_dataset(dataset_id, user.uid)
# validate name
if name:
@@ -57,6 +59,7 @@ def update_dataset(dataset_id, user, name, authors, source, license, tags, descr
authors=authors if authors is not None else dataset.authors,
source=source if source is not None else dataset.source,
license=license if license is not None else dataset.license,
thumbnail=thumbnail if thumbnail is not None else dataset.thumbnail,
)
updated_dataset = Dataset(**data) if data["quality"] == 0 else STACDataset(**data)
# update dataset in db
5 changes: 4 additions & 1 deletion api/api/src/usecases/models/update_model.py
@@ -19,7 +19,9 @@ def toggle_like_model(model_id, user):
return "done"


def update_model(model_id, user, name, authors, source, license, tags, description):
def update_model(
model_id, user, name, authors, source, license, tags, description, thumbnail
):
model = retrieve_owned_model(model_id, user.uid)
# validate name
if name:
@@ -50,6 +52,7 @@ def update_model(model_id, user, name, authors, source, license, tags, descripti
authors=authors if authors is not None else model.authors,
source=source if source is not None else model.source,
license=license if license is not None else model.license,
thumbnail=thumbnail if thumbnail is not None else model.thumbnail,
)
updated_model = Model(**data)
# update model in db
13 changes: 13 additions & 0 deletions eotdl/eotdl/cli.py
@@ -1,5 +1,8 @@
import typer
import os

from .commands import auth, datasets, models
from .repos import APIRepo
from . import __version__

app = typer.Typer(help="Welcome to EOTDL. Learn more at https://www.eotdl.com/")
@@ -17,5 +20,15 @@ def version():
typer.echo(f"EOTDL Version: {__version__}")


@app.command()
def api():
"""
Get EOTDL API URL and info.
"""
repo = APIRepo()
typer.echo(f"EOTDL API URL: {repo.url}")
typer.echo(repo.get_info())


if __name__ == "__main__":
app()
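A usage sketch for the new command; the printed URL depends on the configured APIRepo, and the shape of repo.get_info() is not shown in this diff:

$ eotdl api
EOTDL API URL: https://api.eotdl.com/
{...}  # whatever get_info() returns for your deployment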
15 changes: 14 additions & 1 deletion eotdl/eotdl/commands/models.py
@@ -45,6 +45,18 @@ def ingest(
"--verbose",
help="Verbose output. This will print the progress of the ingestion",
),
force_metadata_update: bool = typer.Option(
False,
"--force-metadata-update",
"-f",
help="Force metadata update even if it already exists. Will overwrite the current metadata in EOTDL",
),
sync_metadata: bool = typer.Option(
False,
"--sync-metadata",
"-s",
help="Sync local metadata with the EOTDL. Will overwrite the local metadata",
),
):
"""
Ingest a model to the EOTDL.
@@ -63,6 +75,7 @@
- authors: the author or authors of the model\n
- license: the license of the model\n
- source: the source of the model\n
- thumbnail: an image to use as the thumbnail of the model on the website\n
\n
If using --verbose, it will print the progress of the ingestion.
\n\n
@@ -71,7 +84,7 @@
$ eotdl models ingest --path /path/to/folder-with-model --verbose True
"""
try:
ingest_model(path, verbose, typer.echo)
ingest_model(path, verbose, typer.echo, force_metadata_update, sync_metadata)
except Exception as e:
typer.echo(e)

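Following the docstring's example style, the new flags are exercised like this (paths hypothetical):

$ eotdl models ingest --path /path/to/folder-with-model --force-metadata-update
$ eotdl models ingest --path /path/to/folder-with-model --sync-metadata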
2 changes: 1 addition & 1 deletion eotdl/eotdl/datasets/download.py
@@ -96,7 +96,7 @@ def download_dataset(
logger("To download assets, set assets=True or -a in the CLI.")
if verbose:
logger("Generating README.md ...")
generate_metadata(download_path, dataset)
generate_metadata(download_path, dataset)
if verbose:
logger("Done")
return download_path
26 changes: 13 additions & 13 deletions eotdl/eotdl/datasets/ingest.py
@@ -57,30 +57,30 @@ def ingest_folder(
):
repo = DatasetsAPIRepo()
try:
readme = frontmatter.load(folder.joinpath("README.md"))
metadata, content = readme.metadata, readme.content
metadata = Metadata(**metadata)
except FileNotFoundError:
# load metadata (legacy)
metadata = (
yaml.safe_load(open(folder.joinpath("metadata.yml"), "r").read()) or {}
)
metadata = Metadata(**metadata)
content = None
except FileNotFoundError:
readme = frontmatter.load(folder.joinpath("README.md"))
metadata, content = readme.metadata, readme.content
metadata = Metadata(**metadata)
except Exception as e:
raise Exception("Error loading metadata: " + str(e))
# retrieve dataset (create if doesn't exist)
dataset = retrieve_dataset(metadata, user)
if content:
content = markdown.markdown(content)
print(content)
return
update_metadata = check_metadata(
dataset, metadata, content, force_metadata_update, sync_metadata, folder
)
if content and update_metadata:
update_dataset(dataset["id"], content, user)
# ingest files
update_metadata = True
if "description" in dataset:
# do not do this if the dataset is new, only if it already exists
update_metadata = check_metadata(
dataset, metadata, content, force_metadata_update, sync_metadata, folder
)
if update_metadata:
update_dataset(dataset["id"], metadata, content, user)
return ingest_files(
repo, dataset["id"], folder, verbose, logger, user, endpoint="datasets"
)
@@ -99,7 +99,7 @@ def check_metadata(
):
if not force_metadata_update and not sync_metadata:
raise Exception(
"The provided metadata is not consistent with the current metadata. Use -m to force metadata update or -s to sync your local metadata."
"The provided metadata is not consistent with the current metadata. Use -f to force metadata update or -s to sync your local metadata."
)
if force_metadata_update:
return True
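The decision logic behind those flags, condensed: if local and remote metadata disagree and neither flag is set, ingestion aborts; --force-metadata-update pushes the local README over the remote record, while --sync-metadata regenerates the local README from the remote record. A minimal sketch of that branch; the full body of check_metadata is cut off above, so the sync path shown here is an assumption:

def resolve_metadata_conflict(force_metadata_update, sync_metadata):
    """Return True if the remote metadata should be overwritten."""
    if not force_metadata_update and not sync_metadata:
        raise Exception(
            "The provided metadata is not consistent with the current metadata. "
            "Use -f to force metadata update or -s to sync your local metadata."
        )
    if force_metadata_update:
        return True  # push local README metadata to EOTDL
    # sync_metadata: regenerate the local README from EOTDL instead (assumed)
    return False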
4 changes: 4 additions & 0 deletions eotdl/eotdl/datasets/metadata.py
@@ -1,6 +1,7 @@
from pydantic import BaseModel, validator
from typing import List
import markdownify
from pathlib import Path


class Metadata(BaseModel):
@@ -37,4 +38,7 @@ def generate_metadata(download_path, dataset):
f.write(f" - {author}\n")
f.write("---\n")
f.write(markdownify.markdownify(dataset["description"], heading_style="ATX"))
# remove metadata.yml if exists
if Path(download_path + "/metadata.yml").exists():
Path(download_path + "/metadata.yml").unlink()
return download_path + "/README.md"
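generate_metadata now leaves a single source of truth: a README.md whose YAML frontmatter carries the metadata and whose body is the description converted back from HTML, while any legacy metadata.yml is removed. An illustrative output; the values are hypothetical and the exact field set is inferred from the writes above and the Metadata model:

---
name: my-dataset
license: free
source: https://example.com/my-dataset
thumbnail: https://example.com/preview.png
authors:
  - Jane Doe
---
# My Dataset

The description stored in EOTDL, converted to Markdown by markdownify.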
12 changes: 10 additions & 2 deletions eotdl/eotdl/datasets/update.py
@@ -1,9 +1,17 @@
from ..repos import DatasetsAPIRepo


def update_dataset(dataset_id, content, user):
def update_dataset(dataset_id, metadata, content, user):
repo = DatasetsAPIRepo()
data, error = repo.update_dataset(dataset_id, content, user)
data, error = repo.update_dataset(
dataset_id,
metadata.authors,
metadata.source,
metadata.license,
metadata.thumbnail,
content,
user,
)
if error:
raise Exception(error)
return data
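Client-side, the new signature is driven from a local README's frontmatter, mirroring the loading code in datasets/ingest.py. A hedged sketch, assuming the package exposes these modules as eotdl.datasets.* and that the user object from the auth layer is passed through unchanged:

import frontmatter

from eotdl.datasets.metadata import Metadata
from eotdl.datasets.update import update_dataset


def push_readme(dataset_id, user, path="README.md"):
    # parse frontmatter and body; Metadata validates authors/source/license/thumbnail
    readme = frontmatter.load(path)
    metadata = Metadata(**readme.metadata)
    return update_dataset(dataset_id, metadata, readme.content, user)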
2 changes: 1 addition & 1 deletion eotdl/eotdl/files/ingest.py
@@ -134,7 +134,7 @@ def ingest_files(repo, dataset_or_model_id, folder, verbose, logger, user, endpo
parts,
endpoint,
)
files_repo.complete_upload(user, upload_id, version, endpoint)
data, error = files_repo.complete_upload(user, upload_id, version, endpoint)
# ingest new small files in batches
if len(upload_files) > 0:
logger("generating batches...")
10 changes: 7 additions & 3 deletions eotdl/eotdl/models/download.py
@@ -6,6 +6,7 @@
from .retrieve import retrieve_model, retrieve_model_files
from ..shared import calculate_checksum
from ..repos import FilesAPIRepo
from .metadata import generate_metadata


@with_auth
@@ -75,9 +76,6 @@ def download_model(
)
# if calculate_checksum(dst_path) != checksum:
# logger(f"Checksum for {file} does not match")
if verbose:
logger("Done")
return "/".join(dst_path.split("/")[:-1])
else:
raise NotImplementedError("Downloading a STAC model is not implemented")
# logger("Downloading STAC metadata...")
@@ -108,3 +106,9 @@
# else:
# logger("To download assets, set assets=True or -a in the CLI.")
# return Outputs(dst_path=path)
if verbose:
logger("Generating README.md ...")
generate_metadata(download_path, model)
if verbose:
logger("Done")
return download_path