1187 migrate collections and spaces as v2 metadata. (#1190)
* match other branch

* adding new collection metadata

* str not float

* getting collections for a dataset in v1, fixing metadata for collections

* posts collection name and id

* adding routes for getting collections

* making a method like the one in v1 for self and ancestors; it will be easier to build a collection hierarchy from this

* sample json for metadata

* posts collection name and id

* building the data for collections

* something works now

* matching with other branch

* methods for migrating collections as metadata

* need to post it as metadata

* change name

* adding the metadata for collections

* adding context url and right endpoint

* getting spaces as well as collections

* change name

* renaming method

* created v2 license based on v1 license details (#1193)

Co-authored-by: Chen Wang <[email protected]>

* removing print statements

* better error logging

---------

Co-authored-by: Dipannita <[email protected]>
Co-authored-by: Chen Wang <[email protected]>
3 people authored Sep 3, 2024
1 parent 2a52c7c commit 4e68f21
Showing 2 changed files with 339 additions and 1 deletion.
54 changes: 54 additions & 0 deletions scripts/metadata/definitions/collectionv1.json
@@ -0,0 +1,54 @@
{
"name" : "Collection",
"description" : "Collection information from v1",
"required_for_items": {
"datasets": false,
"files": false
},
"context" : [
{
"collection_name" : "https://schema.org/colname",
"collection_id" : "https://schema.org/colid",
"parent_collection_name": "https://schema.org/parentcolname",
"parent_collection_id": "https://schema.org/parentcolid"
}
],
"fields" : [
{
"name" : "collection_name",
"list" : false,
"widgetType": "TextField",
"config": {
"type" : "str"
},
"required" : false
},
{
"name" : "collection_id",
"list" : false,
"widgetType": "TextField",
"config": {
"type" : "str"
},
"required" : false
},
{
"name" : "parent_collection_name",
"list" : false,
"widgetType": "TextField",
"config": {
"type" : "str"
},
"required" : false
},
{
"name" : "parent_collection_id",
"list" : false,
"widgetType": "TextField",
"config": {
"type" : "str"
},
"required" : false
}
]
}
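
For orientation, here is a minimal illustration of the field values this "Collection" definition collects, written as a Python dict; the names and ids below are hypothetical and not drawn from any real Clowder instance:

# Hypothetical example values for the four string fields defined above.
example_collection_fields = {
    "collection_name": "Field Campaign 2019",
    "collection_id": "5d2a1b3c4e5f6a7b8c9d0e1f",
    "parent_collection_name": "Field Campaigns",
    "parent_collection_id": "5d2a1b3c4e5f6a7b8c9d0e10",
}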
286 changes: 285 additions & 1 deletion scripts/migration/migrate.py
@@ -117,6 +117,102 @@ def add_v1_space_members_to_v2_group(space, group_id, headers):
)


def get_clowder_v1_user_collections(headers, user_v1):
endpoint = f"{CLOWDER_V1}/api/collections"
response = requests.get(endpoint, headers=headers)
return [col for col in response.json() if col["authorId"] == user_v1["id"]]


def get_clowder_v1_dataset_collections(headers, user_v1, dataset_id):
matching_collections = []
endpoint = f"{CLOWDER_V1}/api/collections/allCollections"
response = requests.get(endpoint, headers=headers)
user_collections = response.json()
for collection in user_collections:
collection_id = collection["id"]
collection_dataset_endpoint = (
f"{CLOWDER_V1}/api/collections/{collection_id}/datasets"
)
try:
dataset_response = requests.get(
collection_dataset_endpoint, headers=headers
)
datasets = dataset_response.json()
for ds in datasets:
if ds["id"] == dataset_id:
matching_collections.append(collection)
except Exception as e:
print("Exception", e)
return matching_collections


def get_clowder_v1_collection(collection_id, headers):
endpoint = f"{CLOWDER_V1}/api/collections/{collection_id}"
response = requests.get(endpoint, headers=headers)
return response.json()


def get_clowder_v1_collections(collection_ids, headers):
collections = []
for collection_id in collection_ids:
endpoint = f"{CLOWDER_V1}/api/collections/{collection_id}"
response = requests.get(endpoint, headers=headers)
collections.append(response.json())
return collections


def get_clowder_v1_collection_self_and_ancestors(
collection_id, self_and_ancestors, headers
):
endpoint = f"{CLOWDER_V1}/api/collections/{collection_id}"
response = requests.get(endpoint, headers=headers)
self = response.json()
if self["id"] not in self_and_ancestors:
self_and_ancestors.append(self["id"])
parents_entry = self["parent_collection_ids"]
parents_entry = parents_entry.lstrip("List(")
parents_entry = parents_entry.rstrip(")")
if parents_entry != "":
parents = parents_entry.split(",")
for parent in parents:
# replace empty space
parent = parent.lstrip(" ")
parent = parent.rstrip(" ")
if parent not in self_and_ancestors:
self_and_ancestors.append(parent)
for parent in parents:
parent = parent.lstrip(" ")
parent = parent.rstrip(" ")
if parent != "" and parent is not None:
current_self_and_ancestors = (
get_clowder_v1_collection_self_and_ancestors(
parent, self_and_ancestors, headers=headers
)
)
for col_id in current_self_and_ancestors:
if col_id not in self_and_ancestors:
self_and_ancestors.append(col_id)
return self_and_ancestors


def get_clowder_v1_parent_collection_ids(current_collection_id, headers):
parents = []
all_collections_v1_endpoint = (
f"{CLOWDER_V1}/api/collections/allCollections?limit=0&showAll=true"
)
response = requests.get(all_collections_v1_endpoint, headers=headers)
all_collections = response.json()
for collection in all_collections:
children_entry = collection["child_collection_ids"]
children_entry = children_entry.lstrip("List(")
children_entry = children_entry.rstrip(")")
child_ids = children_entry.split(",")
for child in child_ids:
if child == current_collection_id:
parents.append(collection["id"])
return parents
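
The v1 API returns parent_collection_ids and child_collection_ids as strings of the form List(id1, id2, ...), which is why the functions above strip the List( wrapper and split on commas. A minimal sketch of that parsing pulled into one helper (hypothetical, not part of this commit):

import re

def parse_v1_id_list(raw):
    """Turn a v1 string like 'List(id1, id2)' into a list of ids,
    mirroring the inline lstrip/rstrip/split logic used above."""
    inner = re.sub(r"^List\(|\)$", "", raw or "")
    return [part.strip() for part in inner.split(",") if part.strip()]

parse_v1_id_list("List()") returns an empty list, which matches the empty-string checks the migration code performs before recursing.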


def create_local_user(user_v1):
"""Create a local user in Clowder v2 if they don't already exist, and generate an API key."""
# Search for the user by email
@@ -169,10 +265,74 @@ def create_admin_user():
return generate_user_api_key(admin_user, admin_user["password"])


def add_dataset_license(v1_license, headers):
"""Create appropriate license (standard/custom) based on v1 license details"""
license_id = "CC-BY"
# standard licenses
if v1_license["license_type"] == "license2":
if (
not v1_license["ccAllowCommercial"]
and not v1_license["ccAllowDerivative"]
and not v1_license["ccRequireShareAlike"]
):
license_id = "CC BY-NC-ND"
elif (
v1_license["ccAllowCommercial"]
and not v1_license["ccAllowDerivative"]
and not v1_license["ccRequireShareAlike"]
):
license_id = "CC BY-ND"
elif (
not v1_license["ccAllowCommercial"]
and v1_license["ccAllowDerivative"]
and not v1_license["ccRequireShareAlike"]
):
license_id = "CC BY-NC"
elif (
not v1_license["ccAllowCommercial"]
and v1_license["ccAllowDerivative"]
and v1_license["ccRequireShareAlike"]
):
license_id = "CC BY-NC-SA"
elif (
v1_license["ccAllowCommercial"]
and v1_license["ccAllowDerivative"]
and v1_license["ccRequireShareAlike"]
):
license_id = "CC BY-SA"
elif (
v1_license["ccAllowCommercial"]
and v1_license["ccAllowDerivative"]
and not v1_license["ccRequireShareAlike"]
):
license_id = "CC BY"
elif v1_license["license_type"] == "license3":
license_id = "CCO Public Domain Dedication"
else:
# custom license
license_body = {
"name": v1_license["license_text"],
"url": v1_license["license_url"],
"holders": v1_license["holders"],
}
if license_body["url"] == "":
license_body["url"] = "https://dbpedia.org/page/All_rights_reserved"
license_v2_endpoint = f"{CLOWDER_V2}/api/v2/licenses?"
response = requests.post(
license_v2_endpoint, headers=headers, json=license_body
)
print(response.json())
license_id = response.json()["id"]
return license_id
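
As a quick illustration of the mapping above, consider a hypothetical v1 license payload; the field names follow the v1 JSON consumed by add_dataset_license, and the values are made up:

# license2 with derivatives allowed, share-alike required, no commercial use.
example_v1_license = {
    "license_type": "license2",
    "ccAllowCommercial": False,
    "ccAllowDerivative": True,
    "ccRequireShareAlike": True,
    "license_text": "",
    "license_url": "",
    "holders": "",
}
# add_dataset_license(example_v1_license, headers) returns "CC BY-NC-SA";
# for standard licenses no POST to the v2 licenses endpoint is made.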


def create_v2_dataset(dataset, headers):
"""Create a dataset in Clowder v2."""
# TODO: GET correct license
dataset_in_v2_endpoint = f"{CLOWDER_V2}/api/v2/datasets?license_id=CC BY"
print("Creating dataset license in Clowder v2.")
v2_license_id = add_dataset_license(dataset["license"], headers)

dataset_in_v2_endpoint = f"{CLOWDER_V2}/api/v2/datasets?license_id={v2_license_id}"
dataset_example = {
"name": dataset["name"],
"description": dataset["description"],
@@ -439,6 +599,101 @@ def register_migration_extractor():
)


def add_children(collection_hierarchy_json, remaining_collections):
new_json = []
new_remaining_collections = []
for collection in remaining_collections:
collection_parents = collection["parent_collection_ids"]
current_collection_parents = []
for entry in collection_hierarchy_json:
if entry["id"] in collection_parents:
current_collection_parents.append(entry)
print("We got the parents now")
if len(current_collection_parents) > 0:
current_collection_entry = {
"id": collection["id"],
"name": collection["name"],
"parents": current_collection_parents,
}
new_json.append(current_collection_entry)
else:
new_remaining_collections.append(collection)
return new_json, new_remaining_collections


def build_collection_hierarchy(collection_id, headers):
self_and_ancestors = get_clowder_v1_collection_self_and_ancestors(
collection_id=collection_id, self_and_ancestors=[], headers=headers
)
self_and_ancestors_collections = get_clowder_v1_collections(
self_and_ancestors, headers=clowder_headers_v1
)
children = []
remaining_collections = []
for col in self_and_ancestors_collections:
parent_collection_ids = col["parent_collection_ids"]
parent_collection_ids = parent_collection_ids.lstrip("List(")
parent_collection_ids = parent_collection_ids.rstrip(")")
parent_collection_ids = parent_collection_ids.lstrip(" ")
parent_collection_ids = parent_collection_ids.rstrip(" ")
if parent_collection_ids == "":
root_col_entry = {"name": col["name"], "id": col["id"], "parents": []}
children.append(root_col_entry)
else:
remaining_collections.append(col)

while len(remaining_collections) > 0:
children, remaining_collections = add_children(children, remaining_collections)
print("Now we are done")
return children
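
The while loop above keeps only the entries resolved in the final pass, so the returned list typically holds the starting collection itself, with each ancestor nested under "parents" down to root entries whose "parents" list is empty. A sketch of the shape with made-up ids and names:

example_hierarchy = [
    {
        "id": "5f0000000000000000000002",
        "name": "2019 Flights",
        "parents": [
            {"id": "5f0000000000000000000001", "name": "UAV Imagery", "parents": []},
        ],
    },
]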


def build_collection_metadata_for_v1_dataset(dataset_id, user_v1, headers):
dataset_collections = get_clowder_v1_dataset_collections(
headers=headers, user_v1=user_v1, dataset_id=dataset_id
)
return dataset_collections


def build_collection_space_metadata_for_v1_dataset(dataset, user_v1, headers):
dataset_id = dataset["id"]
dataset_collections = get_clowder_v1_dataset_collections(
headers=headers, user_v1=user_v1, dataset_id=dataset_id
)
dataset_spaces = dataset["spaces"]
space_entries = []
for space_id in dataset_spaces:
space_endpoint = f"{CLOWDER_V1}/api/spaces/{space_id}"
response = requests.get(space_endpoint, headers=headers)
space = response.json()
try:
space_entry = {
"id": space["id"],
"name": space["name"],
"creator": space["creator"],
}
space_entries.append(space_entry)
except Exception as e:
print(f"Error in getting space entry.")
print(e)
try:
space_entry = {"id": space["id"], "name": space["name"]}
space_entries.append(space_entry)
except Exception as e:
print(f"Error in getting space entry")
print(e)
collection_data = []
for collection in dataset_collections:
collection_children = build_collection_hierarchy(
collection_id=collection["id"], headers=headers
)
for child in collection_children:
collection_data.append(child)
metadata = {"spaces": space_entries, "collections": collection_data}
print(f"Got space and collection metadata from dataset {dataset_id}")
return metadata
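
Put together, the dict returned here (later posted to the v2 dataset metadata endpoint under both "content" and "contents") has two top-level keys. A sketch with hypothetical values; collection entries use the nested shape shown earlier:

example_metadata = {
    "spaces": [
        {
            "id": "5e0000000000000000000009",
            "name": "Remote Sensing Lab",
            "creator": "5a00000000000000000000aa",
        },
    ],
    "collections": [
        {"id": "5f0000000000000000000002", "name": "2019 Flights", "parents": []},
    ],
}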


def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
"""Process user resources from Clowder v1 to Clowder v2."""
user_v1_datasets = get_clowder_v1_user_datasets(user_id=user_v1["id"])
@@ -476,6 +731,35 @@ def process_user_and_resources(user_v1, USER_MAP, DATASET_MAP):
)
if file_v2_id is not None:
add_file_metadata(file, file_v2_id, clowder_headers_v1, user_headers_v2)
# posting the collection hierarchy as metadata
collection_space_metadata_dict = build_collection_space_metadata_for_v1_dataset(
dataset=dataset, user_v1=user_v1, headers=clowder_headers_v1
)
migration_extractor_collection_metadata = {
"listener": {
"name": "migration",
"version": "1",
"description": "Migration of metadata from Clowder v1 to Clowder v2",
},
"context_url": "https://clowder.ncsa.illinois.edu/contexts/metadata.jsonld",
"content": collection_space_metadata_dict,
"contents": collection_space_metadata_dict,
}
v2_metadata_endpoint = f"{CLOWDER_V2}/api/v2/datasets/{dataset_v2_id}/metadata"
response = requests.post(
v2_metadata_endpoint,
json=migration_extractor_collection_metadata,
headers=clowder_headers_v2,
)
if response.status_code == 200:
print("Successfully added collection info as metadata in v2.")
else:
print(
f"Failed to add collection info as metadata in Clowder v2. Status code: {response.status_code}"
)

if file_v2_id is not None:
add_file_metadata(file, file_v2_id, clowder_headers_v1, user_headers_v2)

return [USER_MAP, DATASET_MAP]

