Skip to content

Commit

Permalink
Merge pull request #177 from catalyst-cooperative/zenodo-archiver
Browse files Browse the repository at this point in the history
Add zenodo archiving script
  • Loading branch information
e-belfer authored Sep 20, 2024
2 parents ff0f41f + 68ab0d9 commit 653ae7a
Show file tree
Hide file tree
Showing 3 changed files with 126 additions and 1 deletion.
12 changes: 12 additions & 0 deletions .github/workflows/save_daily_metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,19 +50,31 @@ jobs:

- shell: bash -l {0}
name: Save Github Metrics
id: github
env:
API_TOKEN_GITHUB: ${{ secrets.USAGE_STATS_ACCESS_TOKEN }}
run: |
python src/usage_metrics/scripts/save_github_metrics.py
- shell: bash -l {0}
name: Save Kaggle Metrics
id: kaggle
# Run even if prior metric saving fails
if: "!cancelled()"
env:
KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
run: |
python src/usage_metrics/scripts/save_kaggle_metrics.py
- shell: bash -l {0}
name: Save Zenodo Metrics
id: zenodo
# Run even if prior metric saving fails
if: "!cancelled()"
run: |
python src/usage_metrics/scripts/save_zenodo_metrics.py
- name: Inform the Codemonkeys
uses: 8398a7/action-slack@v3
with:
Expand Down
2 changes: 1 addition & 1 deletion src/usage_metrics/scripts/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
"""Module contains assets that extract raw data."""

from . import save_github_metrics, save_kaggle_metrics
from . import save_github_metrics, save_kaggle_metrics, save_zenodo_metrics
113 changes: 113 additions & 0 deletions src/usage_metrics/scripts/save_zenodo_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
"""This script pull github traffic metrics and saves them to a GC Bucket."""

import logging
import sys
from datetime import date, datetime
from typing import Annotated

import pandas as pd
import requests
from google.cloud import storage
from pydantic import BaseModel, StringConstraints, field_validator

# DOI string types: production Zenodo uses the 10.5281 prefix, the Zenodo
# sandbox uses 10.5072. Used to validate `doi`/`conceptdoi` fields below.
Doi = Annotated[str, StringConstraints(pattern=r"10\.5281/zenodo\.\d+")]
SandboxDoi = Annotated[str, StringConstraints(pattern=r"10\.5072/zenodo\.\d+")]

# Root logger so log output from this script and libraries is visible when run in CI.
logger = logging.getLogger()
logging.basicConfig(level="INFO")


class ZenodoStats(BaseModel):
    """Pydantic model representing Zenodo usage stats.

    See https://developers.zenodo.org/#representation.
    """

    downloads: int  # downloads across all versions of the record
    unique_downloads: int  # downloads deduplicated by visitor
    views: int  # landing-page views across all versions
    unique_views: int  # views deduplicated by visitor
    version_downloads: int  # downloads of this specific version
    version_unique_downloads: int
    version_unique_views: int
    version_views: int


class CommunityMetadata(BaseModel):
    """Pydantic model representing Zenodo deposition metadata from the communities endpoint.

    See https://developers.zenodo.org/#representation.
    """

    created: datetime | None = None
    modified: datetime | None = None
    recid: str  # Zenodo record ID, used to build the versions-endpoint URL
    conceptrecid: str
    doi: Doi | SandboxDoi | None = None
    conceptdoi: Doi | SandboxDoi | None = None
    doi_url: str
    title: str
    updated: datetime | None = None
    stats: ZenodoStats

    @field_validator("doi", "conceptdoi", mode="before")
    @classmethod
    def check_empty_string(cls, doi: str) -> str | None:
        """Sometimes zenodo returns an empty string for the `doi`. Convert to None.

        The ``field_validator`` registration is required for pydantic to call
        this at validation time (previously it was a plain classmethod that was
        never invoked). Runs in ``before`` mode so the empty string is replaced
        with None before the DOI pattern constraint is checked.
        """
        if doi == "":
            return None
        # Valid (or non-string) values must be returned unchanged, otherwise
        # every DOI would be coerced to None.
        return doi


def get_zenodo_logs() -> pd.DataFrame:
    """Get all metrics for all versions of records in the Catalyst Cooperative Zenodo community.

    Returns:
        A dataframe with one row per archived record version, containing the
        version's usage stats plus its ``doi`` and ``title``. Empty if the
        community has no versions with stats.
    """
    community_url = "https://zenodo.org/api/communities/14454015-63f1-4f05-80fd-1a9b07593c9e/records"
    # First, get metadata on all the datasets in the Catalyst Cooperative community
    community_records = requests.get(community_url, timeout=100)
    community_records.raise_for_status()  # fail loudly instead of KeyError-ing on an error payload
    # NOTE(review): only the first page of "hits" is processed — assumes the
    # community record list fits in one page; confirm against Zenodo pagination.
    dataset_records = [
        CommunityMetadata(**record)
        for record in community_records.json()["hits"]["hits"]
    ]

    stats_dfs = []
    for record in dataset_records:
        logger.info("Getting usage metrics for %s", record.title)
        # For each dataset in the community, get all archived versions and their
        # corresponding metrics.
        versions_url = f"https://zenodo.org/api/records/{record.recid}/versions"
        record_versions = requests.get(versions_url, timeout=100)
        record_versions.raise_for_status()
        version_records = [
            CommunityMetadata(**version_record)
            for version_record in record_versions.json()["hits"]["hits"]
        ]
        version_df = pd.DataFrame(
            [
                dict(
                    # model_dump() replaces dict(), deprecated in pydantic v2
                    version_record.stats.model_dump(),
                    doi=version_record.doi,
                    title=version_record.title,
                )
                for version_record in version_records
            ]
        )
        if not version_df.empty:
            stats_dfs.append(version_df)

    # pd.concat raises ValueError on an empty list; return an empty frame instead.
    if not stats_dfs:
        return pd.DataFrame()
    return pd.concat(stats_dfs)


def upload_to_bucket(data: pd.DataFrame) -> None:
    """Write the metrics dataframe to the usage-metrics GCS archive as a dated CSV."""
    archive_bucket = storage.Client().get_bucket(
        "pudl-usage-metrics-archives.catalyst.coop"
    )
    # One CSV per run, named for the date it was archived.
    blob_name = f"zenodo/{date.today().strftime('%Y-%m-%d')}.csv"
    archive_bucket.blob(blob_name).upload_from_string(
        data.to_csv(index=False), "text/csv"
    )
    logger.info("Uploaded data to GCS bucket.")


def save_metrics():
    """Save Zenodo traffic metrics to google cloud bucket."""
    # Fetch everything first, then archive the combined dataframe.
    upload_to_bucket(get_zenodo_logs())


if __name__ == "__main__":
sys.exit(save_metrics())

0 comments on commit 653ae7a

Please sign in to comment.