Reorganize and prune organizational level variables
e-belfer committed Sep 16, 2024
1 parent 4171766 commit 125c047
Showing 2 changed files with 36 additions and 36 deletions.
58 changes: 30 additions & 28 deletions src/usage_metrics/scripts/save_github_metrics.py
@@ -4,6 +4,7 @@
import logging
import os
import sys
+ import time
from dataclasses import dataclass
from datetime import date

@@ -23,21 +24,7 @@ class Metric:
folder: str


- TOKEN = os.getenv("API_TOKEN_GITHUB", "...")
- OWNER = "catalyst-cooperative"
- REPO = "pudl"
- BUCKET_NAME = "pudl-usage-metrics-archives.catalyst.coop"

- BIWEEKLY_METRICS = [
- Metric("clones", "clones"),
- Metric("popular/paths", "popular_paths"),
- Metric("popular/referrers", "popular_referrers"),
- Metric("views", "views"),
- ]
- PERSISTENT_METRICS = [Metric("stargazers", "stargazers"), Metric("forks", "forks")]


- def get_biweekly_metrics(metric: str) -> str:
+ def get_biweekly_metrics(owner: str, repo: str, token: str, metric: str) -> str:
"""Get json data for a biweekly github metric.
Args:
@@ -46,17 +33,17 @@
Returns:
json (str): The metric data as json text.
"""
query_url = f"https://api.github.com/repos/{OWNER}/{REPO}/traffic/{metric}"
query_url = f"https://api.github.com/repos/{owner}/{repo}/traffic/{metric}"
headers = {
"Authorization": f"token {TOKEN}",
"Authorization": f"token {token}",
"Accept": "application/vnd.github.v3+json",
}

response = make_github_request(query_url, headers)
return json.dumps(response.json())


- def get_persistent_metrics(metric) -> str:
+ def get_persistent_metrics(owner: str, repo: str, token: str, metric: str) -> str:
"""Get githubs persistent metrics: forks and stargazers.
Args:
@@ -65,15 +52,19 @@
Returns:
json (str): A json string of metrics.
"""
query_url = f"https://api.github.com/repos/{OWNER}/{REPO}/{metric}"
query_url = f"https://api.github.com/repos/{owner}/{repo}/{metric}"
headers = {
"Authorization": f"token {TOKEN}",
"Authorization": f"token {token}",
"Accept": "application/vnd.github.v3.star+json",
}

metrics = []
page = 1
- while True:

+ timeout = 600 # Set 10 minute timeout
+ timeout_start = time.time()

+ while time.time() < timeout_start + timeout:
params = {"page": page}
metrics_json = make_github_request(query_url, headers, params).json()

@@ -103,15 +94,14 @@ def make_github_request(query: str, headers: str, params: str = None):
raise HTTPError(
f"HTTP error occurred: {http_err}\n\tResponse test: {response.text}"
)
- except Exception as err:
- raise Exception(f"Other error occurred: {err}")
return response


def upload_to_bucket(data, metric):
"""Upload a gcp object."""
+ bucket_name = "pudl-usage-metrics-archives.catalyst.coop"
storage_client = storage.Client()
- bucket = storage_client.bucket(BUCKET_NAME)
+ bucket = storage_client.bucket(bucket_name)
blob_name = f"github/{metric.folder}/{date.today().strftime('%Y-%m-%d')}.json"

blob = bucket.blob(blob_name)
@@ -122,12 +112,24 @@ def upload_to_bucket(data, metric):

def save_metrics():
"""Save github traffic metrics to google cloud bucket."""
- for metric in BIWEEKLY_METRICS:
- metric_data = get_biweekly_metrics(metric.name)
+ token = os.getenv("API_TOKEN_GITHUB", "...")
+ owner = "catalyst-cooperative"
+ repo = "pudl"

+ biweekly_metrics = [
+ Metric("clones", "clones"),
+ Metric("popular/paths", "popular_paths"),
+ Metric("popular/referrers", "popular_referrers"),
+ Metric("views", "views"),
+ ]
+ persistent_metrics = [Metric("stargazers", "stargazers"), Metric("forks", "forks")]

+ for metric in biweekly_metrics:
+ metric_data = get_biweekly_metrics(owner, repo, token, metric.name)
upload_to_bucket(metric_data, metric)

- for metric in PERSISTENT_METRICS:
- metric_data = get_persistent_metrics(metric.name)
+ for metric in persistent_metrics:
+ metric_data = get_persistent_metrics(owner, repo, token, metric.name)
upload_to_bucket(metric_data, metric)


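The net effect in save_github_metrics.py is that the module-level TOKEN, OWNER, REPO, and BUCKET_NAME constants and the metric lists move into save_metrics(), the fetch functions take owner, repo, and token as explicit parameters, and the unbounded while True: pagination loop in get_persistent_metrics() is now capped at ten minutes. The loop body is collapsed in the view above, so the sketch below stands in for it with a plain requests call and an empty-page break; that request helper, the break condition, and the __main__ wiring are illustrative assumptions, not the repository's make_github_request().

import json
import os
import time

import requests


def get_persistent_metrics(owner: str, repo: str, token: str, metric: str) -> str:
    """Page through a GitHub list endpoint, giving up after a fixed timeout."""
    query_url = f"https://api.github.com/repos/{owner}/{repo}/{metric}"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3.star+json",
    }

    metrics = []
    page = 1

    timeout = 600  # Stop paging after 10 minutes instead of looping forever.
    timeout_start = time.time()

    while time.time() < timeout_start + timeout:
        # Illustrative: the real module routes this through make_github_request().
        response = requests.get(
            query_url, headers=headers, params={"page": page}, timeout=30
        )
        response.raise_for_status()
        metrics_json = response.json()
        if not metrics_json:  # An empty page means every record has been fetched.
            break
        metrics += metrics_json
        page += 1

    return json.dumps(metrics)


if __name__ == "__main__":
    # The same values save_metrics() now builds locally instead of at module scope.
    token = os.getenv("API_TOKEN_GITHUB", "...")
    print(get_persistent_metrics("catalyst-cooperative", "pudl", token, "stargazers"))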
14 changes: 6 additions & 8 deletions src/usage_metrics/scripts/save_kaggle_metrics.py
@@ -8,29 +8,27 @@
from google.cloud import storage
from kaggle.api.kaggle_api_extended import KaggleApi

- KAGGLE_OWNER = "catalystcooperative"
- KAGGLE_DATASET = "pudl-project"
- OWNER = "catalyst-cooperative"
- REPO = "pudl"
- BUCKET_NAME = "pudl-usage-metrics-archives.catalyst.coop"

logger = logging.getLogger()
logging.basicConfig(level="INFO")


def get_kaggle_logs() -> str:
"""Get PUDL project usage metadata from Kaggle site."""
api = KaggleApi()
+ kaggle_owner = "catalystcooperative"
+ kaggle_dataset = "pudl-project"

- metadata = api.metadata_get(KAGGLE_OWNER, KAGGLE_DATASET)
+ metadata = api.metadata_get(kaggle_owner, kaggle_dataset)
metadata.update({"metrics_date": date.today().strftime("%Y-%m-%d")})
return json.dumps(metadata)


def upload_to_bucket(data):
"""Upload a gcp object."""
+ bucket_name = "pudl-usage-metrics-archives.catalyst.coop"

storage_client = storage.Client()
- bucket = storage_client.bucket(BUCKET_NAME)
+ bucket = storage_client.bucket(bucket_name)
blob_name = f"kaggle/{date.today().strftime('%Y-%m-%d')}.json"

blob = bucket.blob(blob_name)
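save_kaggle_metrics.py gets the same pruning: the unused OWNER and REPO constants disappear, and the Kaggle owner, dataset, and bucket names become locals in the functions that use them. Below is a minimal sketch of the script after this change; the api.authenticate() step, the upload_from_string() call, and the __main__ wiring are assumptions filled in for the portions collapsed above.

import json
from datetime import date

from google.cloud import storage
from kaggle.api.kaggle_api_extended import KaggleApi


def get_kaggle_logs() -> str:
    """Get PUDL project usage metadata from the Kaggle site."""
    api = KaggleApi()
    api.authenticate()  # Assumption: credentials come from kaggle.json or env vars.
    kaggle_owner = "catalystcooperative"
    kaggle_dataset = "pudl-project"

    metadata = api.metadata_get(kaggle_owner, kaggle_dataset)
    metadata.update({"metrics_date": date.today().strftime("%Y-%m-%d")})
    return json.dumps(metadata)


def upload_to_bucket(data: str) -> None:
    """Upload a GCP object; the bucket name is now local to this function."""
    bucket_name = "pudl-usage-metrics-archives.catalyst.coop"

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob_name = f"kaggle/{date.today().strftime('%Y-%m-%d')}.json"
    blob = bucket.blob(blob_name)
    blob.upload_from_string(data)  # Assumption: the collapsed code uploads the JSON string.


if __name__ == "__main__":
    upload_to_bucket(get_kaggle_logs())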
