Archive raw Kaggle and Github metrics daily #162

@@ -0,0 +1,86 @@
name: save-daily-metrics

on:
  workflow_dispatch:
  schedule:
    # Run every day at 8:00 PM UTC
    # https://crontab.guru/#0_20_*_*_*
    - cron: "0 20 * * *"

jobs:
  build:
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2

      - name: Set up conda environment for testing
        uses: mamba-org/setup-micromamba@v1
        with:
          environment-file: environment.yml
          cache-environment: true
          condarc: |
            channels:
              - conda-forge
              - defaults
            channel_priority: strict

      - name: Log conda environment information
        run: |
          conda info
          conda list
          conda config --show-sources
          conda config --show
          printenv | sort

      - name: Authenticate gcloud
        id: gcloud-auth
        continue-on-error: true
        uses: "google-github-actions/auth@v2"
        with:
          workload_identity_provider: "projects/345950277072/locations/global/workloadIdentityPools/gh-actions-pool/providers/gh-actions-provider"
          service_account: "pudl-usage-metrics-etl@catalyst-cooperative-pudl.iam.gserviceaccount.com"
          create_credentials_file: true

      - name: "Set up Cloud SDK"
        uses: "google-github-actions/setup-gcloud@v2"
        with:
          version: ">= 363.0.0"

      - shell: bash -l {0}
        name: Save Github Metrics
        env:
          API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }}
        run: |
          python src/usage_metrics/scripts/save_github_metrics.py

      - shell: bash -l {0}
        name: Save Kaggle Metrics
        env:
          KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }}
          KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }}
        run: |
          python src/usage_metrics/scripts/save_kaggle_metrics.py

      - name: Inform the Codemonkeys
        uses: 8398a7/action-slack@v3
        with:
          status: custom
          fields: workflow,job,commit,repo,ref,author,took
          custom_payload: |
            {
              username: 'action-slack',
              icon_emoji: ':octocat:',
              attachments: [{
                color: '${{ job.status }}' === 'success' ? 'good' : '${{ job.status }}' === 'failure' ? 'danger' : 'warning',
                text: `${process.env.AS_WORKFLOW}\n${process.env.AS_JOB} (${process.env.AS_COMMIT}) of ${process.env.AS_REPO}@${process.env.AS_REF} by ${process.env.AS_AUTHOR} ${{ job.status }} in ${process.env.AS_TOOK}`,
              }]
            }
        env:
          GITHUB_TOKEN: ${{ github.token }} # required
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} # required
          MATRIX_CONTEXT: ${{ toJson(matrix) }} # required
        if: always() # Pick up events even if the job fails or is canceled.

@@ -0,0 +1,3 @@
"""Module contains assets that extract raw data.""" | ||
|
||
from . import save_github_metrics, save_kaggle_metrics |

@@ -0,0 +1,135 @@
"""This script pull github traffic metrics and saves them to a GC Bucket.""" | ||
|
||
import json | ||
import logging | ||
import os | ||
import sys | ||
from dataclasses import dataclass | ||
from datetime import date | ||
|
||
import requests | ||
from google.cloud import storage | ||
from requests.exceptions import HTTPError | ||
|
||
logger = logging.getLogger() | ||
logging.basicConfig(level="INFO") | ||
|
||
|
||
@dataclass | ||
class Metric: | ||
"""Format metrics into folder names.""" | ||
|
||
name: str | ||
folder: str | ||
|
||
|
||
TOKEN = os.getenv("API_TOKEN_GITHUB", "...") | ||
OWNER = "catalyst-cooperative" | ||
REPO = "pudl" | ||
BUCKET_NAME = "pudl-usage-metrics-archives.catalyst.coop" | ||
|
||
BIWEEKLY_METRICS = [ | ||
Metric("clones", "clones"), | ||
Metric("popular/paths", "popular_paths"), | ||
Metric("popular/referrers", "popular_referrers"), | ||
Metric("views", "views"), | ||
] | ||
PERSISTENT_METRICS = [Metric("stargazers", "stargazers"), Metric("forks", "forks")] | ||

Review comment: I think these probably don't need to be module-level variables and can instead be set within each function - the GH-auth related ones can go into […]
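
One possible reading of that suggestion, sketched purely for illustration (this is not code from the PR): read the token and build the auth headers inside the request helper itself, so nothing auth-related lives at module scope.

import os

import requests


def make_github_request(query: str, params: dict | None = None) -> requests.Response:
    """Make an authenticated GitHub API request, building the auth header locally."""
    # The token is read here at call time rather than at module import time.
    token = os.getenv("API_TOKEN_GITHUB")
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json",
    }
    response = requests.get(query, headers=headers, params=params, timeout=100)
    response.raise_for_status()
    return response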

def get_biweekly_metrics(metric: str) -> str:
    """Get json data for a biweekly GitHub metric.

    Args:
        metric (str): The GitHub metric name.

    Returns:
        json (str): The metric data as json text.
    """
    query_url = f"https://api.github.com/repos/{OWNER}/{REPO}/traffic/{metric}"
    headers = {
        "Authorization": f"token {TOKEN}",
        "Accept": "application/vnd.github.v3+json",
    }

    response = make_github_request(query_url, headers)
    return json.dumps(response.json())


def get_persistent_metrics(metric: str) -> str:
    """Get GitHub's persistent metrics: forks and stargazers.

    Args:
        metric (str): the metric to retrieve (forks | stargazers)

    Returns:
        json (str): A json string of metrics.
    """
    query_url = f"https://api.github.com/repos/{OWNER}/{REPO}/{metric}"
    headers = {
        "Authorization": f"token {TOKEN}",
        "Accept": "application/vnd.github.v3.star+json",
    }

    metrics = []
    page = 1
    while True:
        params = {"page": page}
        metrics_json = make_github_request(query_url, headers, params).json()

        if len(metrics_json) <= 0:
            break
        metrics += metrics_json
        page += 1
    return json.dumps(metrics)

Review comment (non-blocking): if somehow GitHub gives us an infinite stream of forks and stargazers (e.g. their pagination stops working for some reason??), we wouldn't want to keep requesting forever. We could work around this by: […] In any case, the worst-case scenario is the GHA runner eventually timing out after 6 hours of free compute. Hence, non-blocking.
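
A minimal sketch of one possible guard along those lines, assuming a hard page cap is acceptable; MAX_PAGES is a made-up number, and the function reuses OWNER, REPO, TOKEN, make_github_request, and logger from this module.

MAX_PAGES = 1000  # Hypothetical safety cap, far above any realistic fork/stargazer page count.


def get_persistent_metrics(metric: str) -> str:
    """Page through a persistent metric, but stop after MAX_PAGES requests."""
    query_url = f"https://api.github.com/repos/{OWNER}/{REPO}/{metric}"
    headers = {
        "Authorization": f"token {TOKEN}",
        "Accept": "application/vnd.github.v3.star+json",
    }
    metrics = []
    for page in range(1, MAX_PAGES + 1):
        metrics_json = make_github_request(query_url, headers, {"page": page}).json()
        if not metrics_json:
            break
        metrics += metrics_json
    else:
        # The loop exhausted the cap without seeing an empty page: results may be truncated.
        logger.warning(f"Hit the {MAX_PAGES}-page cap for {metric}.")
    return json.dumps(metrics)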

def make_github_request(query: str, headers: dict, params: dict = None):
    """Make a request to the GitHub API.

    Args:
        query (str): A GitHub API request url.
        headers (dict): Headers to include in the request.
        params (dict): Params of the request.

    Returns:
        response (requests.models.Response): the request response.
    """
    try:
        response = requests.get(query, headers=headers, params=params, timeout=100)

        response.raise_for_status()
    except HTTPError as http_err:
        raise HTTPError(
            f"HTTP error occurred: {http_err}\n\tResponse text: {response.text}"
        )
    except Exception as err:
        raise Exception(f"Other error occurred: {err}")
    return response

Review comment: nit: I would let the non-HTTPError exceptions propagate on their own instead of re-raising them as a bare Exception. […]
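
A sketch of what that nit might look like in practice (my reading of the truncated comment, not code from this PR): keep the extra context for HTTP errors and let anything else raise on its own.

import requests
from requests.exceptions import HTTPError


def make_github_request(query: str, headers: dict, params: dict | None = None) -> requests.Response:
    """Request a GitHub API URL; only HTTPError is re-raised with extra context."""
    # Any non-HTTP failure (connection errors, timeouts, etc.) simply propagates.
    response = requests.get(query, headers=headers, params=params, timeout=100)
    try:
        response.raise_for_status()
    except HTTPError as http_err:
        # Include the response body to make debugging API errors easier.
        raise HTTPError(
            f"HTTP error occurred: {http_err}\n\tResponse text: {response.text}"
        ) from http_err
    return response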

def upload_to_bucket(data, metric):
    """Upload a GCP object."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(BUCKET_NAME)
    blob_name = f"github/{metric.folder}/{date.today().strftime('%Y-%m-%d')}.json"

    blob = bucket.blob(blob_name)
    blob.upload_from_string(data)

    logger.info(f"Uploaded {metric.name} data to {blob_name}.")


def save_metrics():
    """Save GitHub traffic metrics to a Google Cloud bucket."""
    for metric in BIWEEKLY_METRICS:
        metric_data = get_biweekly_metrics(metric.name)
        upload_to_bucket(metric_data, metric)

    for metric in PERSISTENT_METRICS:
        metric_data = get_persistent_metrics(metric.name)
        upload_to_bucket(metric_data, metric)


if __name__ == "__main__":
    sys.exit(save_metrics())
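
For downstream use, reading one of these archives back out of the bucket is straightforward; a hypothetical helper (not part of this PR), assuming the "clones" folder and today's date as an example:

import json
from datetime import date

from google.cloud import storage


def read_archived_metric(folder: str, day: date) -> dict | list:
    """Download one archived GitHub metrics blob and parse it back into Python objects."""
    bucket = storage.Client().bucket("pudl-usage-metrics-archives.catalyst.coop")
    blob = bucket.blob(f"github/{folder}/{day.strftime('%Y-%m-%d')}.json")
    return json.loads(blob.download_as_text())


# e.g. clones = read_archived_metric("clones", date.today())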

@@ -0,0 +1,49 @@
"""This script pull Kaggle traffic metrics and saves them to a GC Bucket.""" | ||
|
||
import json | ||
import logging | ||
import sys | ||
from datetime import date | ||
|
||
from google.cloud import storage | ||
from kaggle.api.kaggle_api_extended import KaggleApi | ||
|
||
KAGGLE_OWNER = "catalystcooperative" | ||
KAGGLE_DATASET = "pudl-project" | ||

Review comment: I think these (and […]) probably don't need to be module-level variables either.
OWNER = "catalyst-cooperative" | ||
REPO = "pudl" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think these are used anywhere else, should we delete them? |
||

BUCKET_NAME = "pudl-usage-metrics-archives.catalyst.coop"

logger = logging.getLogger()
logging.basicConfig(level="INFO")


def get_kaggle_logs() -> str:
    """Get PUDL project usage metadata from the Kaggle site."""
    api = KaggleApi()

    metadata = api.metadata_get(KAGGLE_OWNER, KAGGLE_DATASET)
    metadata.update({"metrics_date": date.today().strftime("%Y-%m-%d")})
    return json.dumps(metadata)


def upload_to_bucket(data):
    """Upload a GCP object."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(BUCKET_NAME)
    blob_name = f"kaggle/{date.today().strftime('%Y-%m-%d')}.json"

    blob = bucket.blob(blob_name)
    blob.upload_from_string(data)

    logger.info(f"Uploaded today's data to {blob_name}.")


def save_metrics():
    """Save Kaggle usage metrics to a Google Cloud bucket."""
    kaggle_metrics = get_kaggle_logs()
    upload_to_bucket(kaggle_metrics)


if __name__ == "__main__":
    sys.exit(save_metrics())

@@ -19,6 +19,7 @@ passenv =
    GOOGLE_*
    GCLOUD_*
    GCP_*
    KAGGLE_*
    HOME
    SQLALCHEMY_WARN_20
    IPINFO_TOKEN
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
non-blocking: Did you just use your own personal key/username for these? or do we have an organizational kaggle account?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have an organizational level account!