-
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #140 from catalyst-cooperative/s3-pipeline
Create a Dagster ETL pipeline for S3 usage metrics
- Loading branch information
Showing
25 changed files
with
683 additions
and
107 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c07231ee-a317-405b-9aec-56d5131ffb0d", | ||
"metadata": {}, | ||
"source": [ | ||
"# Inspecting dagster assets\n", | ||
"This notebook allows you to inspect dagster asset values. **This is just a template notebook. Do your asset explorations in a copy of this notebook.** \n", | ||
"\n", | ||
"Some assets are written to the database in which case you can just pull the tables into pandas or explore them in the database. However, many assets use the default IO Manager which writes asset values to the `$DAGSTER_HOME/storage/` directory as pickle files. Dagster provides a method for inspecting asset values no matter what IO Manager the asset uses." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "de97d7ba-22f7-433e-9f2f-0b9df8b64fc7", | ||
"metadata": { | ||
"tags": [] | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import os\n", | ||
"\n", | ||
"assert os.environ.get(\"DAGSTER_HOME\"), (\n", | ||
" \"The DAGSTER_HOME env var is not set so dagster won't be able to find the assets.\"\n", | ||
" \"Set the DAGSTER_HOME env var in this notebook or kill the jupyter server and set\"\n", | ||
" \" the DAGSTER_HOME env var in your terminal and relaunch jupyter.\"\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "c54503cc-19a2-4cd0-8724-f371eebf54e4", | ||
"metadata": {}, | ||
"source": [ | ||
"## Inspect an asset that uses Dagster's default IO manager" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "aa537769", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from dagster import AssetKey\n", | ||
"\n", | ||
"from usage_metrics.etl import defs\n", | ||
"\n", | ||
"asset_key = \"transform_s3_logs\"\n", | ||
"partition_key = \"2024-07-21\"\n", | ||
"\n", | ||
"with defs.get_asset_value_loader() as loader:\n", | ||
" df = loader.load_asset_value(asset_key, partition_key = partition_key)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.12.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,9 @@ | ||
"""Module containing dagster tools for cleaning PUDL usage metrics.""" | ||
|
||
from usage_metrics.repository import datasette_logs, intake_logs # noqa: F401 | ||
from usage_metrics.repository import datasette_logs, intake_logs | ||
|
||
from . import ( | ||
core, | ||
out, | ||
raw, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
"""Module contains assets that transform data into core assets.""" | ||
|
||
from . import s3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
"""Transform data from S3 logs.""" | ||
|
||
import pandas as pd | ||
from dagster import ( | ||
AssetExecutionContext, | ||
WeeklyPartitionsDefinition, | ||
asset, | ||
) | ||
|
||
from usage_metrics.helpers import geocode_ips | ||
|
||
|
||
@asset(
    partitions_def=WeeklyPartitionsDefinition(start_date="2023-08-16"),
    io_manager_key="database_manager",
    tags={"source": "s3"},
)
def core_s3_logs(
    context: AssetExecutionContext,
    raw_s3_logs: pd.DataFrame,
) -> pd.DataFrame:
    """Transform daily S3 logs into a clean core table.

    Adds column headers, drops duplicate rows, merges the time and
    timezone columns, removes S3 lifecycle transitions, geocodes remote
    IPs, and coerces the timestamp and numeric fields to proper dtypes.

    Args:
        context: Dagster execution context for the weekly partition.
        raw_s3_logs: Raw, header-less S3 server access log records.

    Returns:
        Cleaned S3 log records, one row per unique ``request_id``.

    Raises:
        ValueError: If ``request_id`` values are not unique after cleaning.
    """
    # S3 server access logs arrive without a header row; apply the field
    # names in the order the raw log columns appear.
    raw_s3_logs.columns = [
        "bucket_owner",
        "bucket",
        "time",
        "timezone",
        "remote_ip",
        "requester",
        "request_id",
        "operation",
        "key",
        "request_uri",
        "http_status",
        "error_code",
        "bytes_sent",
        "object_size",
        "total_time",
        "turn_around_time",
        "referer",
        "user_agent",
        "version_id",
        "host_id",
        "signature_version",
        "cipher_suite",
        "authentication_type",
        "host_header",
        "tls_version",
        "access_point_arn",
        "acl_required",
    ]

    # Drop entirely duplicate rows.
    raw_s3_logs = raw_s3_logs.drop_duplicates()

    # Combine the time and timezone columns into a single string so a
    # single pd.to_datetime call below can parse them together.
    raw_s3_logs["time"] = raw_s3_logs["time"] + " " + raw_s3_logs["timezone"]
    raw_s3_logs = raw_s3_logs.drop(columns=["timezone"])

    # Drop S3 lifecycle transitions: storage-class changes made by AWS
    # itself, not user requests.
    raw_s3_logs = raw_s3_logs.loc[raw_s3_logs.operation != "S3.TRANSITION_INT.OBJECT"]

    # Mask null IPs ("-") before geocoding so they aren't sent to the
    # geocoder as literal dashes.
    raw_s3_logs["remote_ip"] = raw_s3_logs["remote_ip"].mask(
        raw_s3_logs["remote_ip"].eq("-"), pd.NA
    )
    geocoded_df = geocode_ips(raw_s3_logs)

    # Parse timestamps like "[21/Jul/2024:06:25:24 +0000]" (timezone-aware).
    format_string = "[%d/%b/%Y:%H:%M:%S %z]"
    geocoded_df["time"] = pd.to_datetime(geocoded_df.time, format=format_string)

    # "-" in bytes_sent means zero bytes; normalize before numeric coercion.
    geocoded_df["bytes_sent"] = geocoded_df["bytes_sent"].mask(
        geocoded_df["bytes_sent"].eq("-"), 0
    )
    numeric_fields = [
        "bytes_sent",
        "http_status",
        "object_size",
        "total_time",
        "turn_around_time",
    ]
    for field in numeric_fields:
        geocoded_df[field] = pd.to_numeric(geocoded_df[field], errors="coerce")

    # request_id is the primary key for this table. Fail loudly on
    # duplicates with an explicit raise rather than ``assert``, which is
    # silently stripped when Python runs with the -O flag.
    geocoded_df = geocoded_df.set_index("request_id")
    if not geocoded_df.index.is_unique:
        raise ValueError("Found duplicate request_id values in the S3 logs.")

    # Drop geocoding columns that aren't needed downstream.
    geocoded_df = geocoded_df.drop(
        columns=[
            "remote_ip_country_flag",
            "remote_ip_country_flag_url",
            "remote_ip_country_currency",
            "remote_ip_continent",
            "remote_ip_isEU",
        ]
    )

    return geocoded_df.reset_index()
Oops, something went wrong.