Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

📊 test autoupdate #4093

Closed
wants to merge 13 commits into from
2 changes: 1 addition & 1 deletion snapshots/cdc/latest/measles_cases.json.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ meta:
url_main: https://www.cdc.gov/measles/data-research/
url_download: https://www.cdc.gov/wcms/vizdata/measles/MeaslesCasesYear.json
date_accessed: 2025-03-10
date_published: '2025-03-06'
date_published: '2025-01-01'
license:
name: Public domain
url: https://www.cdc.gov/other/agencymaterials.html
Expand Down
5 changes: 2 additions & 3 deletions snapshots/cdc/latest/measles_cases.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,10 @@ def main(upload: bool) -> None:
snap = Snapshot(f"cdc/{SNAPSHOT_VERSION}/measles_cases.json")
date = get_date_of_update()
# Download data from source, add file to DVC and upload to S3.
# snap.create_snapshot(upload=upload)
snap = modify_metadata(snap, date)
# Add the file to DVC and optionally upload it to S3, based on the `upload` parameter.
# snap.dvc_add(upload=upload)
snap.create_snapshot(upload=upload)
snap.dvc_add(upload=upload)
# snap.create_snapshot(upload=upload)


def modify_metadata(snap: Snapshot, date: str) -> Snapshot:
Expand Down
18 changes: 18 additions & 0 deletions snapshots/cdc/latest/test.json.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
meta:
origin:
producer: TEST
title: TEST
description: Annual measles cases as reported by the Centers for Disease Control and Prevention (CDC).
citation_full: Measles Cases and Outbreaks (2025). Centers for Disease Control and Prevention (CDC).
attribution_short: CDC
url_main: https://www.cdc.gov/measles/data-research/
url_download: https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases.csv
date_accessed: 2025-03-10
date_published: '2032-12-25'
license:
name: Public domain
url: https://www.cdc.gov/other/agencymaterials.html
outs:
- md5: 98a478570ad03da7ac4f93b265742317
size: 191
path: test.json
45 changes: 45 additions & 0 deletions snapshots/cdc/latest/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
"""Script to create a snapshot of dataset."""

import datetime as dt
from pathlib import Path

import click
import pandas as pd

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
# Taken from the name of the directory that contains this script, so the
# version follows the folder the file lives in.
SNAPSHOT_VERSION = Path(__file__).parent.name


# Command-line entry point: stamp the snapshot metadata with fresh dates, then
# create the snapshot. (Kept as comments rather than a docstring so the text
# click shows for --help is unchanged.)
@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # Snapshot object for cdc/<version>/test.json.
    snapshot = Snapshot(f"cdc/{SNAPSHOT_VERSION}/test.json")

    # Look up the date reported by the source and write it (together with
    # today's access date) into the snapshot metadata before creating it.
    published_on = get_date_of_update()
    snapshot = modify_metadata(snapshot, published_on)

    # NOTE(review): per the original comments, create_snapshot downloads the
    # data, adds the file to DVC and uploads it to S3 depending on `upload`
    # -- confirm against etl.snapshot.
    snapshot.create_snapshot(upload=upload)


def modify_metadata(snap: Snapshot, date: str) -> Snapshot:
    """Stamp the snapshot's origin metadata with new dates and save it.

    Args:
        snap: Snapshot whose ``metadata.origin`` is updated in place.
        date: Value stored as ``date_published``.

    Returns:
        The same ``snap`` object that was passed in, after
        ``snap.metadata.save()`` has been called.
    """
    origin = snap.metadata.origin
    origin.date_published = date  # type: ignore
    # The access date is always the day this function runs.
    origin.date_accessed = dt.date.today()  # type: ignore
    snap.metadata.save()
    return snap


def get_date_of_update() -> str:
    """
    Get the date of the latest update from the remote test CSV.

    Downloads the CSV at the hard-coded GitHub URL below and returns the entry
    labelled ``0`` of its ``date`` column.
    """
    source_url = "https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases.csv"
    table = pd.read_csv(source_url)
    return table["date"][0]


# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    main()