From ccbef1ebce27664d12962103e1f5092cf4530bbe Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 12:27:13 +0000 Subject: [PATCH 01/13] =?UTF-8?q?=F0=9F=93=8A=20test=20autoupdate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From da530f27f67af4d5187299fcaba9a8141d949fdf Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 12:49:05 +0000 Subject: [PATCH 02/13] test --- snapshots/cdc/latest/measles_cases.json.dvc | 2 +- snapshots/cdc/latest/measles_cases.py | 1 - snapshots/cdc/latest/test.json.dvc | 18 +++++++++ snapshots/cdc/latest/test.py | 45 +++++++++++++++++++++ 4 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 snapshots/cdc/latest/test.json.dvc create mode 100644 snapshots/cdc/latest/test.py diff --git a/snapshots/cdc/latest/measles_cases.json.dvc b/snapshots/cdc/latest/measles_cases.json.dvc index 0ef0c33d232..cc726d2896f 100644 --- a/snapshots/cdc/latest/measles_cases.json.dvc +++ b/snapshots/cdc/latest/measles_cases.json.dvc @@ -8,7 +8,7 @@ meta: url_main: https://www.cdc.gov/measles/data-research/ url_download: https://www.cdc.gov/wcms/vizdata/measles/MeaslesCasesYear.json date_accessed: 2025-03-10 - date_published: '2025-03-06' + date_published: '2025-01-01' license: name: Public domain url: https://www.cdc.gov/other/agencymaterials.html diff --git a/snapshots/cdc/latest/measles_cases.py b/snapshots/cdc/latest/measles_cases.py index 9177a70ae41..64787c92832 100644 --- a/snapshots/cdc/latest/measles_cases.py +++ b/snapshots/cdc/latest/measles_cases.py @@ -21,7 +21,6 @@ def main(upload: bool) -> None: snap = Snapshot(f"cdc/{SNAPSHOT_VERSION}/measles_cases.json") date = get_date_of_update() # Download data from source, add file to DVC and upload to S3. - # snap.create_snapshot(upload=upload) snap = modify_metadata(snap, date) # Add the file to DVC and optionally upload it to S3, based on the `upload` parameter. # snap.dvc_add(upload=upload) diff --git a/snapshots/cdc/latest/test.json.dvc b/snapshots/cdc/latest/test.json.dvc new file mode 100644 index 00000000000..b0fb8120c28 --- /dev/null +++ b/snapshots/cdc/latest/test.json.dvc @@ -0,0 +1,18 @@ +meta: + origin: + producer: TEST + title: TEST + description: Annual measles cases as reported by the Centers for Disease Control and Prevention (CDC). + citation_full: Measles Cases and Outbreaks (2025). Centers for Disease Control and Prevention (CDC). + attribution_short: CDC + url_main: https://www.cdc.gov/measles/data-research/ + url_download: https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases2.csv + date_accessed: 2025-03-10 + date_published: '2024-03-02' + license: + name: Public domain + url: https://www.cdc.gov/other/agencymaterials.html +outs: + - md5: 010745ab91b7dcd112c275ef6d5e30ec + size: 42 + path: test.json diff --git a/snapshots/cdc/latest/test.py b/snapshots/cdc/latest/test.py new file mode 100644 index 00000000000..43e670fe56a --- /dev/null +++ b/snapshots/cdc/latest/test.py @@ -0,0 +1,45 @@ +"""Script to create a snapshot of dataset.""" + +import datetime as dt +from pathlib import Path + +import click +import pandas as pd + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +def main(upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"cdc/{SNAPSHOT_VERSION}/test.json") + date = get_date_of_update() + # Download data from source, add file to DVC and upload to S3. + snap = modify_metadata(snap, date) + # Add the file to DVC and optionally upload it to S3, based on the `upload` parameter. + # snap.dvc_add(upload=upload) + snap.create_snapshot(upload=upload) + + +def modify_metadata(snap: Snapshot, date: str) -> Snapshot: + snap.metadata.origin.date_published = date # type: ignore + snap.metadata.origin.date_accessed = dt.date.today() # type: ignore + snap.metadata.save() + return snap + + +def get_date_of_update() -> str: + """ + Get the date of the latest update for yearly measles from the CDC website - https://www.cdc.gov/measles/data-research/ + """ + df = pd.read_csv("https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases2.csv") + date = df["date"][0] + return date + + +if __name__ == "__main__": + main() From 9a2d6b9130fe3db65985f4f68c3a5f366c8b51cb Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:26:49 +0000 Subject: [PATCH 03/13] diff --- snapshots/cdc/latest/test.json.dvc | 8 ++++---- snapshots/cdc/latest/test.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/snapshots/cdc/latest/test.json.dvc b/snapshots/cdc/latest/test.json.dvc index b0fb8120c28..f9df3f8aea0 100644 --- a/snapshots/cdc/latest/test.json.dvc +++ b/snapshots/cdc/latest/test.json.dvc @@ -6,13 +6,13 @@ meta: citation_full: Measles Cases and Outbreaks (2025). Centers for Disease Control and Prevention (CDC). attribution_short: CDC url_main: https://www.cdc.gov/measles/data-research/ - url_download: https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases2.csv + url_download: https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases.csv date_accessed: 2025-03-10 - date_published: '2024-03-02' + date_published: '2025-12-12' license: name: Public domain url: https://www.cdc.gov/other/agencymaterials.html outs: - - md5: 010745ab91b7dcd112c275ef6d5e30ec - size: 42 + - md5: 8475640a6a58a95d297de1c52c060471 + size: 58 path: test.json diff --git a/snapshots/cdc/latest/test.py b/snapshots/cdc/latest/test.py index 43e670fe56a..55ebb668bf8 100644 --- a/snapshots/cdc/latest/test.py +++ b/snapshots/cdc/latest/test.py @@ -22,7 +22,7 @@ def main(upload: bool) -> None: snap = modify_metadata(snap, date) # Add the file to DVC and optionally upload it to S3, based on the `upload` parameter. # snap.dvc_add(upload=upload) - snap.create_snapshot(upload=upload) + snap.dvc_add(upload=upload) def modify_metadata(snap: Snapshot, date: str) -> Snapshot: @@ -36,7 +36,7 @@ def get_date_of_update() -> str: """ Get the date of the latest update for yearly measles from the CDC website - https://www.cdc.gov/measles/data-research/ """ - df = pd.read_csv("https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases2.csv") + df = pd.read_csv("https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases.csv") date = df["date"][0] return date From b36ba0eb2a93ffd1b6d772700fb2964cacb55ab3 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:28:03 +0000 Subject: [PATCH 04/13] dvc_add --- snapshots/cdc/latest/test.json.dvc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshots/cdc/latest/test.json.dvc b/snapshots/cdc/latest/test.json.dvc index f9df3f8aea0..5c4879e1610 100644 --- a/snapshots/cdc/latest/test.json.dvc +++ b/snapshots/cdc/latest/test.json.dvc @@ -8,7 +8,7 @@ meta: url_main: https://www.cdc.gov/measles/data-research/ url_download: https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases.csv date_accessed: 2025-03-10 - date_published: '2025-12-12' + date_published: '2026-12-25' license: name: Public domain url: https://www.cdc.gov/other/agencymaterials.html From 8b31069287ab33c2e7eb43feb86645a71bb30aff Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:29:33 +0000 Subject: [PATCH 05/13] create_snapshot --- snapshots/cdc/latest/test.json.dvc | 4 ++-- snapshots/cdc/latest/test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshots/cdc/latest/test.json.dvc b/snapshots/cdc/latest/test.json.dvc index 5c4879e1610..8b87b4eaca8 100644 --- a/snapshots/cdc/latest/test.json.dvc +++ b/snapshots/cdc/latest/test.json.dvc @@ -13,6 +13,6 @@ meta: name: Public domain url: https://www.cdc.gov/other/agencymaterials.html outs: - - md5: 8475640a6a58a95d297de1c52c060471 - size: 58 + - md5: 757f4a09d8fcdda4f0d2708d697a7d25 + size: 80 path: test.json diff --git a/snapshots/cdc/latest/test.py b/snapshots/cdc/latest/test.py index 55ebb668bf8..8c8145fd813 100644 --- a/snapshots/cdc/latest/test.py +++ b/snapshots/cdc/latest/test.py @@ -22,7 +22,7 @@ def main(upload: bool) -> None: snap = modify_metadata(snap, date) # Add the file to DVC and optionally upload it to S3, based on the `upload` parameter. # snap.dvc_add(upload=upload) - snap.dvc_add(upload=upload) + snap.create_snapshot(upload=upload) def modify_metadata(snap: Snapshot, date: str) -> Snapshot: From 0fa4a9ee439a8b92157c382fb6281dd96f0379c0 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:33:37 +0000 Subject: [PATCH 06/13] create_snapshot --- snapshots/cdc/latest/test.json.dvc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshots/cdc/latest/test.json.dvc b/snapshots/cdc/latest/test.json.dvc index 8b87b4eaca8..fb9a7d0a7af 100644 --- a/snapshots/cdc/latest/test.json.dvc +++ b/snapshots/cdc/latest/test.json.dvc @@ -13,6 +13,6 @@ meta: name: Public domain url: https://www.cdc.gov/other/agencymaterials.html outs: - - md5: 757f4a09d8fcdda4f0d2708d697a7d25 - size: 80 + - md5: 98c890d6cd54eef73ab01f27a02f27a1 + size: 102 path: test.json From 6f87f3aed96ecc0a63324c119dbab9d39d0e5b18 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:38:05 +0000 Subject: [PATCH 07/13] add both dvc_add and snapshot --- snapshots/cdc/latest/measles_cases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshots/cdc/latest/measles_cases.py b/snapshots/cdc/latest/measles_cases.py index 64787c92832..807306b0e3c 100644 --- a/snapshots/cdc/latest/measles_cases.py +++ b/snapshots/cdc/latest/measles_cases.py @@ -23,7 +23,7 @@ def main(upload: bool) -> None: # Download data from source, add file to DVC and upload to S3. snap = modify_metadata(snap, date) # Add the file to DVC and optionally upload it to S3, based on the `upload` parameter. - # snap.dvc_add(upload=upload) + snap.dvc_add(upload=upload) snap.create_snapshot(upload=upload) From bbecd193d16495b52203a5746e97e635eeba860e Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:38:45 +0000 Subject: [PATCH 08/13] dvc_add and create_snapshot, both metadata and data updated --- snapshots/cdc/latest/test.json.dvc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshots/cdc/latest/test.json.dvc b/snapshots/cdc/latest/test.json.dvc index fb9a7d0a7af..206dc271f54 100644 --- a/snapshots/cdc/latest/test.json.dvc +++ b/snapshots/cdc/latest/test.json.dvc @@ -8,11 +8,11 @@ meta: url_main: https://www.cdc.gov/measles/data-research/ url_download: https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases.csv date_accessed: 2025-03-10 - date_published: '2026-12-25' + date_published: '2029-12-25' license: name: Public domain url: https://www.cdc.gov/other/agencymaterials.html outs: - - md5: 98c890d6cd54eef73ab01f27a02f27a1 - size: 102 + - md5: 97a9c9d445e4a3ff0e172011f8a80ea1 + size: 127 path: test.json From 8156043becb2ba7b70cc08702c3589789e307cfa Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:39:19 +0000 Subject: [PATCH 09/13] try just create_snapshot --- snapshots/cdc/latest/measles_cases.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/snapshots/cdc/latest/measles_cases.py b/snapshots/cdc/latest/measles_cases.py index 807306b0e3c..64787c92832 100644 --- a/snapshots/cdc/latest/measles_cases.py +++ b/snapshots/cdc/latest/measles_cases.py @@ -23,7 +23,7 @@ def main(upload: bool) -> None: # Download data from source, add file to DVC and upload to S3. snap = modify_metadata(snap, date) # Add the file to DVC and optionally upload it to S3, based on the `upload` parameter. - snap.dvc_add(upload=upload) + # snap.dvc_add(upload=upload) snap.create_snapshot(upload=upload) From 16fa4b72977f624d65bc8c7714fa95a06fdfb943 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:43:25 +0000 Subject: [PATCH 10/13] data changes with create_snapshot --- snapshots/cdc/latest/test.json.dvc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshots/cdc/latest/test.json.dvc b/snapshots/cdc/latest/test.json.dvc index 206dc271f54..966f4e7a309 100644 --- a/snapshots/cdc/latest/test.json.dvc +++ b/snapshots/cdc/latest/test.json.dvc @@ -13,6 +13,6 @@ meta: name: Public domain url: https://www.cdc.gov/other/agencymaterials.html outs: - - md5: 97a9c9d445e4a3ff0e172011f8a80ea1 - size: 127 + - md5: b96f8c9e06ac14c560011568ee3e6ed7 + size: 150 path: test.json From eaef033d3ba493e92683d237825d3e46f282b85e Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:43:54 +0000 Subject: [PATCH 11/13] try dvc_add --- snapshots/cdc/latest/measles_cases.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshots/cdc/latest/measles_cases.py b/snapshots/cdc/latest/measles_cases.py index 64787c92832..d55f5d35130 100644 --- a/snapshots/cdc/latest/measles_cases.py +++ b/snapshots/cdc/latest/measles_cases.py @@ -23,8 +23,8 @@ def main(upload: bool) -> None: # Download data from source, add file to DVC and upload to S3. snap = modify_metadata(snap, date) # Add the file to DVC and optionally upload it to S3, based on the `upload` parameter. - # snap.dvc_add(upload=upload) - snap.create_snapshot(upload=upload) + snap.dvc_add(upload=upload) + # snap.create_snapshot(upload=upload) def modify_metadata(snap: Snapshot, date: str) -> Snapshot: From b979b4a4817163956abd255c048622d770f5add7 Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:50:29 +0000 Subject: [PATCH 12/13] wip --- snapshots/cdc/latest/test.json.dvc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshots/cdc/latest/test.json.dvc b/snapshots/cdc/latest/test.json.dvc index 966f4e7a309..eb29a07512f 100644 --- a/snapshots/cdc/latest/test.json.dvc +++ b/snapshots/cdc/latest/test.json.dvc @@ -8,11 +8,11 @@ meta: url_main: https://www.cdc.gov/measles/data-research/ url_download: https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases.csv date_accessed: 2025-03-10 - date_published: '2029-12-25' + date_published: '2031-12-25' license: name: Public domain url: https://www.cdc.gov/other/agencymaterials.html outs: - - md5: b96f8c9e06ac14c560011568ee3e6ed7 - size: 150 + - md5: 68d152991d89c11c616a787e778ecbde + size: 171 path: test.json From ca0c78387a6151213474dbb62c5ed1da93ea7a5e Mon Sep 17 00:00:00 2001 From: spoonerf Date: Mon, 10 Mar 2025 13:55:09 +0000 Subject: [PATCH 13/13] dvc_add updates dates and metadata --- snapshots/cdc/latest/test.json.dvc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/snapshots/cdc/latest/test.json.dvc b/snapshots/cdc/latest/test.json.dvc index eb29a07512f..f4bdb19ff49 100644 --- a/snapshots/cdc/latest/test.json.dvc +++ b/snapshots/cdc/latest/test.json.dvc @@ -8,11 +8,11 @@ meta: url_main: https://www.cdc.gov/measles/data-research/ url_download: https://raw.githubusercontent.com/spoonerf/test/refs/heads/main/test_cases.csv date_accessed: 2025-03-10 - date_published: '2031-12-25' + date_published: '2032-12-25' license: name: Public domain url: https://www.cdc.gov/other/agencymaterials.html outs: - - md5: 68d152991d89c11c616a787e778ecbde - size: 171 + - md5: 98a478570ad03da7ac4f93b265742317 + size: 191 path: test.json