📊 pertussis deaths (#4077)

* 📊 pertussis deaths * adding pertussis deaths and fixing diphtheria deaths link * adding garden steps * update snapshot
owid · Mar 6, 2025 · a114e05 · a114e05
1 parent 4f3cf48
commit a114e05
Show file tree

Hide file tree

Showing 14 changed files with 426 additions and 8 deletions.
diff --git a/dag/health.yml b/dag/health.yml
@@ -1151,3 +1151,21 @@ steps:
     - data://garden/demography/2024-07-15/population
   data://grapher/cdc/2025-03-04/pertussis_cases:
     - data://garden/cdc/2025-03-04/pertussis_cases
+
+  #
+  # US Census Bureau - Pertussis deaths
+  #
+  data://meadow/us_census_bureau/2025-03-06/pertussis_deaths:
+    - snapshot://us_census_bureau/2025-03-06/pertussis_deaths.csv
+  data://garden/us_census_bureau/2025-03-06/pertussis_deaths:
+    - data://meadow/us_census_bureau/2025-03-06/pertussis_deaths
+
+  #
+  # Pertussis deaths (US Census Bureau + WHO Mortality Database)
+  #
+  data://garden/health/2025-03-06/pertussis_deaths:
+    - data://garden/us_census_bureau/2025-03-06/pertussis_deaths
+    - data://garden/who/2025-01-17/mortality_database_vaccine_preventable
+    - data://garden/demography/2024-07-15/population
+  data://grapher/health/2025-03-06/pertussis_deaths:
+    - data://garden/health/2025-03-06/pertussis_deaths
diff --git a/etl/steps/data/garden/health/2025-03-04/diphtheria_deaths.py b/etl/steps/data/garden/health/2025-03-04/diphtheria_deaths.py
@@ -22,7 +22,7 @@ def run(dest_dir: str) -> None:
     tb_phr = ds_meadow_phr.read("diphtheria_deaths")
     tb_census = ds_meadow_census.read("diphtheria_deaths")
     tb_who = ds_meadow_who.read("mortality_database_vaccine_preventable", reset_metadata="keep_origins")
-    tb_who = clean_who_mortality_data(tb_who)
+    tb_who = clean_who_mortality_data(tb_who, cause="Diphtheria")
     tb_pop = ds_population.read("population", reset_metadata="keep_origins")
 
     # Process data.
@@ -50,13 +50,14 @@ def run(dest_dir: str) -> None:
     ds_garden.save()
 
 
-def clean_who_mortality_data(tb: Table) -> Table:
+def clean_who_mortality_data(tb: Table, cause: str) -> Table:
     tb = tb[
-        (tb["cause"] == "Diphtheria")
+        (tb["cause"] == cause)
         & (tb["age_group"] == "all ages")
         & (tb["country"] == "United States")
         & (tb["sex"] == "Both sexes")
     ]  # type: ignore
+    assert tb.shape[0] > 1
     tb = tb.drop(
         columns=[
             "sex",
@@ -68,6 +69,7 @@ def clean_who_mortality_data(tb: Table) -> Table:
             "death_rate_per_100_000_population",
         ]
     )
+
     tb = tb.rename(columns={"number": "deaths"})
 
     return tb
diff --git a/etl/steps/data/garden/health/2025-03-06/pertussis_deaths.meta.yml b/etl/steps/data/garden/health/2025-03-06/pertussis_deaths.meta.yml
@@ -0,0 +1,27 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Vaccination
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+
+tables:
+  pertussis_deaths:
+    variables:
+      deaths:
+        title: Pertussis deaths
+        unit: deaths
+        description_short: Reported number of [pertussis](#dod:pertussis) deaths in the United States.
+        display:
+          numDecimalPlaces: 0
+      death_rate:
+        title: Pertussis death rate
+        unit: deaths per million people
+        description_short: Reported number of [pertussis](#dod:pertussis) deaths in the United States per million people.
+        display:
+          numDecimalPlaces: 1
diff --git a/etl/steps/data/garden/health/2025-03-06/pertussis_deaths.py b/etl/steps/data/garden/health/2025-03-06/pertussis_deaths.py
@@ -0,0 +1,70 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from owid.catalog import Table
+from owid.catalog import processing as pr
+
+from etl.helpers import PathFinder
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run() -> None:
+    """Build the garden dataset of pertussis deaths and death rates.
+
+    Stacks the US Census Bureau table with the WHO mortality rows selected by
+    `clean_who_mortality_data`, left-joins population on country and year, and
+    derives a death rate per million people.
+    """
+    #
+    # Load inputs.
+    #
+    # Load the three input datasets.
+    ds_census = paths.load_dataset("pertussis_deaths", namespace="us_census_bureau")
+    ds_mortality = paths.load_dataset("mortality_database_vaccine_preventable")
+    ds_pop = paths.load_dataset("population")
+
+    # Read the US Census Bureau and WHO tables.
+    tb_census = ds_census.read("pertussis_deaths")
+    tb_mortality = ds_mortality.read("mortality_database_vaccine_preventable", reset_metadata="keep_origins")
+
+    # Filter the WHO table down to the pertussis series, then read population.
+    tb_mortality = clean_who_mortality_data(tb_mortality, cause="Pertussis")
+    tb_population = ds_pop.read("population", reset_metadata="keep_origins")
+
+    #
+    # Process data.
+    #
+    # Stack both sources into a single table.
+    tb_deaths = pr.concat([tb_census, tb_mortality], short_name="pertussis_deaths", ignore_index=True)
+
+    # Attach population to every deaths row.
+    tb_deaths = pr.merge(tb_deaths, tb_population, on=["country", "year"], how="left")
+
+    # Deaths per million people.
+    tb_deaths["death_rate"] = tb_deaths["deaths"] / tb_deaths["population"] * 1_000_000
+
+    # Drop the columns that are not indicators.
+    # NOTE(review): `source_x`/`source_y` are presumably the suffixed `source` columns
+    # created by the merge - confirm against the input tables.
+    unused_columns = ["population", "source_x", "source_y", "world_pop_share"]
+    tb_deaths = tb_deaths.drop(columns=unused_columns)
+
+    # Improve table format.
+    tb_deaths = tb_deaths.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create the garden dataset, reusing the US Census Bureau dataset metadata as default.
+    garden_tables = [tb_deaths]
+    ds_garden = paths.create_dataset(tables=garden_tables, default_metadata=ds_census.metadata)
+
+    # Write it to disk.
+    ds_garden.save()
+
+
+def clean_who_mortality_data(tb: Table, cause: str) -> Table:
+    """Reduce the WHO mortality table to the United States deaths series for one cause.
+
+    Args:
+        tb: WHO mortality table containing the columns filtered and dropped below.
+        cause: Value of the `cause` column to keep, e.g. "Pertussis".
+
+    Returns:
+        The rows for "United States", "all ages" and "Both sexes" with the given cause,
+        without the sex, age group, cause, ICD-10 and rate columns, and with `number`
+        renamed to `deaths`.
+
+    Raises:
+        AssertionError: If the filter matches fewer than two rows.
+    """
+    is_selected = (
+        (tb["cause"] == cause)
+        & (tb["age_group"] == "all ages")
+        & (tb["country"] == "United States")
+        & (tb["sex"] == "Both sexes")
+    )
+    tb = tb[is_selected]  # type: ignore
+
+    # A mistyped or renamed `cause` would otherwise produce a silently (near-)empty series.
+    n_rows = tb.shape[0]
+    assert n_rows > 1, f"Expected more than one WHO mortality row for cause '{cause}', found {n_rows}."
+
+    # Drop the filter columns (now constant) and the indicators that are not used downstream.
+    tb = tb.drop(
+        columns=[
+            "sex",
+            "age_group",
+            "cause",
+            "icd10_codes",
+            "percentage_of_cause_specific_deaths_out_of_total_deaths",
+            "age_standardized_death_rate_per_100_000_standard_population",
+            "death_rate_per_100_000_population",
+        ]
+    )
+    tb = tb.rename(columns={"number": "deaths"})
+
+    return tb
diff --git a/etl/steps/data/garden/us_census_bureau/2025-03-06/pertussis_deaths.countries.json b/etl/steps/data/garden/us_census_bureau/2025-03-06/pertussis_deaths.countries.json
@@ -0,0 +1,3 @@
+{
+  "United States": "United States"
+}
diff --git a/etl/steps/data/garden/us_census_bureau/2025-03-06/pertussis_deaths.excluded_countries.json b/etl/steps/data/garden/us_census_bureau/2025-03-06/pertussis_deaths.excluded_countries.json
@@ -0,0 +1,2 @@
+[
+]
diff --git a/etl/steps/data/garden/us_census_bureau/2025-03-06/pertussis_deaths.meta.yml b/etl/steps/data/garden/us_census_bureau/2025-03-06/pertussis_deaths.meta.yml
@@ -0,0 +1,58 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Vaccination
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+
+
+tables:
+  pertussis_deaths:
+    variables:
+      # testing_variable:
+      #   title: Testing variable title
+      #   unit: arbitrary units
+      #   short_unit: au
+      #   description_short: Short description of testing variable.
+      #   description_processing: Description of processing of testing variable.
+      #   description_key: List of key points about the indicator.
+      #   description_from_producer: Description of testing variable from producer.
+      #   processing_level: minor
+      #   type:
+      #   sort:
+      #   presentation:
+      #     attribution:
+      #     attribution_short:
+      #     faqs:
+      #     grapher_config:
+      #     title_public:
+      #     title_variant:
+      #     topic_tags:
+      #   display:
+      #     name: Testing variable
+      #     numDecimalPlaces: 0
+      #     tolerance: 0
+      #     color:
+      #     conversionFactor: 1
+      #     description:
+      #     entityAnnotationsMap: Test annotation
+      #     includeInTable:
+      #     isProjection: false
+      #     unit: arbitrary units
+      #     shortUnit: au
+      #     tableDisplay:
+      #       hideAbsoluteChange:
+      #       hideRelativeChange:
+      #     yearIsDay: false
+      #     zeroDay:
+      #     roundingMode:
+      #     numSignificantFigures:
+      #
+      {}
+
diff --git a/etl/steps/data/garden/us_census_bureau/2025-03-06/pertussis_deaths.py b/etl/steps/data/garden/us_census_bureau/2025-03-06/pertussis_deaths.py
@@ -0,0 +1,38 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from etl.data_helpers import geo
+from etl.helpers import PathFinder
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run() -> None:
+    """Harmonize country names in the meadow pertussis deaths table and save it as a garden dataset."""
+    #
+    # Load inputs.
+    #
+    # Load the meadow dataset and read its pertussis deaths table.
+    ds_meadow = paths.load_dataset("pertussis_deaths")
+    tb_deaths = ds_meadow.read("pertussis_deaths")
+
+    #
+    # Process data.
+    #
+    # Harmonize country names using this step's mapping and exclusion files.
+    tb_deaths = geo.harmonize_countries(
+        df=tb_deaths,
+        countries_file=paths.country_mapping_path,
+        excluded_countries_file=paths.excluded_countries_path,
+    )
+
+    # Improve table format.
+    tb_deaths = tb_deaths.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create the garden dataset, reusing the meadow metadata as default.
+    garden_tables = [tb_deaths]
+    ds_garden = paths.create_dataset(tables=garden_tables, default_metadata=ds_meadow.metadata)
+
+    # Write it to disk.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/health/2025-03-06/pertussis_deaths.py b/etl/steps/data/grapher/health/2025-03-06/pertussis_deaths.py
@@ -0,0 +1,26 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run() -> None:
+    """Copy the garden pertussis deaths table into a grapher dataset without further processing."""
+    #
+    # Load inputs.
+    #
+    # Load the garden dataset and read its table, keeping the existing index.
+    ds_garden = paths.load_dataset("pertussis_deaths")
+    tb_deaths = ds_garden.read("pertussis_deaths", reset_index=False)
+
+    #
+    # Save outputs.
+    #
+    # Create the grapher dataset, reusing the garden metadata as default.
+    grapher_tables = [tb_deaths]
+    ds_grapher = paths.create_dataset(tables=grapher_tables, default_metadata=ds_garden.metadata)
+
+    # Write it to disk.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/us_census_bureau/2025-03-06/pertussis_deaths.py b/etl/steps/data/meadow/us_census_bureau/2025-03-06/pertussis_deaths.py
@@ -0,0 +1,32 @@
+"""Load a snapshot and create a meadow dataset."""
+
+from etl.helpers import PathFinder
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run() -> None:
+    """Read the pertussis deaths snapshot and store it as a meadow dataset."""
+    #
+    # Load inputs.
+    #
+    # Retrieve the snapshot and load its data.
+    snap = paths.load_snapshot("pertussis_deaths.csv")
+    tb_raw = snap.read()
+
+    #
+    # Process data.
+    #
+    # Improve table format.
+    tb_formatted = tb_raw.format(["country", "year"])
+
+    #
+    # Save outputs.
+    #
+    # Create the meadow dataset, reusing the snapshot metadata as default.
+    ds_meadow = paths.create_dataset(tables=[tb_formatted], default_metadata=snap.metadata)
+
+    # Write it to disk.
+    ds_meadow.save()
diff --git a/etl/steps/export/multidim/health/latest/vaccination_coverage.py b/etl/steps/export/multidim/health/latest/vaccination_coverage.py
@@ -0,0 +1,56 @@
+from etl.collections import multidim
+
+# from etl.db import get_engine
+from etl.helpers import PathFinder
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+# etlr multidim
+def run(dest_dir: str) -> None:
+    """Build the vaccination coverage multidimensional config and upsert it.
+
+    The config from the adjacent yaml file is combined with dimensions and views
+    generated from the `vaccination_coverage` table. `dest_dir` is accepted but
+    not used by this function.
+    """
+    # 1: Load configuration from adjacent yaml file.
+    config = paths.load_mdim_config()
+
+    # Read the table that the dimensions and views are generated from.
+    # NOTE: using load_data=False which only loads metadata significantly speeds this up
+    ds_coverage = paths.load_dataset("vaccination_coverage")
+    tb_coverage = ds_coverage.read("vaccination_coverage", load_data=False)
+
+    # 2: Bake config automatically from table
+    expanded = multidim.expand_config(tb_coverage, indicator_name="coverage", dimensions=["antigen"])
+
+    # 3: Combine both sources (basically dimensions and views)
+    yaml_dimensions = config.get("dimensions", {})
+    config["dimensions"] = multidim.combine_config_dimensions(
+        config_dimensions=expanded["dimensions"],
+        config_dimensions_yaml=yaml_dimensions,
+    )
+    config["views"] = expanded["views"]
+
+    # 4: Upsert to DB
+    multidim.upsert_multidim_data_page(mdim_name="mdd-vaccination-who", config=config, paths=paths)
+
+
+#    multidim.upsert_multidim_data_page(
+#        config=config,
+#        paths=paths,
+#    )
+# config["views"] = multidim.generate_views_for_dimensions(
+#        dimensions=config["dimensions"],
+#        tables=[tb],
+#        dimensions_order_in_slug=("metric", "antigen"),
+#        warn_on_missing_combinations=False,
+#       additional_config={
+#            "$schema": "https://files.ourworldindata.org/schemas/grapher-schema.005.json",
+#            "chartTypes": ["LineChart"],
+#            "hasMapTab": True,
+#            "tab": "map",
+#            "map": {
+#                "colorScale": {"baseColorScheme": "YlGnBu"},
+#            },
+#        },
+#    )
diff --git a/snapshots/us_census_bureau/2025-02-26/diphtheria_deaths.csv.dvc b/snapshots/us_census_bureau/2025-02-26/diphtheria_deaths.csv.dvc
@@ -11,15 +11,15 @@ meta:
 
       * 1943: [US Census Bureau “Statistical Abstract of the United States: 1945”](https://www.census.gov/library/publications/1945/compendia/statab/66ed.html)
 
-      * 1944: [US Census Bureau “Statistical Abstract of the United States: 1946”](https://www.census.gov/library/publications/1934/compendia/statab/67ed.html)
+      * 1944: [US Census Bureau “Statistical Abstract of the United States: 1946”](https://www.census.gov/library/publications/1946/compendia/statab/67ed.html)
 
-      * 1945: [US Census Bureau “Statistical Abstract of the United States: 1947”](https://www.census.gov/library/publications/1935/compendia/statab/68ed.html)
+      * 1945: [US Census Bureau “Statistical Abstract of the United States: 1947”](https://www.census.gov/library/publications/1947/compendia/statab/68ed.html)
 
-      * 1946: [US Census Bureau “Statistical Abstract of the United States: 1948”](https://www.census.gov/library/publications/1936/compendia/statab/69ed.html)
+      * 1946: [US Census Bureau “Statistical Abstract of the United States: 1948”](https://www.census.gov/library/publications/1948/compendia/statab/69ed.html)
 
-      * 1947: [US Census Bureau “Statistical Abstract of the United States: 1949”](https://www.census.gov/library/publications/1938/compendia/statab/70ed.html)
+      * 1947: [US Census Bureau “Statistical Abstract of the United States: 1949”](https://www.census.gov/library/publications/1949/compendia/statab/70ed.html)
 
-      * 1948: [US Census Bureau “Statistical Abstract of the United States: 1950”](https://www.census.gov/library/publications/1939/compendia/statab/71ed.html)
+      * 1948: [US Census Bureau “Statistical Abstract of the United States: 1950”](https://www.census.gov/library/publications/1950/compendia/statab/71ed.html)
 
       * 1949: [US Census Bureau “Statistical Abstract of the United States: 1952”](https://www.census.gov/library/publications/1952/compendia/statab/73ed.html)