Skip to content

Commit

Permalink
📊 pertussis deaths (#4077)
Browse files Browse the repository at this point in the history
* 📊 pertussis deaths

* adding pertussis deaths and fixing diphtheria deaths link

* adding garden steps

* update snapshot
  • Loading branch information
spoonerf authored Mar 6, 2025
1 parent 4f3cf48 commit a114e05
Show file tree
Hide file tree
Showing 14 changed files with 426 additions and 8 deletions.
18 changes: 18 additions & 0 deletions dag/health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1151,3 +1151,21 @@ steps:
- data://garden/demography/2024-07-15/population
data://grapher/cdc/2025-03-04/pertussis_cases:
- data://garden/cdc/2025-03-04/pertussis_cases

#
# US Census Bureau - Pertussis deaths
#
data://meadow/us_census_bureau/2025-03-06/pertussis_deaths:
- snapshot://us_census_bureau/2025-03-06/pertussis_deaths.csv
data://garden/us_census_bureau/2025-03-06/pertussis_deaths:
- data://meadow/us_census_bureau/2025-03-06/pertussis_deaths

#
# Pertussis deaths (US Census Bureau + WHO Mortality Database)
#
data://garden/health/2025-03-06/pertussis_deaths:
- data://garden/us_census_bureau/2025-03-06/pertussis_deaths
- data://garden/who/2025-01-17/mortality_database_vaccine_preventable
- data://garden/demography/2024-07-15/population
data://grapher/health/2025-03-06/pertussis_deaths:
- data://garden/health/2025-03-06/pertussis_deaths
8 changes: 5 additions & 3 deletions etl/steps/data/garden/health/2025-03-04/diphtheria_deaths.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def run(dest_dir: str) -> None:
tb_phr = ds_meadow_phr.read("diphtheria_deaths")
tb_census = ds_meadow_census.read("diphtheria_deaths")
tb_who = ds_meadow_who.read("mortality_database_vaccine_preventable", reset_metadata="keep_origins")
tb_who = clean_who_mortality_data(tb_who)
tb_who = clean_who_mortality_data(tb_who, cause="Diphtheria")
tb_pop = ds_population.read("population", reset_metadata="keep_origins")

# Process data.
Expand Down Expand Up @@ -50,13 +50,14 @@ def run(dest_dir: str) -> None:
ds_garden.save()


def clean_who_mortality_data(tb: Table) -> Table:
def clean_who_mortality_data(tb: Table, cause: str) -> Table:
tb = tb[
(tb["cause"] == "Diphtheria")
(tb["cause"] == cause)
& (tb["age_group"] == "all ages")
& (tb["country"] == "United States")
& (tb["sex"] == "Both sexes")
] # type: ignore
assert tb.shape[0] > 1
tb = tb.drop(
columns=[
"sex",
Expand All @@ -68,6 +69,7 @@ def clean_who_mortality_data(tb: Table) -> Table:
"death_rate_per_100_000_population",
]
)

tb = tb.rename(columns={"number": "deaths"})

return tb
27 changes: 27 additions & 0 deletions etl/steps/data/garden/health/2025-03-06/pertussis_deaths.meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Vaccination

# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365

tables:
pertussis_deaths:
variables:
deaths:
title: Pertussis deaths
unit: deaths
description_short: Reported number of [pertussis](#dod:pertussis) deaths in the United States.
display:
numDecimalPlaces: 0
death_rate:
title: Pertussis death rate
unit: deaths per million people
description_short: Reported number of [pertussis](#dod:pertussis) deaths in the United States per million people.
display:
numDecimalPlaces: 1
70 changes: 70 additions & 0 deletions etl/steps/data/garden/health/2025-03-06/pertussis_deaths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Load a meadow dataset and create a garden dataset."""

from owid.catalog import Table
from owid.catalog import processing as pr

from etl.helpers import PathFinder

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run() -> None:
    """Build the garden dataset with pertussis deaths and death rates.

    The US Census Bureau pertussis table is stacked with the United States
    pertussis series extracted from the WHO mortality table, population is
    joined on country and year, and a death rate per million people is derived
    before the result is saved as a garden dataset.
    """
    #
    # Load inputs.
    #
    ds_census = paths.load_dataset("pertussis_deaths", namespace="us_census_bureau")
    ds_who = paths.load_dataset("mortality_database_vaccine_preventable")
    ds_population = paths.load_dataset("population")

    tb_census = ds_census.read("pertussis_deaths")
    tb_who = clean_who_mortality_data(
        ds_who.read("mortality_database_vaccine_preventable", reset_metadata="keep_origins"),
        cause="Pertussis",
    )
    tb_pop = ds_population.read("population", reset_metadata="keep_origins")

    #
    # Process data.
    #
    # Stack both sources into a single table.
    tb = pr.concat([tb_census, tb_who], short_name="pertussis_deaths", ignore_index=True)

    # Left join so every deaths row is kept even when no population row matches.
    tb = pr.merge(tb, tb_pop, on=["country", "year"], how="left")

    # Deaths per million people.
    deaths_per_person = tb["deaths"] / tb["population"]
    tb["death_rate"] = deaths_per_person * 1000000

    # Remove the helper columns that are not indicators. The suffixed source
    # columns presumably appear because both merged tables carry a `source`
    # column - TODO confirm.
    helper_columns = ["population", "source_x", "source_y", "world_pop_share"]
    tb = tb.drop(columns=helper_columns).format(["country", "year"])

    #
    # Save outputs.
    #
    # The new dataset inherits its default metadata from the Census Bureau dataset.
    ds_garden = paths.create_dataset(tables=[tb], default_metadata=ds_census.metadata)
    ds_garden.save()


def clean_who_mortality_data(tb: Table, cause: str) -> Table:
    """Extract the United States, all-ages, both-sexes series for one cause.

    Args:
        tb: WHO mortality table. It must contain the columns ``cause``,
            ``age_group``, ``country``, ``sex``, ``number`` and the columns
            dropped below.
        cause: Value of the ``cause`` column to keep (e.g. ``"Pertussis"``).

    Returns:
        The filtered table without the breakdown and rate columns, and with
        ``number`` renamed to ``deaths``.

    Raises:
        AssertionError: If the filter selects fewer than two rows.
    """
    selection = (
        (tb["cause"] == cause)
        & (tb["age_group"] == "all ages")
        & (tb["country"] == "United States")
        & (tb["sex"] == "Both sexes")
    )
    tb = tb[selection]  # type: ignore

    # Raise explicitly rather than with a bare `assert`: the check must also
    # run under `python -O`, and a message makes a wrong `cause` easy to spot.
    # The exception type and the threshold match the previous `assert`.
    n_rows = tb.shape[0]
    if n_rows <= 1:
        raise AssertionError(
            f"Expected more than one WHO mortality row for cause {cause!r} "
            f"(United States, all ages, both sexes), found {n_rows}."
        )

    # Only country, year and the death count are needed downstream.
    tb = tb.drop(
        columns=[
            "sex",
            "age_group",
            "cause",
            "icd10_codes",
            "percentage_of_cause_specific_deaths_out_of_total_deaths",
            "age_standardized_death_rate_per_100_000_standard_population",
            "death_rate_per_100_000_population",
        ]
    )
    return tb.rename(columns={"number": "deaths"})
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"United States": "United States"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Vaccination


# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365


tables:
pertussis_deaths:
variables:
# testing_variable:
# title: Testing variable title
# unit: arbitrary units
# short_unit: au
# description_short: Short description of testing variable.
# description_processing: Description of processing of testing variable.
# description_key: List of key points about the indicator.
# description_from_producer: Description of testing variable from producer.
# processing_level: minor
# type:
# sort:
# presentation:
# attribution:
# attribution_short:
# faqs:
# grapher_config:
# title_public:
# title_variant:
# topic_tags:
# display:
# name: Testing variable
# numDecimalPlaces: 0
# tolerance: 0
# color:
# conversionFactor: 1
# description:
# entityAnnotationsMap: Test annotation
# includeInTable:
# isProjection: false
# unit: arbitrary units
# shortUnit: au
# tableDisplay:
# hideAbsoluteChange:
# hideRelativeChange:
# yearIsDay: false
# zeroDay:
# roundingMode:
# numSignificantFigures:
#
{}

Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""Load a meadow dataset and create a garden dataset."""

from etl.data_helpers import geo
from etl.helpers import PathFinder

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run() -> None:
    """Create the garden dataset of pertussis deaths from its meadow dataset.

    Country names are harmonized with the step's country mapping and
    excluded-countries files, and the table is formatted with country and year
    as its keys.
    """
    #
    # Load inputs.
    #
    ds_meadow = paths.load_dataset("pertussis_deaths")
    tb_meadow = ds_meadow.read("pertussis_deaths")

    #
    # Process data.
    #
    tb_harmonized = geo.harmonize_countries(
        df=tb_meadow,
        countries_file=paths.country_mapping_path,
        excluded_countries_file=paths.excluded_countries_path,
    )
    tb_garden = tb_harmonized.format(["country", "year"])

    #
    # Save outputs.
    #
    # The garden dataset inherits its default metadata from the meadow dataset.
    paths.create_dataset(tables=[tb_garden], default_metadata=ds_meadow.metadata).save()
26 changes: 26 additions & 0 deletions etl/steps/data/grapher/health/2025-03-06/pertussis_deaths.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run() -> None:
    """Create the grapher dataset of pertussis deaths from its garden dataset."""
    #
    # Load inputs.
    #
    garden_dataset = paths.load_dataset("pertussis_deaths")

    # Read the table without resetting its index.
    tb_garden = garden_dataset.read("pertussis_deaths", reset_index=False)

    #
    # Save outputs.
    #
    # The grapher dataset inherits its default metadata from the garden dataset.
    grapher_dataset = paths.create_dataset(
        tables=[tb_garden],
        default_metadata=garden_dataset.metadata,
    )
    grapher_dataset.save()
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Load a snapshot and create a meadow dataset."""

from etl.helpers import PathFinder

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run() -> None:
    """Create the meadow dataset of pertussis deaths from its CSV snapshot."""
    #
    # Load inputs.
    #
    snapshot = paths.load_snapshot("pertussis_deaths.csv")
    tb_raw = snapshot.read()

    #
    # Process data.
    #
    # Format the table with country and year as its keys.
    tb_formatted = tb_raw.format(["country", "year"])

    #
    # Save outputs.
    #
    # The meadow dataset inherits its default metadata from the snapshot.
    meadow_dataset = paths.create_dataset(
        tables=[tb_formatted],
        default_metadata=snapshot.metadata,
    )
    meadow_dataset.save()
56 changes: 56 additions & 0 deletions etl/steps/export/multidim/health/latest/vaccination_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from etl.collections import multidim

# from etl.db import get_engine
from etl.helpers import PathFinder

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


# etlr multidim
def run(dest_dir: str) -> None:
    """Build and upsert the multidimensional vaccination-coverage data page.

    The configuration from the adjacent YAML file is combined with dimensions
    and views expanded from the ``coverage`` indicator of the
    ``vaccination_coverage`` table, then upserted under the name
    ``mdd-vaccination-who``.

    Args:
        dest_dir: Not used by this step.
    """
    # 1: Load the hand-written configuration from the adjacent YAML file.
    config = paths.load_mdim_config()

    # Only metadata is needed to expand the config; load_data=False skips
    # loading the data, which speeds this up significantly.
    dataset = paths.load_dataset("vaccination_coverage")
    tb_meta = dataset.read("vaccination_coverage", load_data=False)

    # 2: Bake a configuration automatically from the table.
    expanded = multidim.expand_config(tb_meta, indicator_name="coverage", dimensions=["antigen"])

    # 3: Combine both sources: dimensions are merged with those from the YAML
    # file, views come from the expanded configuration.
    yaml_dimensions = config.get("dimensions", {})
    config["dimensions"] = multidim.combine_config_dimensions(
        config_dimensions=expanded["dimensions"],
        config_dimensions_yaml=yaml_dimensions,
    )
    config["views"] = expanded["views"]

    # 4: Upsert to DB.
    multidim.upsert_multidim_data_page(mdim_name="mdd-vaccination-who", config=config, paths=paths)


# multidim.upsert_multidim_data_page(
# config=config,
# paths=paths,
# )
# config["views"] = multidim.generate_views_for_dimensions(
# dimensions=config["dimensions"],
# tables=[tb],
# dimensions_order_in_slug=("metric", "antigen"),
# warn_on_missing_combinations=False,
# additional_config={
# "$schema": "https://files.ourworldindata.org/schemas/grapher-schema.005.json",
# "chartTypes": ["LineChart"],
# "hasMapTab": True,
# "tab": "map",
# "map": {
# "colorScale": {"baseColorScheme": "YlGbBu"},
# },
# },
# )
10 changes: 5 additions & 5 deletions snapshots/us_census_bureau/2025-02-26/diphtheria_deaths.csv.dvc
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ meta:

* 1943: [US Census Bureau “Statistical Abstract of the United States: 1945”](https://www.census.gov/library/publications/1945/compendia/statab/66ed.html)

* 1944: [US Census Bureau “Statistical Abstract of the United States: 1946”](https://www.census.gov/library/publications/1934/compendia/statab/67ed.html)
* 1944: [US Census Bureau “Statistical Abstract of the United States: 1946”](https://www.census.gov/library/publications/1946/compendia/statab/67ed.html)

* 1945: [US Census Bureau “Statistical Abstract of the United States: 1947”](https://www.census.gov/library/publications/1935/compendia/statab/68ed.html)
* 1945: [US Census Bureau “Statistical Abstract of the United States: 1947”](https://www.census.gov/library/publications/1947/compendia/statab/68ed.html)

* 1946: [US Census Bureau “Statistical Abstract of the United States: 1948”](https://www.census.gov/library/publications/1936/compendia/statab/69ed.html)
* 1946: [US Census Bureau “Statistical Abstract of the United States: 1948”](https://www.census.gov/library/publications/1948/compendia/statab/69ed.html)

* 1947: [US Census Bureau “Statistical Abstract of the United States: 1949”](https://www.census.gov/library/publications/1938/compendia/statab/70ed.html)
* 1947: [US Census Bureau “Statistical Abstract of the United States: 1949”](https://www.census.gov/library/publications/1949/compendia/statab/70ed.html)

* 1948: [US Census Bureau “Statistical Abstract of the United States: 1950”](https://www.census.gov/library/publications/1939/compendia/statab/71ed.html)
* 1948: [US Census Bureau “Statistical Abstract of the United States: 1950”](https://www.census.gov/library/publications/1950/compendia/statab/71ed.html)

* 1949: [US Census Bureau “Statistical Abstract of the United States: 1952”](https://www.census.gov/library/publications/1952/compendia/statab/73ed.html)

Expand Down
Loading

0 comments on commit a114e05

Please sign in to comment.