Skip to content

Commit

Permalink
✨ OMM grapher step
Browse files Browse the repository at this point in the history
  • Loading branch information
paarriagadap committed Mar 7, 2025
1 parent ea222f7 commit b17735f
Show file tree
Hide file tree
Showing 7 changed files with 223 additions and 5 deletions.
10 changes: 10 additions & 0 deletions dag/redistribution.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@ steps:
data://grapher/oecd/2025-02-25/social_expenditure:
- data://garden/oecd/2025-02-25/social_expenditure

#
# Social expenditure OMM
#
data://garden/social_expenditure/2025-03-07/social_expenditure_omm:
- data://garden/oecd/2025-02-25/social_expenditure
- data://garden/oecd/2025-03-07/social_expenditure_1985
- data://garden/social_expenditure/2025-03-07/lindert
data://grapher/social_expenditure/2025-03-07/social_expenditure_omm:
- data://garden/social_expenditure/2025-03-07/social_expenditure_omm

#
# Social transfers 1880-1930 (Lindert, 1994)
#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ dataset:
tables:
social_expenditure:
variables:
share_of_gdp:
share_gdp:
title: Social expenditure as a share of GDP - <<expenditure_source>> - <<spending_type>> - <<programme_type_category>> programs (<<programme_type>>)
unit: "% of GDP"
short_unit: "%"
Expand All @@ -131,7 +131,7 @@ tables:
numDecimalPlaces: 1
tolerance: 5

share_of_gov_expenditure:
share_gov_expenditure:
title: Social expenditure as a share of government expenditure - <<expenditure_source>> - <<spending_type>> - <<programme_type_category>> programs (<<programme_type>>)
unit: "% of government expenditure"
short_unit: "%"
Expand Down
4 changes: 2 additions & 2 deletions etl/steps/data/garden/oecd/2025-02-25/social_expenditure.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@

# Define indicator columns and their new names.
INDICATOR_COLUMNS = {
"Percentage of GDP": "share_of_gdp",
"Percentage of general government expenditure": "share_of_gov_expenditure",
"Percentage of GDP": "share_gdp",
"Percentage of general government expenditure": "share_gov_expenditure",
"US dollars per person, PPP converted": "usd_per_person_ppp",
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Government Spending


# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365
title: Social expenditure in the long run


tables:
social_expenditure_omm:
variables:
share_gdp:
title: Public social expenditure as a share of GDP
unit: "% of GDP"
short_unit: "%"
description_short: Public social expenditure divided by gross domestic product, expressed as a percentage.
description_key:
- "This indicator combines three different datasets: Lindert (1994), OECD (1985), and the OECD Social Expenditure Database (SOCX). We combine the two OECD datasets by using the implicit growth rate from the older series, so we can extend the series back to 1960. We also use the data from Lindert (1994) to extend the series to 1880."
description_from_producer: ""
processing_level: major
description_processing: |-
We extrapolated the data available from the OECD Social Expenditure Database (Government/compulsory schemes) using the earliest available observation from this dataset and applying the growth rates implied by the OECD (1985) data to obtain a series starting in 1960. These steps are necessary because the data in these years is not exactly the same for the two datasets due to changes in definitions and measurement, though the trends are consistent for common years (1970-1991).
We don't transform the data from Lindert (1994); the values are the same as in the original source.
presentation:
attribution_short: Lindert, OECD
title_public: Public social expenditure as a share of GDP
title_variant: Historical data
display:
name: Public social expenditure as a share of GDP
numDecimalPlaces: 1
tolerance: 5

Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
"""Load a meadow dataset and create a garden dataset."""

import owid.catalog.processing as pr
from owid.catalog import Table

from etl.helpers import PathFinder

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)

# Categories selected from the OECD Social Expenditure Database (SOCX) table.
# run() keeps only the rows whose column (named in each trailing comment) equals the constant.
EXPENDITURE_SOURCE_OECD = "Public"  # compared against "expenditure_source"
SPENDING_TYPE_OECD = "In-cash and in-kind spending"  # compared against "spending_type"
PROGRAMME_TYPE_OECD = "All"  # compared against "programme_type_category"


def run() -> None:
    """Build the `social_expenditure_omm` garden table.

    Combines three inputs (OECD SOCX, OECD 1985 and Lindert) into a single `share_gdp` column
    indexed by country and year, and saves it as a new garden dataset that inherits the
    metadata of the OECD SOCX dataset.
    """
    #
    # Load inputs.
    #
    # Load each input dataset declared for this step and read its table.
    ds_oecd = paths.load_dataset("social_expenditure")
    tb_socx = ds_oecd.read("social_expenditure")

    ds_oecd_1985 = paths.load_dataset("social_expenditure_1985")
    tb_1985 = ds_oecd_1985.read("social_expenditure_1985")

    ds_lindert = paths.load_dataset("lindert")
    tb_lindert = ds_lindert.read("lindert")

    #
    # Process data.
    #
    # From SOCX, keep only the rows matching the three categories defined at module level.
    selected_rows = (
        (tb_socx["expenditure_source"] == EXPENDITURE_SOURCE_OECD)
        & (tb_socx["spending_type"] == SPENDING_TYPE_OECD)
        & (tb_socx["programme_type_category"] == PROGRAMME_TYPE_OECD)
    )
    tb_socx = tb_socx[selected_rows].reset_index(drop=True)

    # Only the share of GDP is needed from SOCX.
    tb_socx = tb_socx[["country", "year", "share_gdp"]]

    # Remember which countries SOCX covers; the combined table is restricted to them below.
    socx_countries = list(tb_socx["country"].unique())

    # Outer-merge the three tables on country and year. Colliding columns coming from the
    # right-hand tables get the suffixes "_oecd_1985" and "_lindert"; SOCX columns keep their names.
    tb = pr.merge(tb_socx, tb_1985, on=["country", "year"], how="outer", suffixes=("", "_oecd_1985"))
    tb = pr.merge(tb, tb_lindert, on=["country", "year"], how="outer", suffixes=("", "_lindert"))

    # Only after merging, give the SOCX column its own suffix.
    tb = tb.rename(columns={"share_gdp": "share_gdp_oecd"})

    # Drop countries that are not present in SOCX.
    is_socx_country = tb["country"].isin(socx_countries)
    tb = tb[is_socx_country].reset_index(drop=True)

    # Create "share_gdp": SOCX values, extended with the growth of the OECD (1985) series.
    tb = create_estimations_from_growth(tb=tb, reference_var_suffix="_oecd_1985", to_adjust_var_suffix="_oecd")

    # Where "share_gdp" is still missing, use the Lindert values.
    tb["share_gdp"] = tb["share_gdp"].fillna(tb["share_gdp_lindert"])

    # Discard the intermediate, suffixed columns.
    tb = tb[["country", "year", "share_gdp"]]

    # Improve table format.
    tb = tb.format(["country", "year"], short_name="social_expenditure_omm")

    #
    # Save outputs.
    #
    # Create the garden dataset, using the OECD SOCX dataset metadata as default, and save it.
    ds_garden = paths.create_dataset(tables=[tb], default_metadata=ds_oecd.metadata)
    ds_garden.save()


def create_estimations_from_growth(tb: Table, reference_var_suffix: str, to_adjust_var_suffix: str) -> Table:
    """
    Extend a share-of-GDP series using the growth of a reference series.

    For each country, the first year in which both series have data is taken as the reference year.
    The estimate for any other year is

        to_adjust[reference year] * reference[year] / reference[reference year]

    The estimate is only used where the series to be adjusted has no value of its own.
    Countries without any year in common keep the series to be adjusted unchanged.

    Parameters
    ----------
    tb : Table
        Table with the columns "country", "year", f"share_gdp{reference_var_suffix}" and
        f"share_gdp{to_adjust_var_suffix}". It is not modified; a sorted copy is returned.
    reference_var_suffix : str
        Suffix of the reference variable (the one the growth is extracted from), e.g. "_oecd_1985".
    to_adjust_var_suffix : str
        Suffix of the variable to be adjusted (the one the growth is applied to), e.g. "_oecd".

    Returns
    -------
    tb : Table
        Table sorted by country and year, with the original columns plus "share_gdp"
        (overwritten if the input already had a column with that name).
    """
    reference_col = f"share_gdp{reference_var_suffix}"
    to_adjust_col = f"share_gdp{to_adjust_var_suffix}"

    # Save the original columns, so that the helper columns created below can be dropped at the end.
    columns_list = list(tb.columns)

    # Sort by country and year (this also creates a copy, so the caller's table is left untouched).
    tb = tb.sort_values(by=["country", "year"]).reset_index(drop=True)

    # Rows where both series have data.
    in_common = tb[reference_col].notnull() & tb[to_adjust_col].notnull()

    # The reference year is the first year in common, per country (missing if there is none).
    tb["years_in_common"] = tb["year"].where(in_common)
    tb["reference_year"] = tb.groupby("country")["years_in_common"].transform("min")

    # Build the reference-year mask once for the whole table, instead of once per country.
    is_reference_year = in_common & (tb["year"] == tb["reference_year"])

    # Value of each series in the reference year, broadcast to every row of the country.
    # Outside the reference year the helper column is empty, so "first" picks exactly that value.
    tb["reference_value"] = tb[reference_col].where(is_reference_year)
    tb["reference_value"] = tb.groupby("country")["reference_value"].transform("first")
    tb["to_adjust_value"] = tb[to_adjust_col].where(is_reference_year)
    tb["to_adjust_value"] = tb.groupby("country")["to_adjust_value"].transform("first")

    # The scalar is the reference series relative to its own value in the reference year.
    tb["share_gdp_scalar"] = tb[reference_col] / tb["reference_value"]

    # The estimate is the value to be adjusted in the reference year multiplied by the scalar.
    tb["share_gdp_estimated"] = tb["to_adjust_value"] * tb["share_gdp_scalar"]

    # Keep the observed values and use the estimates only where there is no observation.
    tb["share_gdp"] = tb[to_adjust_col].astype("Float64").fillna(tb["share_gdp_estimated"])

    # Keep only the original columns and the new variable.
    if "share_gdp" not in columns_list:
        columns_list.append("share_gdp")

    tb = tb[columns_list]

    return tb
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Load a garden dataset and create a grapher dataset."""

from etl.helpers import PathFinder

# Get paths and naming conventions for current step.
# run() uses this object both to load the garden dataset and to create the grapher dataset.
paths = PathFinder(__file__)


def run() -> None:
    """Create the `social_expenditure_omm` grapher dataset from the garden dataset of the same name.

    The garden table is passed through unchanged.
    """
    #
    # Load inputs.
    #
    # Load the garden dataset and read its table, without resetting the index.
    garden_dataset = paths.load_dataset("social_expenditure_omm")
    table = garden_dataset.read("social_expenditure_omm", reset_index=False)

    #
    # Save outputs.
    #
    # Create the grapher dataset, using the garden dataset metadata as default, and save it.
    grapher_dataset = paths.create_dataset(tables=[table], default_metadata=garden_dataset.metadata)
    grapher_dataset.save()
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def run() -> None:
tb = tb.drop(columns=["level_0"])

# Fill missing rows in year with the first non-missing value
tb["year"] = tb["year"].bfill()
tb["year"] = tb["year"].ffill()

# Drop the country value Unnamed: 0_level_0
tb = tb[tb["country"] != "Unnamed: 0_level_0"].reset_index(drop=True)
Expand Down

0 comments on commit b17735f

Please sign in to comment.