✨ OMM grapher step

owid · Mar 7, 2025 · b17735f · b17735f
1 parent ea222f7
commit b17735f
Show file tree

Hide file tree

Showing 7 changed files with 223 additions and 5 deletions.
diff --git a/dag/redistribution.yml b/dag/redistribution.yml
@@ -29,6 +29,16 @@ steps:
   data://grapher/oecd/2025-02-25/social_expenditure:
     - data://garden/oecd/2025-02-25/social_expenditure
 
+  #
+  # Social expenditure OMM
+  #
+  data://garden/social_expenditure/2025-03-07/social_expenditure_omm:
+    - data://garden/oecd/2025-02-25/social_expenditure
+    - data://garden/oecd/2025-03-07/social_expenditure_1985
+    - data://garden/social_expenditure/2025-03-07/lindert
+  data://grapher/social_expenditure/2025-03-07/social_expenditure_omm:
+    - data://garden/social_expenditure/2025-03-07/social_expenditure_omm
+
   #
   # Social transfers 1880-1930 (Lindert, 1994)
   #

diff --git a/etl/steps/data/garden/oecd/2025-02-25/social_expenditure.meta.yml b/etl/steps/data/garden/oecd/2025-02-25/social_expenditure.meta.yml
@@ -114,7 +114,7 @@ dataset:
 tables:
   social_expenditure:
     variables:
-      share_of_gdp:
+      share_gdp:
         title: Social expenditure as a share of GDP - <<expenditure_source>> - <<spending_type>> - <<programme_type_category>> programs (<<programme_type>>)
         unit: "% of GDP"
         short_unit: "%"
@@ -131,7 +131,7 @@ tables:
           numDecimalPlaces: 1
           tolerance: 5
 
-      share_of_gov_expenditure:
+      share_gov_expenditure:
         title: Social expenditure as a share of government expenditure - <<expenditure_source>> - <<spending_type>> - <<programme_type_category>> programs (<<programme_type>>)
         unit: "% of government expenditure"
         short_unit: "%"

diff --git a/etl/steps/data/garden/oecd/2025-02-25/social_expenditure.py b/etl/steps/data/garden/oecd/2025-02-25/social_expenditure.py
@@ -8,8 +8,8 @@
 
 # Define indicator columns and their new names.
 INDICATOR_COLUMNS = {
-    "Percentage of GDP": "share_of_gdp",
-    "Percentage of general government expenditure": "share_of_gov_expenditure",
+    "Percentage of GDP": "share_gdp",
+    "Percentage of general government expenditure": "share_gov_expenditure",
     "US dollars per person, PPP converted": "usd_per_person_ppp",
 }
 

diff --git a/etl/steps/data/garden/social_expenditure/2025-03-07/social_expenditure_omm.meta.yml b/etl/steps/data/garden/social_expenditure/2025-03-07/social_expenditure_omm.meta.yml
@@ -0,0 +1,40 @@
+# NOTE: To learn more about the fields, hover over their names.
+definitions:
+  common:
+    presentation:
+      topic_tags:
+        - Government Spending
+
+
+# Learn more about the available fields:
+# http://docs.owid.io/projects/etl/architecture/metadata/reference/
+dataset:
+  update_period_days: 365
+  title: Social expenditure in the long run
+
+
+tables:
+  social_expenditure_omm:
+    variables:
+      share_gdp:
+        title: Public social expenditure as a share of GDP
+        unit: "% of GDP"
+        short_unit: "%"
+        description_short: Public social expenditure divided by gross domestic product, expressed as a percentage.
+        description_key:
+          - "This indicator combines three different datasets: Lindert (1994), OECD (1985), and the OECD Social Expenditure Database (SOCX). We combine the two OECD datasets by using the implicit growth rate from the older series, so we can extend the series back to 1960. We also use the data from Lindert (1994) to extend the series to 1880."
+        description_from_producer: ""
+        processing_level: major
+        description_processing: |-
+          We extrapolated the data available from the OECD Social Expenditure Database (Government/compulsory schemes) using the earliest available observation from this dataset and applying the growth rates implied by the OECD (1985) data to obtain a series starting in 1960. These steps are necessary because the data in these years is not exactly the same for the two datasets due to changes in definitions and measurement, though the trends are consistent for common years (1970-1991).
+
+          We don't transform the data from Lindert (1994); the values are the same as in the original source.
+        presentation:
+          attribution_short: Lindert, OECD
+          title_public: Public social expenditure as a share of GDP
+          title_variant: Historical data
+        display:
+          name: Public social expenditure as a share of GDP
+          numDecimalPlaces: 1
+          tolerance: 5
+
diff --git a/etl/steps/data/garden/social_expenditure/2025-03-07/social_expenditure_omm.py b/etl/steps/data/garden/social_expenditure/2025-03-07/social_expenditure_omm.py
@@ -0,0 +1,142 @@
+"""Load garden datasets and create a combined garden dataset."""
+
+import owid.catalog.processing as pr
+from owid.catalog import Table
+
+from etl.helpers import PathFinder
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Define category to select from OECD Social Expenditure Database
+EXPENDITURE_SOURCE_OECD = "Public"
+SPENDING_TYPE_OECD = "In-cash and in-kind spending"
+PROGRAMME_TYPE_OECD = "All"
+
+
+def run() -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden datasets.
+    ds_oecd = paths.load_dataset("social_expenditure")
+    ds_oecd_1985 = paths.load_dataset("social_expenditure_1985")
+    ds_lindert = paths.load_dataset("lindert")
+
+    # Read tables from garden datasets.
+    tb_oecd = ds_oecd.read("social_expenditure")
+    tb_oecd_1985 = ds_oecd_1985.read("social_expenditure_1985")
+    tb_lindert = ds_lindert.read("lindert")
+
+    #
+    # Process data.
+    #
+    # Select the right categories from the OECD SOCX dataset
+    tb_oecd = tb_oecd[
+        (tb_oecd["expenditure_source"] == EXPENDITURE_SOURCE_OECD)
+        & (tb_oecd["spending_type"] == SPENDING_TYPE_OECD)
+        & (tb_oecd["programme_type_category"] == PROGRAMME_TYPE_OECD)
+    ].reset_index(drop=True)
+
+    # Keep only the necessary columns
+    tb_oecd = tb_oecd[["country", "year", "share_gdp"]]
+
+    # Save the countries available in the OECD dataset
+    countries_oecd = list(tb_oecd["country"].unique())
+
+    # Merge the three tables
+    tb = pr.merge(tb_oecd, tb_oecd_1985, on=["country", "year"], how="outer", suffixes=("", "_oecd_1985"))
+    tb = pr.merge(tb, tb_lindert, on=["country", "year"], how="outer", suffixes=("", "_lindert"))
+
+    # Rename share_gdp to share_gdp_oecd
+    tb = tb.rename(columns={"share_gdp": "share_gdp_oecd"})
+
+    # Keep only countries available in the OECD dataset
+    tb = tb[tb["country"].isin(countries_oecd)].reset_index(drop=True)
+
+    # Combine the two OECD series, by applying the growth of the 1985 series retroactively
+    tb = create_estimations_from_growth(tb=tb, reference_var_suffix="_oecd_1985", to_adjust_var_suffix="_oecd")
+
+    # Fill data from Lindert where there is no data in share_gdp
+    tb["share_gdp"] = tb["share_gdp"].fillna(tb["share_gdp_lindert"])
+
+    # Keep only the necessary columns
+    tb = tb[["country", "year", "share_gdp"]]
+
+    # Improve table format.
+    tb = tb.format(["country", "year"], short_name="social_expenditure_omm")
+
+    #
+    # Save outputs.
+    #
+    # Initialize a new garden dataset.
+    ds_garden = paths.create_dataset(tables=[tb], default_metadata=ds_oecd.metadata)
+
+    # Save garden dataset.
+    ds_garden.save()
+
+
+def create_estimations_from_growth(tb: Table, reference_var_suffix: str, to_adjust_var_suffix: str) -> Table:
+    """
+    Adjust estimations of variables according to the growth of a reference variable.
+
+    Parameters
+    ----------
+    tb : Table
+        Table that contains both the reference variable (the one the growth is extracted from) and the variable to be adjusted (the one the growth is applied to).
+    reference_var_suffix : str
+        Suffix of the reference variable (the one the growth is extracted from). In this project, "_oecd_1985".
+    to_adjust_var_suffix : str
+        Suffix of the variable to be adjusted (the one the growth is applied to). In this project, "_oecd".
+
+    Returns
+    -------
+    tb : Table
+        Table with the adjusted variables.
+    """
+
+    # Save the original columns
+    columns_list = list(tb.columns)
+
+    # Sort by country and year
+    tb = tb.sort_values(by=["country", "year"]).reset_index(drop=True)
+
+    # Define the first year in common between the two series, share_gdp{reference_var_suffix} and share_gdp{to_adjust_var_suffix}
+    # First, define all the years in common between the two series
+    tb["years_in_common"] = tb.loc[
+        tb[f"share_gdp{reference_var_suffix}"].notnull() & tb[f"share_gdp{to_adjust_var_suffix}"].notnull(), "year"
+    ]
+
+    # Define the first year in common
+    tb["reference_year"] = tb.groupby("country")["years_in_common"].transform("min")
+
+    # Get value from the reference variable in the reference year
+    tb["reference_value"] = tb.groupby("country")[f"share_gdp{reference_var_suffix}"].transform(
+        lambda x: x.loc[tb["year"] == tb["reference_year"]].iloc[0]
+        if not x.loc[tb["year"] == tb["reference_year"]].empty
+        else None
+    )
+
+    # The scalar is the reference variable divided by its value in the reference year. This is the growth that will be applied retroactively to the variable to be adjusted.
+    tb["share_gdp_scalar"] = tb[f"share_gdp{reference_var_suffix}"] / tb["reference_value"]
+
+    # Get value to be adjusted in the reference year
+    tb["to_adjust_value"] = tb.groupby("country")[f"share_gdp{to_adjust_var_suffix}"].transform(
+        lambda x: x.loc[tb["year"] == tb["reference_year"]].iloc[0]
+        if not x.loc[tb["year"] == tb["reference_year"]].empty
+        else None
+    )
+
+    # The estimated values are the product of the to-adjust value in the reference year and the scalars. This effectively extends the variable to be adjusted using the growth of the reference variable.
+    tb["share_gdp_estimated"] = tb["to_adjust_value"] * tb["share_gdp_scalar"]
+
+    # Build the final variable (without suffix): keep the values of the variable to be adjusted and fill its gaps with the estimated values
+    tb["share_gdp"] = tb[f"share_gdp{to_adjust_var_suffix}"].astype("Float64").fillna(tb["share_gdp_estimated"])
+
+    # Keep only the original columns, plus the new share_gdp column
+    if "share_gdp" not in columns_list:
+        columns_list.append("share_gdp")
+
+    tb = tb[columns_list]
+
+    return tb
diff --git a/etl/steps/data/grapher/social_expenditure/2025-03-07/social_expenditure_omm.py b/etl/steps/data/grapher/social_expenditure/2025-03-07/social_expenditure_omm.py
@@ -0,0 +1,26 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+from etl.helpers import PathFinder
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run() -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset.
+    ds_garden = paths.load_dataset("social_expenditure_omm")
+
+    # Read table from garden dataset.
+    tb = ds_garden.read("social_expenditure_omm", reset_index=False)
+
+    #
+    # Save outputs.
+    #
+    # Initialize a new grapher dataset.
+    ds_grapher = paths.create_dataset(tables=[tb], default_metadata=ds_garden.metadata)
+
+    # Save grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/oecd/2025-03-07/social_expenditure_1985.py b/etl/steps/data/meadow/oecd/2025-03-07/social_expenditure_1985.py
@@ -42,7 +42,7 @@ def run() -> None:
     tb = tb.drop(columns=["level_0"])
 
     # Fill missing rows in year with the first non-missing value
-    tb["year"] = tb["year"].bfill()
+    tb["year"] = tb["year"].ffill()
 
     # Drop the country value Unnamed: 0_level_0
     tb = tb[tb["country"] != "Unnamed: 0_level_0"].reset_index(drop=True)