From eee12bcd9a203ea2913c0119e9213972310b06fb Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 5 Mar 2025 11:45:55 +0100 Subject: [PATCH 01/10] =?UTF-8?q?=F0=9F=93=8A=20update=20eiu=20data?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From e46995e9d0fa21456a189034df2754724fdb2cef Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 5 Mar 2025 11:48:50 +0100 Subject: [PATCH 02/10] =?UTF-8?q?=E2=9C=A8=20Add=20show=5Ftime=20option=20?= =?UTF-8?q?to=20spinners=20in=20dashboard=20operations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- apps/wizard/app_pages/dashboard/operations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/wizard/app_pages/dashboard/operations.py b/apps/wizard/app_pages/dashboard/operations.py index 71e6b876a36..1c8c1414894 100644 --- a/apps/wizard/app_pages/dashboard/operations.py +++ b/apps/wizard/app_pages/dashboard/operations.py @@ -68,7 +68,7 @@ def render_action_update(): st.error("The update command is not available in production. Update steps locally or in staging.") st.stop() else: - with st.spinner("Executing step updater..."): + with st.spinner("Executing step updater...", show_time=True): # TODO: It would be better to directly use StepUpdater instead of a subprocess. command = ( "etl update " @@ -132,7 +132,7 @@ def render_action_execute(steps_df: pd.DataFrame): st.error("Running the ETL is not available in production. Run them locally or in staging.") st.stop() else: - with st.spinner("Executing ETL..."): + with st.spinner("Executing ETL...", show_time=True): command = _define_command_to_execute_snapshots_and_etl_steps( steps_df=steps_df, dry_run=dry_run_etl, @@ -181,7 +181,7 @@ def render_action_archive(): st.error("Archiving is not available in production. 
Run them locally or in staging.") st.stop() else: - with st.spinner("Archiving steps..."): + with st.spinner("Archiving steps...", show_time=True): command = "etl archive " + " ".join(st.session_state.selected_steps) + " --non-interactive" if dry_run_archive: command += " --dry-run" From bebe5c0d2467fc046adb84581fe5149ad512a3cb Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 5 Mar 2025 11:50:03 +0100 Subject: [PATCH 03/10] wip --- dag/democracy.yml | 12 + .../democracy/2025-03-05/eiu.countries.json | 181 +++++++ .../garden/democracy/2025-03-05/eiu.meta.yml | 116 +++++ .../data/garden/democracy/2025-03-05/eiu.py | 297 +++++++++++ .../garden/democracy/2025-03-05/shared.py | 489 ++++++++++++++++++ .../data/grapher/democracy/2025-03-05/eiu.py | 34 ++ .../data/meadow/democracy/2025-03-05/eiu.py | 123 +++++ 7 files changed, 1252 insertions(+) create mode 100644 etl/steps/data/garden/democracy/2025-03-05/eiu.countries.json create mode 100644 etl/steps/data/garden/democracy/2025-03-05/eiu.meta.yml create mode 100644 etl/steps/data/garden/democracy/2025-03-05/eiu.py create mode 100644 etl/steps/data/garden/democracy/2025-03-05/shared.py create mode 100644 etl/steps/data/grapher/democracy/2025-03-05/eiu.py create mode 100644 etl/steps/data/meadow/democracy/2025-03-05/eiu.py diff --git a/dag/democracy.yml b/dag/democracy.yml index e8577f42133..f5a97e6fa76 100644 --- a/dag/democracy.yml +++ b/dag/democracy.yml @@ -88,3 +88,15 @@ steps: - data://garden/demography/2023-03-31/population data://grapher/democracy/2024-05-22/eiu: - data://garden/democracy/2024-03-07/eiu + # EIU (2024) + data://meadow/democracy/2025-03-05/eiu: + - snapshot://democracy/2024-05-22/eiu_gapminder.csv + - snapshot://democracy/2024-05-22/eiu_2023.csv + - snapshot://democracy/2024-05-22/eiu_2021.csv + - snapshot://democracy/2024-05-22/eiu_2022.csv + data://garden/democracy/2025-03-05/eiu: + - data://garden/demography/2024-07-15/population + - data://meadow/democracy/2025-03-05/eiu + - 
data://garden/regions/2023-01-01/regions + data://grapher/democracy/2025-03-05/eiu: + - data://garden/democracy/2025-03-05/eiu diff --git a/etl/steps/data/garden/democracy/2025-03-05/eiu.countries.json b/etl/steps/data/garden/democracy/2025-03-05/eiu.countries.json new file mode 100644 index 00000000000..e52e0d68bb3 --- /dev/null +++ b/etl/steps/data/garden/democracy/2025-03-05/eiu.countries.json @@ -0,0 +1,181 @@ +{ + "Afghanistan": "Afghanistan", + "Albania": "Albania", + "Algeria": "Algeria", + "Angola": "Angola", + "Argentina": "Argentina", + "Armenia": "Armenia", + "Australia": "Australia", + "Austria": "Austria", + "Azerbaijan": "Azerbaijan", + "Bahrain": "Bahrain", + "Bangladesh": "Bangladesh", + "Belarus": "Belarus", + "Belgium": "Belgium", + "Benin": "Benin", + "Bhutan": "Bhutan", + "Bolivia": "Bolivia", + "Bosnia and Herzegovina": "Bosnia and Herzegovina", + "Botswana": "Botswana", + "Brazil": "Brazil", + "Bulgaria": "Bulgaria", + "Burkina Faso": "Burkina Faso", + "Burundi": "Burundi", + "Cabo Verde": "Cape Verde", + "Cambodia": "Cambodia", + "Cameroon": "Cameroon", + "Canada": "Canada", + "Cape Verde": "Cape Verde", + "Central African Republic": "Central African Republic", + "Chad": "Chad", + "Chile": "Chile", + "China": "China", + "Colombia": "Colombia", + "Comoros": "Comoros", + "Congo, Dem. 
Rep.": "Democratic Republic of Congo", + "Congo, Rep.": "Congo", + "Costa Rica": "Costa Rica", + "Cote d'Ivoire": "Cote d'Ivoire", + "Croatia": "Croatia", + "Cuba": "Cuba", + "Cyprus": "Cyprus", + "Czech Republic": "Czechia", + "C\u00f4te d'Ivoire": "Cote d'Ivoire", + "C\u00f4te d\u2019Ivoire": "Cote d'Ivoire", + "Democratic Republic of Congo": "Democratic Republic of Congo", + "Denmark": "Denmark", + "Djibouti": "Djibouti", + "Dominican Republic": "Dominican Republic", + "Ecuador": "Ecuador", + "Egypt": "Egypt", + "El Salvador": "El Salvador", + "Equatorial Guinea": "Equatorial Guinea", + "Eritrea": "Eritrea", + "Estonia": "Estonia", + "Eswatini": "Eswatini", + "Ethiopia": "Ethiopia", + "Fiji": "Fiji", + "Finland": "Finland", + "France": "France", + "Gabon": "Gabon", + "Gambia": "Gambia", + "Georgia": "Georgia", + "Germany": "Germany", + "Ghana": "Ghana", + "Greece": "Greece", + "Guatemala": "Guatemala", + "Guinea": "Guinea", + "Guinea-Bissau": "Guinea-Bissau", + "Guyana": "Guyana", + "Haiti": "Haiti", + "Honduras": "Honduras", + "Hong Kong": "Hong Kong", + "Hong Kong, China": "Hong Kong", + "Hungary": "Hungary", + "Iceland": "Iceland", + "India": "India", + "Indonesia": "Indonesia", + "Iran": "Iran", + "Iraq": "Iraq", + "Ireland": "Ireland", + "Israel": "Israel", + "Italy": "Italy", + "Jamaica": "Jamaica", + "Japan": "Japan", + "Jordan": "Jordan", + "Kazakhstan": "Kazakhstan", + "Kenya": "Kenya", + "Kuwait": "Kuwait", + "Kyrgyz Republic": "Kyrgyzstan", + "Lao": "Laos", + "Laos": "Laos", + "Latvia": "Latvia", + "Lebanon": "Lebanon", + "Lesotho": "Lesotho", + "Liberia": "Liberia", + "Libya": "Libya", + "Lithuania": "Lithuania", + "Luxembourg": "Luxembourg", + "Macedonia, FYR": "North Macedonia", + "Madagascar": "Madagascar", + "Malawi": "Malawi", + "Malaysia": "Malaysia", + "Mali": "Mali", + "Malta": "Malta", + "Mauritania": "Mauritania", + "Mauritius": "Mauritius", + "Mexico": "Mexico", + "Moldova": "Moldova", + "Mongolia": "Mongolia", + "Montenegro": 
"Montenegro", + "Morocco": "Morocco", + "Mozambique": "Mozambique", + "Myanmar": "Myanmar", + "Namibia": "Namibia", + "Nepal": "Nepal", + "Netherlands": "Netherlands", + "New Zealand": "New Zealand", + "Nicaragua": "Nicaragua", + "Niger": "Niger", + "Nigeria": "Nigeria", + "North Korea": "North Korea", + "North Macedonia": "North Macedonia", + "Norway": "Norway", + "Oman": "Oman", + "Pakistan": "Pakistan", + "Palestine": "Palestine", + "Panama": "Panama", + "Papua New Guinea": "Papua New Guinea", + "Paraguay": "Paraguay", + "Peru": "Peru", + "Philippines": "Philippines", + "Poland": "Poland", + "Portugal": "Portugal", + "Qatar": "Qatar", + "Romania": "Romania", + "Russia": "Russia", + "Rwanda": "Rwanda", + "Saudi Arabia": "Saudi Arabia", + "Senegal": "Senegal", + "Serbia": "Serbia", + "Sierra Leone": "Sierra Leone", + "Singapore": "Singapore", + "Slovak Republic": "Slovakia", + "Slovakia": "Slovakia", + "Slovenia": "Slovenia", + "South Africa": "South Africa", + "South Korea": "South Korea", + "Spain": "Spain", + "Sri Lanka": "Sri Lanka", + "Sudan": "Sudan", + "Suriname": "Suriname", + "Swaziland": "Eswatini", + "Sweden": "Sweden", + "Switzerland": "Switzerland", + "Syria": "Syria", + "Taiwan": "Taiwan", + "Tajikistan": "Tajikistan", + "Tanzania": "Tanzania", + "Thailand": "Thailand", + "Timor-Leste": "East Timor", + "Togo": "Togo", + "Trinidad and Tobago": "Trinidad and Tobago", + "Tunisia": "Tunisia", + "Turkey": "Turkey", + "Turkmenistan": "Turkmenistan", + "Uganda": "Uganda", + "Ukraine": "Ukraine", + "United Arab Emirates": "United Arab Emirates", + "United Kingdom": "United Kingdom", + "United States": "United States", + "United States of America": "United States", + "Uruguay": "Uruguay", + "Uzbekistan": "Uzbekistan", + "Venezuela": "Venezuela", + "Vietnam": "Vietnam", + "Yemen": "Yemen", + "Zambia": "Zambia", + "Zimbabwe": "Zimbabwe", + "Bosnia and Hercegovina": "Bosnia and Herzegovina", + "Congo (Brazzaville)": "Congo" +} \ No newline at end of file diff 
--git a/etl/steps/data/garden/democracy/2025-03-05/eiu.meta.yml b/etl/steps/data/garden/democracy/2025-03-05/eiu.meta.yml new file mode 100644 index 00000000000..320997b8c70 --- /dev/null +++ b/etl/steps/data/garden/democracy/2025-03-05/eiu.meta.yml @@ -0,0 +1,116 @@ +# NOTE: To learn more about the fields, hover over their names. +definitions: + common: + presentation: + topic_tags: + - Democracy + + +# Learn more about the available fields: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +dataset: + update_period_days: 365 + title: EIU Democratic Index (2006-2023) + description: |- + The Economist Intelligence Unit's (EIU) Democracy Index provides a snapshot of the state of democracy in 165 independent states and two territories. This covers almost the entire population of the world and the vast majority of the world's states (microstates are excluded). Scored on a 0-10 scale, the Democracy Index is based on five categories: electoral process and pluralism, functioning of government, political participation, political culture, and civil liberties. Based on its scores on a range of indicators within these categories, each country is classified as one of four types of regime: "full democracy", "flawed democracy", "hybrid regime" or "authoritarian regime". + + This dataset was constructed using data collected from various EIU yearly reports and a multi-year compilation by Gapminder. + + +tables: + eiu: + common: + presentation: + attribution: Economist Intelligence Unit (2006-2023) + + variables: + + democracy_eiu: + title: Democracy score + description_short: |- + Extent to which citizens can choose their political leaders in free and fair elections, enjoy civil liberties, prefer democracy over other political systems, can and do participate in politics, and have a functioning government that acts on their behalf. It ranges from 0 to 10 (most democratic). 
+ description_processing: |- + Values for continents have been obtained by averaging the values of the countries in the continent. + unit: "" + + elect_freefair_eiu: + title: Free and fair elections + description_short: |- + Extent to which all citizens can choose their political leaders in free and fair elections. Higher values indicate more pluralism. + unit: "" + + funct_gov_eiu: + title: Functioning government + description_short: |- + Extent to which citizens have a functioning government that acts on their behalf. It ranges from 0 to 10 (most effective). + unit: "" + + pol_part_eiu: + title: Political participation + description_short: |- + Extent to which citizens can and do participate in politics. It ranges from 0 to 10 (most active). + unit: "" + + dem_culture_eiu: + title: Democratic culture + description_short: |- + Extent to which citizens prefer democracy over other political systems. It ranges from 0 to 10 (strongest preference for democracy). + unit: "" + + civlib_eiu: + title: Civil liberties + description_short: |- + Extent to which citizens enjoy civil liberties. It ranges from 0 to 10 (most liberties). + unit: "" + + regime_eiu: + title: Regime + unit: "" + description_short: |- + Political regime of a country. It distinguishes between authoritarian regimes (score 0), hybrid regimes (score 1), flawed democracies (score 2), and full democracies (score 3). + description_key: + - Full democracies have comprehensive democratic features, with a high extent to which citizens can choose their political leaders in free and fair elections, enjoy civil liberties, prefer democracy over other political systems, can and do participate in politics, and have a functioning government that acts on their behalf. + - Flawed democracies have some weaknesses in democratic institutions and culture. + - Hybrid regimes have serious weaknesses in democratic institutions and culture. 
+ - Authoritarian regimes have few democratic institutions and little democratic culture. + + num_countries: + common: + presentation: + attribution: Economist Intelligence Unit (2006-2023) + + variables: + num_regime_eiu: + title: |- + <%- if category == '-1' -%> + Number of countries with unknown regime + <%- else -%> + Number of << category.replace('cracy', 'cracies').replace('regime', 'regimes') >> + <%- endif -%> + unit: "countries" + + num_people: + common: + presentation: + attribution: Economist Intelligence Unit (2006-2023); Population based on various sources (2023) + variables: + pop_regime_eiu: + title: |- + <%- if category == '-1' -%> + Number of people living in countries with unknown regime + <%- else -%> + Number of people living in << category.replace('cracy', 'cracies').replace('regime', 'regimes') >> + <%- endif -%> + description_short: "Number of countries with available data." + unit: "people" + + + avg_pop: + common: + presentation: + attribution: Economist Intelligence Unit (2006-2023); Population based on various sources (2023) + variables: + democracy_eiu_weighted: + title: "Democracy score (population-weighted)" + description_short: Average democratic status, weighted by population. 
+ unit: "" diff --git a/etl/steps/data/garden/democracy/2025-03-05/eiu.py b/etl/steps/data/garden/democracy/2025-03-05/eiu.py new file mode 100644 index 00000000000..5c67de93a7a --- /dev/null +++ b/etl/steps/data/garden/democracy/2025-03-05/eiu.py @@ -0,0 +1,297 @@ +"""Load a meadow dataset and create a garden dataset.""" + +from typing import Tuple, cast + +import numpy as np +import pandas as pd +from owid.catalog import Dataset, Table +from owid.catalog.tables import concat +from shared import ( + add_population_in_dummies, + add_regions_and_global_aggregates, + expand_observations, + from_wide_to_long, + make_table_with_dummies, +) + +from etl.data_helpers import geo +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) +# Missing classifications of states +REGIONS = { + "Africa": {}, + "Asia": {}, + "North America": {}, + "South America": {}, + "Europe": {}, + "Oceania": {}, +} +# Year range +YEAR_MIN = 2006 +YEAR_MAX = 2023 + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load meadow dataset. + ds_meadow = paths.load_dataset("eiu") + ds_regions = paths.load_dataset("regions") + ds_population = paths.load_dataset("population") + + # Read table from meadow dataset. + tb = ds_meadow["eiu"].reset_index() + + # + # Process data. + # + tb = geo.harmonize_countries( + df=tb, + countries_file=paths.country_mapping_path, + ) + + # Remove years with interpolated data (2007 and 2009 are interpolated by Gapminder) + tb = tb[~tb["year"].isin([2007, 2009])] + + # Drop rank column + tb = tb.drop(columns=["rank_eiu"]) + tb = cast(Table, tb) + + tb = add_regime_identifier(tb) + + ################################################## + # AGGREGATES + # Get country-count-related data: country-averages, number of countries, ... + tb_num_countries, tb_avg_countries = get_country_data(tb, ds_regions) + + # Get population-related data: population-weighed averages, people livin in ... 
+ tb_num_people, tb_avg_w_countries = get_population_data(tb, ds_regions, ds_population) + ################################################## + + # Add regions to main table + tb = concat([tb, tb_avg_countries], ignore_index=True) + + # + # Save outputs. + # + tables = [ + tb.format(["country", "year"]), + tb_num_countries.format(["country", "year", "category"], short_name="num_countries"), + tb_num_people.format(["country", "year", "category"], short_name="num_people"), + tb_avg_w_countries.format(["country", "year"], short_name="avg_pop"), + ] + + # Create a new garden dataset with the same metadata as the meadow dataset. + ds_garden = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata + ) + + # Save changes in the new garden dataset. + ds_garden.save() + + +def add_regime_identifier(tb: Table) -> Table: + """Create regime identifier.""" + # `regime_eiu`: Categorise democracy_eiu into 4 groups + bins = [ + -0.01, + 4, + 6, + 8, + 10, + ] + labels = [ + 0, + 1, + 2, + 3, + ] + tb["regime_eiu"] = pd.cut(tb["democracy_eiu"], bins=bins, labels=labels) + + # Add metadata + tb["regime_eiu"] = tb["regime_eiu"].copy_metadata(tb["democracy_eiu"]) + return tb + + +def get_country_data(tb: Table, ds_regions: Dataset) -> Tuple[Table, Table]: + """Estimate number of countries in each regime, and country-average for some indicators. 
+ + Returns two tables: + + 1) tb_num_countres: Counts countries in different regimes + regime_eiu (counts) + - Number of authoritarian regimes + - Number of hybrid regimes + - Number of flawed democracies + - Number of full democracies + + 2) tb_avg_countries: Country-average for some indicators + - democracy_eiu (country-average) + + """ + # 1/ COUNT COUNTRIES + # Keep only non-imputed data + tb_num = tb.copy() + + # Set INTs + tb_num = tb_num.astype( + { + "regime_eiu": "Int64", + } + ) + tb_num = cast(Table, tb_num) + + # Define columns on which we will estimate (i) "number of countries" and (ii) "number of people living in ..." + indicators = [ + { + "name": "regime_eiu", + "name_new": "num_regime_eiu", + "values_expected": { + "0": "authoritarian regime", + "1": "hybrid regime", + "2": "flawed democracy", + "3": "full democracy", + }, + "has_na": False, + }, + ] + + # Column per indicator-dimension + tb_num = make_table_with_dummies(tb_num, indicators) + + # Add regions and global aggregates + tb_num = add_regions_and_global_aggregates(tb_num, ds_regions) + tb_num = from_wide_to_long(tb_num) + + # 2/ COUNTRY-AVERAGE INDICATORS + tb_avg = tb.copy() + indicators_avg = ["democracy_eiu"] + + # Keep only relevant columns + tb_avg = tb_avg.loc[:, ["year", "country"] + indicators_avg] + + # Estimate region aggregates + tb_avg = add_regions_and_global_aggregates( + tb=tb_avg, + ds_regions=ds_regions, + aggregations={k: "mean" for k in indicators_avg}, # type: ignore + aggregations_world={k: np.mean for k in indicators_avg}, # type: ignore + ) + + # Keep only certain year range + # tb_avg = tb_avg.loc[tb_avg["year"].between(YEAR_AGG_MIN, YEAR_AGG_MAX)] + + return tb_num, tb_avg + + +def get_population_data(tb: Table, ds_regions: Dataset, ds_population: Dataset) -> Tuple[Table, Table]: + """Estimate people living in each regime, and population-weighted averages for some indicators. 
+ + 1) tb_num_people: People living in different regimes + regime_eiu + - Number of people living in authoritarian regimes + - Number of people living in hybrid regimes + - Number of people living in flawed democracies + - Number of people living in full democracies + - Number of people living in countries with unknown regime + + 2) tb_avg_w_countries: Population-weighted-average for some indicators + - democracy_eiu + + """ + # 1/ COUNT PEOPLE + # Keep only non-imputed data + tb_ppl = tb.copy() + + # Set INTs + tb_ppl = tb_ppl.astype( + { + "regime_eiu": "Int64", + } + ) + tb_ppl = cast(Table, tb_ppl) + + indicators = [ + { + "name": "regime_eiu", + "name_new": "pop_regime_eiu", + "values_expected": { + "0": "authoritarian regime", + "1": "hybrid regime", + "2": "flawed democracy", + "3": "full democracy", + }, + "has_na": True, + }, + ] + + ## Get missing years (not to miss anyone!) -- Note that this can lead to country overlaps (e.g. USSR and Latvia) + tb_ppl = expand_observations_without_duplicates(tb_ppl, ds_regions) + print(f"{tb.shape} -> {tb_ppl.shape}") + + # Column per indicator-dimension + tb_ppl = make_table_with_dummies(tb_ppl, indicators) + + # Replace USSR -> current states + # tb_ppl = replace_ussr(tb_ppl, ds_regions) + + ## Counts + tb_ppl = add_population_in_dummies(tb_ppl, ds_population) + tb_ppl = add_regions_and_global_aggregates(tb_ppl, ds_regions) + tb_ppl = from_wide_to_long(tb_ppl) + + # 2/ COUNTRY-AVERAGE INDICATORS + tb_avg = tb.copy() + indicators_avg = ["democracy_eiu"] + + # Keep only relevant columns + tb_avg = tb_avg.loc[:, ["year", "country"] + indicators_avg] + + # Add population in dummies (population value replaces 1, 0 otherwise) + tb_avg = add_population_in_dummies( + tb_avg, + ds_population, + drop_population=False, + ) + + # Get region aggregates + tb_avg = add_regions_and_global_aggregates( + tb=tb_avg, + ds_regions=ds_regions, + aggregations={k: "sum" for k in indicators_avg} | {"population": "sum"}, # type: ignore + min_num_values_per_year=1, + ) + + # Normalize by region's population +
columns_index = ["year", "country"] + columns_indicators = [col for col in tb_avg.columns if col not in columns_index + ["population"]] + tb_avg[columns_indicators] = tb_avg[columns_indicators].div(tb_avg["population"], axis=0) + tb_avg = tb_avg.drop(columns="population") + + # Keep only certain year range + # tb_avg = tb_avg.loc[tb_avg["year"].between(YEAR_AGG_MIN, YEAR_AGG_MAX)] + + tb_avg = tb_avg.rename( + columns={ + "democracy_eiu": "democracy_eiu_weighted", + } + ) + return tb_ppl, tb_avg + + +def expand_observations_without_duplicates(tb: Table, ds_regions: Dataset) -> Table: + # Get list of regions + tb_regions = ds_regions["regions"] + countries = set(tb_regions.loc[(tb_regions["region_type"] == "country") & ~(tb_regions["is_historical"]), "name"]) + countries |= set(tb["country"]) + + # Full expansion + tb_exp = expand_observations(tb, countries) + + # Limit years + tb_exp = tb_exp.loc[tb_exp["year"].isin(range(YEAR_MIN, YEAR_MAX + 1, 2))] + + return tb_exp diff --git a/etl/steps/data/garden/democracy/2025-03-05/shared.py b/etl/steps/data/garden/democracy/2025-03-05/shared.py new file mode 100644 index 00000000000..9bcc0b6413a --- /dev/null +++ b/etl/steps/data/garden/democracy/2025-03-05/shared.py @@ -0,0 +1,489 @@ +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, cast + +import numpy as np +import pandas as pd +import yaml +from owid.catalog import Dataset, Table +from owid.catalog.tables import concat + +from etl.data_helpers import geo + +SEPARATOR = "." 
+# REGION AGGREGATES +REGIONS = { + "Africa": { + "additional_members": [ + "Somaliland", + "Zanzibar", + ] + }, + "Asia": { + "additional_members": [ + "Palestine/Gaza", + "Palestine/West Bank", + ] + }, + "North America": {}, + "South America": {}, + "Europe": { + "additional_members": [ + "Baden", + "Bavaria", + "Brunswick", + "Duchy of Nassau", + "Hamburg", + "Hanover", + "Hesse Electoral", + "Hesse Grand Ducal", + "Mecklenburg Schwerin", + "Modena", + "Oldenburg", + "Parma", + "Piedmont-Sardinia", + "Saxe-Weimar-Eisenach", + "Saxony", + "Tuscany", + "Two Sicilies", + "Wurttemberg", + ] + }, + "Oceania": {}, +} + + +def from_wide_to_long( + tb: Table, + indicator_name_callback: Optional[Callable] = None, + indicator_category_callback: Optional[Callable] = None, + column_dimension_name: str = "category", + separator: str = SEPARATOR, +) -> Table: + """Format a particular shape of table from wide to long format. + + tb: Table with wide format. + indicator_name_callback: Function to extract the indicator name from the column name. + indicator_category_callback: Function to extract the indicator category from the column name. 
+ + If no `indicator_name_callback` and `indicator_category_callback` are provided, it proceed expects the following input: + + | year | country | indicator_a_1 | indicator_a_2 | indicator_b_1 | indicator_b_2 | + |------|---------|---------------|---------------|---------------|---------------| + | 2000 | USA | 1 | 2 | 3 | 4 | + | 2000 | CAN | 5 | 6 | 7 | 8 | + + and then generates the output: + + | year | country | category | indicator_a | indicator_b | + |------|---------|------------|-------------|-------------| + | 2000 | USA | category_1 | 1 | 3 | + | 2000 | USA | category_2 | 2 | 4 | + """ + tb_ = tb.copy() + + # Melt the DataFrame to long format + tb_ = tb_.melt(id_vars=["year", "country"], var_name="indicator_type", value_name="value") + + # Get callables + if indicator_name_callback is None: + + def default_indicator_name(x): + assert len(x.split(separator)) == 2 + return x.split(separator)[0] + + indicator_name_callback = default_indicator_name + + if indicator_category_callback is None: + + def default_indicator_category(x): + assert len(x.split(separator)) == 2 + return x.split(separator)[-1] + + indicator_category_callback = default_indicator_category + + # Extract indicator names and types + tb_["indicator"] = tb_["indicator_type"].apply(indicator_name_callback) + tb_[column_dimension_name] = tb_["indicator_type"].apply(indicator_category_callback) + + # Drop the original 'indicator_type' column as it's no longer needed + tb_.drop("indicator_type", axis=1, inplace=True) + + # Pivot the table to get 'indicator_a' and 'indicator_b' as separate columns + tb_ = tb_.pivot(index=["year", "country", column_dimension_name], columns="indicator", values="value").reset_index() + + # Rename the columns to match your requirements + tb_.columns.name = None # Remove the hierarchy + + return tb_ + + +def expand_observations(tb: Table, regions: Set | None = None) -> Table: + """Expand to have a row per (year, country).""" + # Add missing years for each triplet 
("warcode", "campcode", "ccode") + + # List of countries + if regions is None: + regions = set(tb["country"]) + + # List of possible years + years = np.arange(tb["year"].min(), tb["year"].max() + 1) + + # New index + new_idx = pd.MultiIndex.from_product([years, regions], names=["year", "country"]) + + # Reset index + tb = tb.set_index(["year", "country"]).reindex(new_idx).reset_index() + + # Type of `year` + tb["year"] = tb["year"].astype("int") + return tb + + +def add_population_in_dummies( + tb: Table, + ds_population: Dataset, + expected_countries_without_population: Optional[List[str]] = None, + drop_population: bool = True, +): + # Add population column + tb = geo.add_population_to_table( + tb, + ds_population, + interpolate_missing_population=True, + expected_countries_without_population=expected_countries_without_population, + ) + tb = cast(Table, tb.dropna(subset="population")) + # Add metadata (origins combined indicator+population) + cols = [col for col in tb.columns if col not in ["year", "country", "population"]] + meta = {col: tb[col].metadata for col in cols} | {"population": tb["population"].metadata} + ## Encode population in indicators: Population if 1, 0 otherwise + tb[cols] = tb[cols].multiply(tb["population"], axis=0) + if drop_population: + tb = tb.drop(columns="population") + ## Add metadata back (combine origins from population) + for col in cols: + metadata = meta[col] + metadata.origins += meta["population"].origins + tb[col].metadata = meta[col] + + return tb + + +def make_table_with_dummies( + tb: Table, + indicators: List[Dict[str, Any]], + separator: str = SEPARATOR, +) -> Table: + """Format table to have dummy indicators. + + From a table with categorical indicators, create a new table with dummy indicator for each indicator-category pair. 
+ + Example input: + + | year | country | regime | regime_amb | + |------|---------|-----------|------------| + | 2000 | USA | 1 | 0 | + | 2000 | CAN | 0 | 1 | + | 2000 | DEU | NaN | NaN | + + + Example output: + + | year | country | regime.0 | regime.1 | regime.-1 | regime_amb.0 | regime_amb.1 | regime_amb.-1 | + |------|---------|----------|----------|-----------|--------------|--------------|---------------| + | 2000 | USA | 0 | 1 | 0 | 1 | 0 | 0 | + | 2000 | CAN | 1 | 0 | 0 | 0 | 1 | 0 | + | 2000 | DEU | 0 | 0 | 1 | 0 | 0 | 1 | + + Note that '-1' denotes NA (missing value) category. + + The argument `indicators` contains the indicators for which we will create dummies, along with other associated parameters. Example: + + { + "name": "regime_amb_row_owid", + "name_new": "num_countries_regime_amb", + # "values_expected": set(map(str, range(10))), + "values_expected": { + "0": "closed autocracy", + "1": "closed (maybe electoral) autocracy", + "2": "electoral (maybe closed) autocracy", + "3": "electoral autocracy", + "4": "electoral autocracy (maybe electoral democracy)", + "5": "electoral democracy (maybe electoral autocracy)", + "6": "electoral democracy", + "7": "electoral democracy (maybe liberal democracy)", + "8": "liberal democracy (maybe electoral democracy)", + "9": "liberal democracy", + }, + "has_na": True, + } + """ + tb_ = tb.copy() + + # Convert to string + indicator_names = [indicator["name"] for indicator in indicators] + tb_[indicator_names] = tb_[indicator_names].astype("string") + + # Sanity check that the categories for each indicator are as expected + for indicator in indicators: + values_expected = indicator["values_expected"] + # Check and fix NA (convert NAs to -1 category) + if indicator["has_na"]: + # Assert that there are actually NaNs + assert tb_[indicator["name"]].isna().any(), f"No NA found in {indicator['name']}!" + # If NA, we should not have category '-1', otherwise these would get merged!
+ assert "-1" not in set( + tb_[indicator["name"]].unique() + ), f"Error for indicator `{indicator['name']}`. Found -1, which is not allowed when `has_na=True`!" + tb_[indicator["name"]] = tb_[indicator["name"]].fillna("-1") + # Add '-1' as a possible category + if isinstance(values_expected, dict): + indicator["values_expected"]["-1"] = "-1" + else: + values_expected |= {"-1"} + else: + assert not tb_[indicator["name"]].isna().any(), f"NA found in {indicator['name']}!" + + values_found = set(tb_[indicator["name"]].unique()) + assert values_found == set( + values_expected + ), f"Error for indicator `{indicator['name']}`. Expected {set(values_expected)} but found {values_found}" + + # Rename dimension values + if isinstance(values_expected, dict): + tb_[indicator["name"]] = tb_[indicator["name"]].map(indicator["values_expected"]) + + ## Rename columns + tb_ = tb_.rename( + columns={indicator["name"]: indicator.get("name_new", indicator["name"]) for indicator in indicators} + ) + indicator_names = [indicator.get("name_new", indicator["name"]) for indicator in indicators] + + ## Get dummy indicator table + tb_ = cast(Table, pd.get_dummies(tb_, dummy_na=True, columns=indicator_names, dtype=int, prefix_sep=separator)) + + ## Add missing metadata to dummy indicators + dummy_cols = [] + for indicator in indicators: + name_new = indicator.get("name_new", indicator["name"]) + ## get list of dummy indicator column names + if isinstance(indicator["values_expected"], dict): + dummy_columns = [f"{name_new}{separator}{v}" for v in indicator["values_expected"].values()] + else: + dummy_columns = [f"{name_new}{separator}{v}" for v in indicator["values_expected"]] + ## assign metadata to dummy column indicators + for col in dummy_columns: + tb_[col].metadata = tb[indicator["name"]].metadata + dummy_cols.extend(dummy_columns) + + ### Select subset of columns + tb_ = tb_.loc[:, ["year", "country"] + dummy_cols] + + return tb_ + + +def add_regions_and_global_aggregates( + tb: Table, + 
+ ds_regions: Dataset, + regions: Optional[Dict[str, Any]] = None, + aggregations: Optional[Dict[str, str]] = None, + min_num_values_per_year: Optional[int] = None, + aggregations_world: Optional[Dict[str, str]] = None, + short_name: str = "region_counts", +) -> Table: + """Add regions, and world aggregates.""" + # Copy + tb_ = tb.copy() + + # Regions considered + if regions is None: + regions = REGIONS + + # Add regions + tb_regions = geo.add_regions_to_table( + tb_.copy(), + ds_regions, + regions=regions, + aggregations=aggregations, + min_num_values_per_year=min_num_values_per_year, + ) + tb_regions = tb_regions.loc[tb_regions["country"].isin(regions.keys())] + + # Add world + if aggregations_world is None: + tb_world = tb.groupby("year", as_index=False).sum(numeric_only=True, min_count=1).assign(country="World") + else: + tb_world = tb.groupby("year", as_index=False).agg(aggregations_world).assign(country="World") + tb = concat([tb_regions, tb_world], ignore_index=True, short_name=short_name) + + return tb + + +def add_count_years_in_regime( + tb: Table, + columns: List[Tuple[str, str, int]], + na_is_zero: bool = False, +) -> Table: + """Add years in a certain regime. + + Two types of counters are generated: + - Age: Number of years consecutively with a certain regime type. + - Experience: Number of years with a certain regime type. + + columns: List of tuples with 3 elements: (colname, col_newname, threshold). + na_is_zero: NaN values (i.e. can't be classified based on the threshold) are classified into the negative class (i.e. 0). + """ + + def _count_years_in_regime(tb, col, col_new, th, na_is_zero=False): + """Groups are created as (-inf, th] and (th, inf). + + If NaN is found, we assume that the country is not in the regime.
+ """ + col_th = "thresholded" + + tb[col_th] = pd.cut(tb[col], bins=[-float("inf"), th, float("inf")], labels=[0, 1]).astype("Int64") + if na_is_zero: + tb[col_th] = tb[col_th].fillna(0) + + # Add age of democracy + tb[f"age_{col_new}"] = tb.groupby(["country", tb[col_th].fillna(0).eq(0).cumsum()])[col_th].cumsum().astype(int) + tb[f"age_{col_new}"] = tb[f"age_{col_new}"].copy_metadata(tb[col]) + # Add experience with democracy + tb[f"experience_{col_new}"] = tb.groupby("country")[col_th].cumsum().astype(int) + tb[f"experience_{col_new}"] = tb[f"age_{col_new}"].copy_metadata(tb[col]) + # Sanity check + assert (tb.loc[tb[col_th] == 1, f"age_{col_new}"] != 0).all(), "Negative age found!" + assert (tb.loc[tb[col_th] == 1, f"experience_{col_new}"] != 0).all(), "Negative age found!" + # Drop unused columns + tb = tb.drop(columns=[col_th]) + return tb + + if columns: + for col in columns: + assert len(col) == 3, "Columns should be a list of tuples with 3 elements: (colname, col_newname, col_th)" + tb = _count_years_in_regime(tb, *col, na_is_zero=na_is_zero) + return tb + + +def add_age_groups( + tb: Table, + column: str, + column_raw: str, + threshold: int, + category_names: Dict[Any, str], + age_bins: List[int | float] | None = None, +) -> Table: + """Create category for `column`.""" + column_new = f"group_{column}" + + if age_bins is None: + age_bins = [0, 18, 30, 60, 90, float("inf")] + + # Create age group labels + assert len(age_bins) > 1, "There should be at least two age groups." 
+ labels = [] + for i in range(len(age_bins) - 1): + labels.append(f"{age_bins[i]+1}-{age_bins[i+1]} years".replace("-inf", "+")) + + # Create variable for age group of electoral demcoracies + tb[column_new] = pd.cut( + tb[column], + bins=age_bins, + labels=labels, + ).astype("string") + + # Add additional categories + for regime_id, regime_name in category_names.items(): + if regime_id > threshold: + break + tb.loc[(tb[column_raw] == regime_id) & tb[column_new].isna(), column_new] = regime_name + + # Copy metadata + tb[column_new] = tb[column_new].copy_metadata(tb[column]) + return tb + + +def add_imputes( + tb: Table, + path: Path, + cols_verify: List[str] | None = None, + col_flag_imputed: str | None = None, + verify_integrity: bool = True, +) -> Table: + """Add imputed values to the table. + + Imputed values are inferred from historical equivalents. + + Example: Was "Eritrea" a democracy in 1993? + + - We can infer this from "Ethiopia (former)" (historical equivalent). You can see all these mappings in bmr.countries_impute.yml file. + + - This is useful to (i) be able to colour these world regions in grapher map charts, and (ii) to be able to count the number of people living in democracy (in `make_tables_population_counters`). + + - Note that these "imputed country values" are ignored when estimating the number of countries in democracies (function `make_tables_country_counters`), since these countries did not exist at the time! 
+ """ + tb_ = tb.copy() + + if col_flag_imputed is None: + col_flag_imputed = "values_imputed" + + if cols_verify is None: + cols_verify = ["country", "year"] + + # Load impute data + countries_impute = yaml.safe_load(path.read_text()) + + # Drop known values that are not correct + + tb_imputed = [] + for impute in countries_impute: + # Get relevant rows + tb_imp_ = tb_.loc[ + (tb_["country"] == impute["country_impute"]) + & (tb_["year"] >= impute.get("year_min", 99999)) + & (tb_["year"] <= impute.get("year_max", -99999)) + ].copy() + # Sanity checks + assert tb_imp_.shape[0] > 0, f"No data found for {impute['country_impute']}" + assert tb_imp_["year"].max() == impute["year_max"], f"Missing years (max check) for {impute['country_impute']}" + assert (a := tb_imp_["year"].min()) == ( + b := impute["year_min"] + ), f"Missing years (min check) for {impute['country']} imputed from {impute['country_impute']}: {a} != {b}" + + # Tweak them + # tb_ = tb_.rename( + # columns={ + # "country": "regime_imputed_country", + # } + # ) + tb_imp_[col_flag_imputed] = True + + # Different behaviour depending whether we have a list of countries or a single country to impute + if isinstance(impute["country"], list): + for country in impute["country"]: + tb_imp_["country"] = country + tb_imputed.append(tb_imp_.copy()) + else: + tb_imp_["country"] = impute["country"] + tb_imputed.append(tb_imp_) + + tb_ = concat(tb_imputed + [tb_], ignore_index=True) + + # Set to False by default (for non-imputed countries) + tb_[col_flag_imputed] = tb_[col_flag_imputed].fillna(False).astype(bool) + + # Re-order columns + # cols = [ + # "country", + # "year", + # "regime", + # "regime_womsuffr", + # "regime_imputed_country", + # "regime_imputed", + # ] + # tb_ = cast(Table, tb_[cols]) + + # Verify that there are no duplicates + if verify_integrity: + tb_ = tb_.set_index(cols_verify, verify_integrity=True).sort_index().reset_index() + return tb_ diff --git 
a/etl/steps/data/grapher/democracy/2025-03-05/eiu.py b/etl/steps/data/grapher/democracy/2025-03-05/eiu.py new file mode 100644 index 00000000000..2c4f25b1160 --- /dev/null +++ b/etl/steps/data/grapher/democracy/2025-03-05/eiu.py @@ -0,0 +1,34 @@ +"""Load a garden dataset and create a grapher dataset.""" + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. + # + # Load garden dataset. + ds_garden = paths.load_dataset("eiu") + + # + # Process data. + # + tables = [ + ds_garden["eiu"], + ds_garden["num_countries"], + ds_garden["num_people"], + ds_garden["avg_pop"], + ] + # + # Save outputs. + # + # Create a new grapher dataset with the same metadata as the garden dataset. + ds_grapher = create_dataset( + dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_garden.metadata + ) + + # Save changes in the new grapher dataset. + ds_grapher.save() diff --git a/etl/steps/data/meadow/democracy/2025-03-05/eiu.py b/etl/steps/data/meadow/democracy/2025-03-05/eiu.py new file mode 100644 index 00000000000..1e9b05f5941 --- /dev/null +++ b/etl/steps/data/meadow/democracy/2025-03-05/eiu.py @@ -0,0 +1,123 @@ +"""Load a snapshot and create a meadow dataset.""" + +from owid.catalog.tables import Table, concat + +from etl.helpers import PathFinder, create_dataset + +# Get paths and naming conventions for current step. +paths = PathFinder(__file__) + + +def run(dest_dir: str) -> None: + # + # Load inputs. 
+ # + + # Retieve data from Gapminder + snap = paths.load_snapshot("eiu_gapminder.csv") + tb_gm = snap.read(safe_types=False) + + # Retrieve data from EIU (single year reports) + shortnames = [ + # "eiu_gapminder", + "eiu_2021", + "eiu_2022", + "eiu_2023", + ] + tbs = [] + for name in shortnames: + snap = paths.load_snapshot(f"{name}.csv") + tb = snap.read(safe_types=False) + tbs.append(tb) + + # Correct data by Gapminder + ## Gapminder multiplies all values by ten. + cols = [ + "democracy_eiu", + "elect_freefair_eiu", + "funct_gov_eiu", + "pol_part_eiu", + "dem_culture_eiu", + "civlib_eiu", + ] + tb_gm[cols] = tb_gm[cols] / 10 + + ## Add missing data + tb_gm = add_datapoints(tb_gm) + + # Concatenate all tables. + tbs.append(tb_gm) + tb = concat(tbs, ignore_index=True, short_name="eiu") + + # + # Process data. + # + tb = tb.rename( + columns={ + "country_name": "country", + } + ) + + tb["rank_eiu"] = tb["rank_eiu"].str.replace("=", "") + tb["rank_eiu"] = tb["rank_eiu"].astype("float") + + # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. + tb = tb.format(["country", "year"]) + + # + # Save outputs. + # + # Create a new meadow dataset with the same metadata as the snapshot. + ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + + # Save changes in the new meadow dataset. 
+ ds_meadow.save() + + +def add_datapoints(tb: Table) -> Table: + """Add missing datapoints in Gapminder data.""" + # Define records + records = [ + { + "country_name": "Algeria", + "democracy_eiu": 3.77, + "elect_freefair_eiu": 3.08, + "funct_gov_eiu": 2.5, + "pol_part_eiu": 4.44, + "dem_culture_eiu": 5, + "civlib_eiu": 3.82, + }, + { + "country_name": "Iran", + "democracy_eiu": 2.2, + "elect_freefair_eiu": 0, + "funct_gov_eiu": 2.5, + "pol_part_eiu": 3.89, + "dem_culture_eiu": 3.13, + "civlib_eiu": 1.47, + }, + { + "country_name": "Lithuania", + "democracy_eiu": 7.13, + "elect_freefair_eiu": 9.58, + "funct_gov_eiu": 6.07, + "pol_part_eiu": 5.56, + "dem_culture_eiu": 5.63, + "civlib_eiu": 8.82, + }, + { + "country_name": "Ukraine", + "democracy_eiu": 5.81, + "elect_freefair_eiu": 8.25, + "funct_gov_eiu": 2.71, + "pol_part_eiu": 7.22, + "dem_culture_eiu": 5, + "civlib_eiu": 5.88, + }, + ] + tb_ext = Table.from_records(records).assign(year=2020) + + # Add to main table + tb = concat([tb, tb_ext], ignore_index=True, short_name=tb.m.short_name) + + return tb From 44527391e4de92aeb6bebe8ff3986f3d04b147cc Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 5 Mar 2025 22:28:27 +0100 Subject: [PATCH 04/10] add quotation mark for snapshot name --- ...ookiecutter.short_name}}.{{cookiecutter.file_extension}}.dvc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/wizard/etl_steps/cookiecutter/snapshot/{{cookiecutter.namespace}}/{{cookiecutter.snapshot_version}}/{{cookiecutter.short_name}}.{{cookiecutter.file_extension}}.dvc b/apps/wizard/etl_steps/cookiecutter/snapshot/{{cookiecutter.namespace}}/{{cookiecutter.snapshot_version}}/{{cookiecutter.short_name}}.{{cookiecutter.file_extension}}.dvc index 7c1cb910983..71748b284fc 100644 --- a/apps/wizard/etl_steps/cookiecutter/snapshot/{{cookiecutter.namespace}}/{{cookiecutter.snapshot_version}}/{{cookiecutter.short_name}}.{{cookiecutter.file_extension}}.dvc +++ 
b/apps/wizard/etl_steps/cookiecutter/snapshot/{{cookiecutter.namespace}}/{{cookiecutter.snapshot_version}}/{{cookiecutter.short_name}}.{{cookiecutter.file_extension}}.dvc @@ -3,7 +3,7 @@ meta: origin: # Data product / Snapshot - title: {{cookiecutter.title}} + title: "{{cookiecutter.title}}" {%- if cookiecutter.description %} description: |- {{cookiecutter.description.replace("\n", "\n ")}} From 146e27febd61373f1d06b0a3e34997521bca61d6 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 5 Mar 2025 22:28:47 +0100 Subject: [PATCH 05/10] add snapshot --- .../democracy/2025-03-05/eiu_2024.csv.dvc | 28 +++++++++++++++++ snapshots/democracy/2025-03-05/eiu_2024.py | 30 +++++++++++++++++++ 2 files changed, 58 insertions(+) create mode 100644 snapshots/democracy/2025-03-05/eiu_2024.csv.dvc create mode 100644 snapshots/democracy/2025-03-05/eiu_2024.py diff --git a/snapshots/democracy/2025-03-05/eiu_2024.csv.dvc b/snapshots/democracy/2025-03-05/eiu_2024.csv.dvc new file mode 100644 index 00000000000..fa4c26c9400 --- /dev/null +++ b/snapshots/democracy/2025-03-05/eiu_2024.csv.dvc @@ -0,0 +1,28 @@ +# Learn more at: +# http://docs.owid.io/projects/etl/architecture/metadata/reference/ +meta: + origin: + # Data product / Snapshot + title: "Democracy Index 2024: What's wrong with representative democracy?" + description: |- + The Economist Intelligence Unit's Democracy Index provides a snapshot of the state of democracy in 165 independent states and two territories. This covers almost the entire population of the world and the vast majority of the world's states (microstates are excluded). Scored on a 0-10 scale, the Democracy Index is based on five categories: electoral process and pluralism, functioning of government, political participation, political culture, and civil liberties. 
Based on its scores on a range of indicators within these categories, each country is classified as one of four types of regime: "full democracy", "flawed democracy", "hybrid regime" or "authoritarian regime". + + This edition of the Democracy Index examines the state of global democracy in 2024. The focus of this year's report is why representative democracy is not working for large numbers of citizens around the world. There is a growing consensus that the democratic model developed over the past century is in trouble, but there is less clarity about why people are so disenchanted with their democracies. In 2024, when countries inhabited by more than half of the global population went to the polls, popular disaffection with the performance of government was expressed in an anti-incumbent backlash and rising support for populist insurgents. + date_published: "2025-02-27" + # Citation + producer: Economist Intelligence Unit + citation_full: |- + Economist Intelligence Unit. (2025, February 27). Democracy Index 2024: What's wrong with representative democracy? Retrieved from Economist Group. + + # Files + url_main: https://www.eiu.com/n/campaigns/democracy-index-2024/ + date_accessed: 2025-03-05 + + # License + license: + name: The Economist Intelligence Unit Limited + url: https://www.eiu.com/n/terms/ +outs: + - md5: 0c518b869c0f2aca57ec6f65b45351bb + size: 7005 + path: eiu_2024.csv diff --git a/snapshots/democracy/2025-03-05/eiu_2024.py b/snapshots/democracy/2025-03-05/eiu_2024.py new file mode 100644 index 00000000000..99d612360d5 --- /dev/null +++ b/snapshots/democracy/2025-03-05/eiu_2024.py @@ -0,0 +1,30 @@ +"""The data from the EIU is not shared in a machine-readable format. Instead, the EIU shares a single-year PDF report every year. + +To overcome this we snapshot yearly reports (see snapshots/democracy/2024-05-22/eiu_dem_index.py for more details). 
+ +All these trancriptions and imports are saved in a Google sheet: https://docs.google.com/spreadsheets/d/1902iwPdR-PKjmpONceb1u9h2GzR-9Kzac4C9cnNDcHo/edit?usp=sharing. +""" + +from pathlib import Path + +import click + +from etl.snapshot import Snapshot + +# Version for current snapshot dataset. +SNAPSHOT_VERSION = Path(__file__).parent.name + + +@click.command() +@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot") +@click.option("--path-to-file", "-f", prompt=True, type=str, help="Path to local data file.") +def main(path_to_file: str, upload: bool) -> None: + # Create a new snapshot. + snap = Snapshot(f"democracy/{SNAPSHOT_VERSION}/eiu_2024.csv") + + # Copy local data file to snapshots data folder, add file to DVC and upload to S3. + snap.create_snapshot(filename=path_to_file, upload=upload) + + +if __name__ == "__main__": + main() From 70b3bddfe4886ac9e43ffb0869352bd35e4d5a5a Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 5 Mar 2025 23:06:15 +0100 Subject: [PATCH 06/10] add missing column in snapshot --- snapshots/democracy/2025-03-05/eiu_2024.csv.dvc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshots/democracy/2025-03-05/eiu_2024.csv.dvc b/snapshots/democracy/2025-03-05/eiu_2024.csv.dvc index fa4c26c9400..02f7bdab0c2 100644 --- a/snapshots/democracy/2025-03-05/eiu_2024.csv.dvc +++ b/snapshots/democracy/2025-03-05/eiu_2024.csv.dvc @@ -23,6 +23,6 @@ meta: name: The Economist Intelligence Unit Limited url: https://www.eiu.com/n/terms/ outs: - - md5: 0c518b869c0f2aca57ec6f65b45351bb - size: 7005 + - md5: 0bd087f70b3c7c92ee40ca2749cd9ed8 + size: 7836 path: eiu_2024.csv From 703409d8ea41b6d9a77c735dbdb111e8b372bc16 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 5 Mar 2025 23:06:25 +0100 Subject: [PATCH 07/10] meadow --- .../data/meadow/democracy/2025-03-05/eiu.py | 50 ++++++++++++------- 1 file changed, 32 insertions(+), 18 deletions(-) diff --git 
a/etl/steps/data/meadow/democracy/2025-03-05/eiu.py b/etl/steps/data/meadow/democracy/2025-03-05/eiu.py index 1e9b05f5941..dd130af6e7a 100644 --- a/etl/steps/data/meadow/democracy/2025-03-05/eiu.py +++ b/etl/steps/data/meadow/democracy/2025-03-05/eiu.py @@ -2,27 +2,26 @@ from owid.catalog.tables import Table, concat -from etl.helpers import PathFinder, create_dataset +from etl.helpers import PathFinder # Get paths and naming conventions for current step. paths = PathFinder(__file__) -def run(dest_dir: str) -> None: +def run() -> None: # # Load inputs. # - - # Retieve data from Gapminder + # Retrieve data from Gapminder snap = paths.load_snapshot("eiu_gapminder.csv") tb_gm = snap.read(safe_types=False) # Retrieve data from EIU (single year reports) shortnames = [ - # "eiu_gapminder", "eiu_2021", "eiu_2022", "eiu_2023", + "eiu_2024", ] tbs = [] for name in shortnames: @@ -31,18 +30,9 @@ def run(dest_dir: str) -> None: tbs.append(tb) # Correct data by Gapminder - ## Gapminder multiplies all values by ten. - cols = [ - "democracy_eiu", - "elect_freefair_eiu", - "funct_gov_eiu", - "pol_part_eiu", - "dem_culture_eiu", - "civlib_eiu", - ] - tb_gm[cols] = tb_gm[cols] / 10 + tb_gm = scale_indicators_gm(tb_gm) - ## Add missing data + ## Add missing data in Gapminder tb_gm = add_datapoints(tb_gm) # Concatenate all tables. @@ -52,14 +42,19 @@ def run(dest_dir: str) -> None: # # Process data. # + # Rename country column tb = tb.rename( columns={ "country_name": "country", } ) + # Drop rows if country is NA + tb = tb.dropna(subset=["country"]) + + # Fix type of rank tb["rank_eiu"] = tb["rank_eiu"].str.replace("=", "") - tb["rank_eiu"] = tb["rank_eiu"].astype("float") + tb["rank_eiu"] = tb["rank_eiu"].astype("UInt16") # Ensure all columns are snake-case, set an appropriate index, and sort conveniently. tb = tb.format(["country", "year"]) @@ -68,12 +63,31 @@ def run(dest_dir: str) -> None: # Save outputs. # # Create a new meadow dataset with the same metadata as the snapshot. 
- ds_meadow = create_dataset(dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=snap.metadata) + ds_meadow = paths.create_dataset( + tables=[tb], + check_variables_metadata=True, + default_metadata=snap.metadata, + ) # Save changes in the new meadow dataset. ds_meadow.save() +def scale_indicators_gm(tb): + """Gapminder multiplies all values by ten.""" + cols = [ + "democracy_eiu", + "elect_freefair_eiu", + "funct_gov_eiu", + "pol_part_eiu", + "dem_culture_eiu", + "civlib_eiu", + ] + tb[cols] = tb[cols] / 10 + + return tb + + def add_datapoints(tb: Table) -> Table: """Add missing datapoints in Gapminder data.""" # Define records From ce11445097517103659e20352d99365c66a1d249 Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Wed, 5 Mar 2025 23:06:38 +0100 Subject: [PATCH 08/10] garden --- .../garden/democracy/2025-03-05/eiu.meta.yml | 14 +- .../data/garden/democracy/2025-03-05/eiu.py | 30 +- etl/steps/data/garden/democracy/shared.py | 489 ++++++++++++++++++ 3 files changed, 509 insertions(+), 24 deletions(-) create mode 100644 etl/steps/data/garden/democracy/shared.py diff --git a/etl/steps/data/garden/democracy/2025-03-05/eiu.meta.yml b/etl/steps/data/garden/democracy/2025-03-05/eiu.meta.yml index 320997b8c70..34581ce3b52 100644 --- a/etl/steps/data/garden/democracy/2025-03-05/eiu.meta.yml +++ b/etl/steps/data/garden/democracy/2025-03-05/eiu.meta.yml @@ -5,26 +5,23 @@ definitions: topic_tags: - Democracy - # Learn more about the available fields: # http://docs.owid.io/projects/etl/architecture/metadata/reference/ dataset: update_period_days: 365 - title: EIU Democratic Index (2006-2023) + title: EIU Democratic Index (2006-2024) description: |- The Economist Intelligence Unit's (EIU) Democracy Index provides a snapshot of the state of democracy in 165 independent states and two territories. This covers almost the entire population of the world and the vast majority of the world's states (microstates are excluded). 
Scored on a 0-10 scale, the Democracy Index is based on five categories: electoral process and pluralism, functioning of government, political participation, political culture, and civil liberties. Based on its scores on a range of indicators within these categories, each country is classified as one of four types of regime: "full democracy", "flawed democracy", "hybrid regime" or "authoritarian regime". This dataset was constructed using data collected from various EIU yearly reports and a multi-year compilation by Gapminder. - tables: eiu: common: presentation: - attribution: Economist Intelligence Unit (2006-2023) + attribution: Economist Intelligence Unit (2006-2024) variables: - democracy_eiu: title: Democracy score description_short: |- @@ -77,7 +74,7 @@ tables: num_countries: common: presentation: - attribution: Economist Intelligence Unit (2006-2023) + attribution: Economist Intelligence Unit (2006-2024) variables: num_regime_eiu: @@ -92,7 +89,7 @@ tables: num_people: common: presentation: - attribution: Economist Intelligence Unit (2006-2023); Population based on various sources (2023) + attribution: Economist Intelligence Unit (2006-2024); Population based on various sources (2024) variables: pop_regime_eiu: title: |- @@ -104,11 +101,10 @@ tables: description_short: "Number of countries with available data." 
unit: "people" - avg_pop: common: presentation: - attribution: Economist Intelligence Unit (2006-2023); Population based on various sources (2023) + attribution: Economist Intelligence Unit (2006-2024); Population based on various sources (2024) variables: democracy_eiu_weighted: title: "Democracy score (population-weighted)" diff --git a/etl/steps/data/garden/democracy/2025-03-05/eiu.py b/etl/steps/data/garden/democracy/2025-03-05/eiu.py index 5c67de93a7a..c919bd94e6f 100644 --- a/etl/steps/data/garden/democracy/2025-03-05/eiu.py +++ b/etl/steps/data/garden/democracy/2025-03-05/eiu.py @@ -1,12 +1,15 @@ """Load a meadow dataset and create a garden dataset.""" -from typing import Tuple, cast +from typing import Tuple import numpy as np import pandas as pd from owid.catalog import Dataset, Table from owid.catalog.tables import concat -from shared import ( + +from etl.data_helpers import geo +from etl.helpers import PathFinder +from etl.steps.data.garden.democracy.shared import ( add_population_in_dummies, add_regions_and_global_aggregates, expand_observations, @@ -14,9 +17,6 @@ make_table_with_dummies, ) -from etl.data_helpers import geo -from etl.helpers import PathFinder, create_dataset - # Get paths and naming conventions for current step. paths = PathFinder(__file__) # Missing classifications of states @@ -30,10 +30,10 @@ } # Year range YEAR_MIN = 2006 -YEAR_MAX = 2023 +YEAR_MAX = 2025 -def run(dest_dir: str) -> None: +def run() -> None: # # Load inputs. # @@ -43,7 +43,7 @@ def run(dest_dir: str) -> None: ds_population = paths.load_dataset("population") # Read table from meadow dataset. - tb = ds_meadow["eiu"].reset_index() + tb = ds_meadow.read("eiu") # # Process data. 
@@ -54,12 +54,12 @@ def run(dest_dir: str) -> None: ) # Remove years with interpolated data (2007 and 2009 are interpolated by Gapminder) - tb = tb[~tb["year"].isin([2007, 2009])] + tb = tb.loc[~tb["year"].isin([2007, 2009])] # Drop rank column tb = tb.drop(columns=["rank_eiu"]) - tb = cast(Table, tb) + # Add regime identifier tb = add_regime_identifier(tb) ################################################## @@ -67,7 +67,7 @@ def run(dest_dir: str) -> None: # Get country-count-related data: country-averages, number of countries, ... tb_num_countries, tb_avg_countries = get_country_data(tb, ds_regions) - # Get population-related data: population-weighed averages, people livin in ... + # Get population-related data: population-weighed averages, people living in ... tb_num_people, tb_avg_w_countries = get_population_data(tb, ds_regions, ds_population) ################################################## @@ -85,8 +85,10 @@ def run(dest_dir: str) -> None: ] # Create a new garden dataset with the same metadata as the meadow dataset. - ds_garden = create_dataset( - dest_dir, tables=tables, check_variables_metadata=True, default_metadata=ds_meadow.metadata + ds_garden = paths.create_dataset( + tables=tables, + check_variables_metadata=True, + default_metadata=ds_meadow.metadata, ) # Save changes in the new garden dataset. @@ -142,7 +144,6 @@ def get_country_data(tb: Table, ds_regions: Dataset) -> Tuple[Table, Table]: "regime_eiu": "Int64", } ) - tb_num = cast(Table, tb_num) # Define columns on which we will estimate (i) "number of countries" and (ii) "number of people living in ..." 
indicators = [ @@ -212,7 +213,6 @@ def get_population_data(tb: Table, ds_regions: Dataset, ds_population: Dataset) "regime_eiu": "Int64", } ) - tb_ppl = cast(Table, tb_ppl) indicators = [ { diff --git a/etl/steps/data/garden/democracy/shared.py b/etl/steps/data/garden/democracy/shared.py new file mode 100644 index 00000000000..9bcc0b6413a --- /dev/null +++ b/etl/steps/data/garden/democracy/shared.py @@ -0,0 +1,489 @@ +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, cast + +import numpy as np +import pandas as pd +import yaml +from owid.catalog import Dataset, Table +from owid.catalog.tables import concat + +from etl.data_helpers import geo + +SEPARATOR = "." +# REGION AGGREGATES +REGIONS = { + "Africa": { + "additional_members": [ + "Somaliland", + "Zanzibar", + ] + }, + "Asia": { + "additional_members": [ + "Palestine/Gaza", + "Palestine/West Bank", + ] + }, + "North America": {}, + "South America": {}, + "Europe": { + "additional_members": [ + "Baden", + "Bavaria", + "Brunswick", + "Duchy of Nassau", + "Hamburg", + "Hanover", + "Hesse Electoral", + "Hesse Grand Ducal", + "Mecklenburg Schwerin", + "Modena", + "Oldenburg", + "Parma", + "Piedmont-Sardinia", + "Saxe-Weimar-Eisenach", + "Saxony", + "Tuscany", + "Two Sicilies", + "Wurttemberg", + ] + }, + "Oceania": {}, +} + + +def from_wide_to_long( + tb: Table, + indicator_name_callback: Optional[Callable] = None, + indicator_category_callback: Optional[Callable] = None, + column_dimension_name: str = "category", + separator: str = SEPARATOR, +) -> Table: + """Format a particular shape of table from wide to long format. + + tb: Table with wide format. + indicator_name_callback: Function to extract the indicator name from the column name. + indicator_category_callback: Function to extract the indicator category from the column name. 
+ + If no `indicator_name_callback` and `indicator_category_callback` are provided, it proceed expects the following input: + + | year | country | indicator_a_1 | indicator_a_2 | indicator_b_1 | indicator_b_2 | + |------|---------|---------------|---------------|---------------|---------------| + | 2000 | USA | 1 | 2 | 3 | 4 | + | 2000 | CAN | 5 | 6 | 7 | 8 | + + and then generates the output: + + | year | country | category | indicator_a | indicator_b | + |------|---------|------------|-------------|-------------| + | 2000 | USA | category_1 | 1 | 3 | + | 2000 | USA | category_2 | 2 | 4 | + """ + tb_ = tb.copy() + + # Melt the DataFrame to long format + tb_ = tb_.melt(id_vars=["year", "country"], var_name="indicator_type", value_name="value") + + # Get callables + if indicator_name_callback is None: + + def default_indicator_name(x): + assert len(x.split(separator)) == 2 + return x.split(separator)[0] + + indicator_name_callback = default_indicator_name + + if indicator_category_callback is None: + + def default_indicator_category(x): + assert len(x.split(separator)) == 2 + return x.split(separator)[-1] + + indicator_category_callback = default_indicator_category + + # Extract indicator names and types + tb_["indicator"] = tb_["indicator_type"].apply(indicator_name_callback) + tb_[column_dimension_name] = tb_["indicator_type"].apply(indicator_category_callback) + + # Drop the original 'indicator_type' column as it's no longer needed + tb_.drop("indicator_type", axis=1, inplace=True) + + # Pivot the table to get 'indicator_a' and 'indicator_b' as separate columns + tb_ = tb_.pivot(index=["year", "country", column_dimension_name], columns="indicator", values="value").reset_index() + + # Rename the columns to match your requirements + tb_.columns.name = None # Remove the hierarchy + + return tb_ + + +def expand_observations(tb: Table, regions: Set | None = None) -> Table: + """Expand to have a row per (year, country).""" + # Add missing years for each triplet 
("warcode", "campcode", "ccode") + + # List of countries + if regions is None: + regions = set(tb["country"]) + + # List of possible years + years = np.arange(tb["year"].min(), tb["year"].max() + 1) + + # New index + new_idx = pd.MultiIndex.from_product([years, regions], names=["year", "country"]) + + # Reset index + tb = tb.set_index(["year", "country"]).reindex(new_idx).reset_index() + + # Type of `year` + tb["year"] = tb["year"].astype("int") + return tb + + +def add_population_in_dummies( + tb: Table, + ds_population: Dataset, + expected_countries_without_population: Optional[List[str]] = None, + drop_population: bool = True, +): + # Add population column + tb = geo.add_population_to_table( + tb, + ds_population, + interpolate_missing_population=True, + expected_countries_without_population=expected_countries_without_population, + ) + tb = cast(Table, tb.dropna(subset="population")) + # Add metadata (origins combined indicator+population) + cols = [col for col in tb.columns if col not in ["year", "country", "population"]] + meta = {col: tb[col].metadata for col in cols} | {"population": tb["population"].metadata} + ## Encode population in indicators: Population if 1, 0 otherwise + tb[cols] = tb[cols].multiply(tb["population"], axis=0) + if drop_population: + tb = tb.drop(columns="population") + ## Add metadata back (combine origins from population) + for col in cols: + metadata = meta[col] + metadata.origins += meta["population"].origins + tb[col].metadata = meta[col] + + return tb + + +def make_table_with_dummies( + tb: Table, + indicators: List[Dict[str, Any]], + separator: str = SEPARATOR, +) -> Table: + """Format table to have dummy indicators. + + From a table with categorical indicators, create a new table with dummy indicator for each indicator-category pair. 
+ + Example input: + + | year | country | regime | regime_amb | + |------|---------|-----------|------------| + | 2000 | USA | 1 | 0 | + | 2000 | CAN | 0 | 1 | + | 2000 | DEU | NaN | NaN | + + + Example output: + + | year | country | regime.0 | regime.1 | regime.-1 | regime_amb.0 | regime_amb.0 | regime_amb.-1 | + |------|---------|----------|----------|-----------|--------------|--------------|---------------| + | 2000 | USA | 0 | 1 | 0 | 1 | 0 | 0 | + | 2000 | CAN | 1 | 0 | 0 | 0 | 1 | 0 | + | 2000 | DEU | 0 | 0 | 1 | 0 | 0 | 1 | + + Note that '-1' denotes NA (missing value) category. + + The argument `indicators` contains the indicators for which we will create dummies, along with other associated paramters. Example: + + { + "name": "regime_amb_row_owid", + "name_new": "num_countries_regime_amb", + # "values_expected": set(map(str, range(10))), + "values_expected": { + "0": "closed autocracy", + "1": "closed (maybe electoral) autocracy", + "2": "electoral (maybe closed) autocracy", + "3": "electoral autocracy", + "4": "electoral autocracy (maybe electoral democracy)", + "5": "electoral democracy (maybe electoral autocracy)", + "6": "electoral democracy", + "7": "electoral democracy (maybe liberal democracy)", + "8": "liberal democracy (maybe electoral democracy)", + "9": "liberal democracy", + }, + "has_na": True, + } + """ + tb_ = tb.copy() + + # Convert to string + indicator_names = [indicator["name"] for indicator in indicators] + tb_[indicator_names] = tb_[indicator_names].astype("string") + + # Sanity check that the categories for each indicator are as expected + for indicator in indicators: + values_expected = indicator["values_expected"] + # Check and fix NA (convert NAs to -1 category) + if indicator["has_na"]: + # Assert that there are actually NaNs + assert tb_[indicator["name"]].isna().any(), f"No NA found in {indicator['name']}!" + # If NA, we should not have category '-1', otherwise these would get merged! 
+ assert "-1" not in set( + tb_[indicator["name"]].unique() + ), f"Error for indicator `{indicator['name']}`. Found -1, which is not allowed when `has_na=True`!" + tb_[indicator["name"]] = tb_[indicator["name"]].fillna("-1") + # Add '-1' as a possible category + if isinstance(values_expected, dict): + indicator["values_expected"]["-1"] = "-1" + else: + values_expected |= {"-1"} + else: + assert not tb_[indicator["name"]].isna().any(), f"NA found in {indicator['name']}!" + + values_found = set(tb_[indicator["name"]].unique()) + assert values_found == set( + values_expected + ), f"Error for indicator `{indicator['name']}`. Expected {set(values_expected)} but found {values_found}" + + # Rename dimension values + if isinstance(values_expected, dict): + tb_[indicator["name"]] = tb_[indicator["name"]].map(indicator["values_expected"]) + + ## Rename columns + tb_ = tb_.rename( + columns={indicator["name"]: indicator.get("name_new", indicator["name"]) for indicator in indicators} + ) + indicator_names = [indicator.get("name_new", indicator["name"]) for indicator in indicators] + + ## Get dummy indicator table + tb_ = cast(Table, pd.get_dummies(tb_, dummy_na=True, columns=indicator_names, dtype=int, prefix_sep=separator)) + + ## Add missing metadata to dummy indicators + dummy_cols = [] + for indicator in indicators: + name_new = indicator.get("name_new", indicator["name"]) + ## get list of dummy indicator column names + if isinstance(indicator["values_expected"], dict): + dummy_columns = [f"{name_new}{separator}{v}" for v in indicator["values_expected"].values()] + else: + dummy_columns = [f"{name_new}{separator}{v}" for v in indicator["values_expected"]] + ## assign metadata to dummy column indicators + for col in dummy_columns: + tb_[col].metadata = tb[indicator["name"]].metadata + dummy_cols.extend(dummy_columns) + + ### Select subset of columns + tb_ = tb_.loc[:, ["year", "country"] + dummy_cols] + + return tb_ + + +def add_regions_and_global_aggregates( + tb: Table, + 
ds_regions: Dataset, + regions: Optional[Dict[str, Any]] = None, + aggregations: Optional[Dict[str, str]] = None, + min_num_values_per_year: Optional[int] = None, + aggregations_world: Optional[Dict[str, str]] = None, + short_name: str = "region_counts", +) -> Table: + """Add regions, and world aggregates. `short_name` is forwarded to `concat` for the resulting table.""" + # Copy + tb_ = tb.copy() + + # Regions considered + if regions is None: + regions = REGIONS + + # Add regions + tb_regions = geo.add_regions_to_table( + tb_.copy(), + ds_regions, + regions=regions, + aggregations=aggregations, + min_num_values_per_year=min_num_values_per_year, + ) + tb_regions = tb_regions.loc[tb_regions["country"].isin(regions.keys())] + + # Add world + if aggregations_world is None: + tb_world = tb.groupby("year", as_index=False).sum(numeric_only=True, min_count=1).assign(country="World") + else: + tb_world = tb.groupby("year", as_index=False).agg(aggregations_world).assign(country="World") + tb = concat([tb_regions, tb_world], ignore_index=True, short_name=short_name) + + return tb + + +def add_count_years_in_regime( + tb: Table, + columns: List[Tuple[str, str, int]], + na_is_zero: bool = False, +) -> Table: + """Add years in a certain regime. + + Two types of counters are generated: + - Age: Number of years consecutively with a certain regime type. + - Experience: Number of years with a certain regime type. + + columns: List of tuples with 3 elements: (colname, col_newname, threshold). + na_is_zero: NaN values (i.e. can't be classified based on the threshold) are classified into the negative class (i.e. 0). + """ + + def _count_years_in_regime(tb, col, col_new, th, na_is_zero=False): + """Groups are created as (-inf, th] and (th, inf). + + If NaN is found, we assume that the country is not in the regime.
+ """ + col_th = "thresholded" + + tb[col_th] = pd.cut(tb[col], bins=[-float("inf"), th, float("inf")], labels=[0, 1]).astype("Int64") + if na_is_zero: + tb[col_th] = tb[col_th].fillna(0) + + # Add age of democracy (consecutive years; the counter restarts whenever the thresholded value is 0 or missing) + tb[f"age_{col_new}"] = tb.groupby(["country", tb[col_th].fillna(0).eq(0).cumsum()])[col_th].cumsum().astype(int) + tb[f"age_{col_new}"] = tb[f"age_{col_new}"].copy_metadata(tb[col]) + # Add experience with democracy (cumulative years; metadata is copied onto the experience counter itself, not the age counter) + tb[f"experience_{col_new}"] = tb.groupby("country")[col_th].cumsum().astype(int) + tb[f"experience_{col_new}"] = tb[f"experience_{col_new}"].copy_metadata(tb[col]) + # Sanity check + assert (tb.loc[tb[col_th] == 1, f"age_{col_new}"] != 0).all(), "Negative age found!" + assert (tb.loc[tb[col_th] == 1, f"experience_{col_new}"] != 0).all(), "Negative age found!" + # Drop unused columns + tb = tb.drop(columns=[col_th]) + return tb + + if columns: + for col in columns: + assert len(col) == 3, "Columns should be a list of tuples with 3 elements: (colname, col_newname, col_th)" + tb = _count_years_in_regime(tb, *col, na_is_zero=na_is_zero) + return tb + + +def add_age_groups( + tb: Table, + column: str, + column_raw: str, + threshold: int, + category_names: Dict[Any, str], + age_bins: List[int | float] | None = None, +) -> Table: + """Create category for `column`.""" + column_new = f"group_{column}" + + if age_bins is None: + age_bins = [0, 18, 30, 60, 90, float("inf")] + + # Create age group labels + assert len(age_bins) > 1, "There should be at least two age groups."
+ labels = [] + for i in range(len(age_bins) - 1): + labels.append(f"{age_bins[i]+1}-{age_bins[i+1]} years".replace("-inf", "+")) + + # Create variable for age group of electoral democracies + tb[column_new] = pd.cut( + tb[column], + bins=age_bins, + labels=labels, + ).astype("string") + + # Add additional categories + for regime_id, regime_name in category_names.items(): + if regime_id > threshold: + break + tb.loc[(tb[column_raw] == regime_id) & tb[column_new].isna(), column_new] = regime_name + + # Copy metadata + tb[column_new] = tb[column_new].copy_metadata(tb[column]) + return tb + + +def add_imputes( + tb: Table, + path: Path, + cols_verify: List[str] | None = None, + col_flag_imputed: str | None = None, + verify_integrity: bool = True, +) -> Table: + """Add imputed values to the table. + + Imputed values are inferred from historical equivalents. + + Example: Was "Eritrea" a democracy in 1993? + + - We can infer this from "Ethiopia (former)" (historical equivalent). You can see all these mappings in the bmr.countries_impute.yml file. + + - This is useful to (i) be able to colour these world regions in grapher map charts, and (ii) to be able to count the number of people living in democracy (in `make_tables_population_counters`). + + - Note that these "imputed country values" are ignored when estimating the number of countries in democracies (function `make_tables_country_counters`), since these countries did not exist at the time!
+ """ + tb_ = tb.copy() + + if col_flag_imputed is None: + col_flag_imputed = "values_imputed" + + if cols_verify is None: + cols_verify = ["country", "year"] + + # Load impute data + countries_impute = yaml.safe_load(path.read_text()) + + # Drop known values that are not correct + + tb_imputed = [] + for impute in countries_impute: + # Get relevant rows + tb_imp_ = tb_.loc[ + (tb_["country"] == impute["country_impute"]) + & (tb_["year"] >= impute.get("year_min", 99999)) + & (tb_["year"] <= impute.get("year_max", -99999)) + ].copy() + # Sanity checks + assert tb_imp_.shape[0] > 0, f"No data found for {impute['country_impute']}" + assert tb_imp_["year"].max() == impute["year_max"], f"Missing years (max check) for {impute['country_impute']}" + assert (a := tb_imp_["year"].min()) == ( + b := impute["year_min"] + ), f"Missing years (min check) for {impute['country']} imputed from {impute['country_impute']}: {a} != {b}" + + # Tweak them + # tb_ = tb_.rename( + # columns={ + # "country": "regime_imputed_country", + # } + # ) + tb_imp_[col_flag_imputed] = True + + # Different behaviour depending whether we have a list of countries or a single country to impute + if isinstance(impute["country"], list): + for country in impute["country"]: + tb_imp_["country"] = country + tb_imputed.append(tb_imp_.copy()) + else: + tb_imp_["country"] = impute["country"] + tb_imputed.append(tb_imp_) + + tb_ = concat(tb_imputed + [tb_], ignore_index=True) + + # Set to False by default (for non-imputed countries) + tb_[col_flag_imputed] = tb_[col_flag_imputed].fillna(False).astype(bool) + + # Re-order columns + # cols = [ + # "country", + # "year", + # "regime", + # "regime_womsuffr", + # "regime_imputed_country", + # "regime_imputed", + # ] + # tb_ = cast(Table, tb_[cols]) + + # Verify that there are no duplicates + if verify_integrity: + tb_ = tb_.set_index(cols_verify, verify_integrity=True).sort_index().reset_index() + return tb_ From 84a8d2732d27e9bde88645f9dab86e2a17b340ce Mon Sep 17 
00:00:00 2001 From: lucasrodes Date: Wed, 5 Mar 2025 23:07:40 +0100 Subject: [PATCH 09/10] update dag --- dag/archive/democracy.yml | 13 +++++++++++++ dag/democracy.yml | 19 ++++--------------- 2 files changed, 17 insertions(+), 15 deletions(-) create mode 100644 dag/archive/democracy.yml diff --git a/dag/archive/democracy.yml b/dag/archive/democracy.yml new file mode 100644 index 00000000000..f23f60af335 --- /dev/null +++ b/dag/archive/democracy.yml @@ -0,0 +1,13 @@ +steps: + # EIU (2024) + data://meadow/democracy/2024-05-22/eiu: + - snapshot://democracy/2024-05-22/eiu_gapminder.csv + - snapshot://democracy/2024-05-22/eiu_2021.csv + - snapshot://democracy/2024-05-22/eiu_2022.csv + - snapshot://democracy/2024-05-22/eiu_2023.csv + data://garden/democracy/2024-03-07/eiu: + - data://meadow/democracy/2024-05-22/eiu + - data://garden/regions/2023-01-01/regions + - data://garden/demography/2023-03-31/population + data://grapher/democracy/2024-05-22/eiu: + - data://garden/democracy/2024-03-07/eiu diff --git a/dag/democracy.yml b/dag/democracy.yml index f5a97e6fa76..0ab96352f92 100644 --- a/dag/democracy.yml +++ b/dag/democracy.yml @@ -76,27 +76,16 @@ steps: data://grapher/democracy/2024-05-22/claassen_satisfaction: - data://garden/democracy/2024-03-07/claassen_satisfaction - # EIU (2024) - data://meadow/democracy/2024-05-22/eiu: - - snapshot://democracy/2024-05-22/eiu_gapminder.csv - - snapshot://democracy/2024-05-22/eiu_2021.csv - - snapshot://democracy/2024-05-22/eiu_2022.csv - - snapshot://democracy/2024-05-22/eiu_2023.csv - data://garden/democracy/2024-03-07/eiu: - - data://meadow/democracy/2024-05-22/eiu - - data://garden/regions/2023-01-01/regions - - data://garden/demography/2023-03-31/population - data://grapher/democracy/2024-05-22/eiu: - - data://garden/democracy/2024-03-07/eiu - # EIU (2024) + # EIU (2025) data://meadow/democracy/2025-03-05/eiu: - snapshot://democracy/2024-05-22/eiu_gapminder.csv - - snapshot://democracy/2024-05-22/eiu_2023.csv - 
snapshot://democracy/2024-05-22/eiu_2021.csv - snapshot://democracy/2024-05-22/eiu_2022.csv + - snapshot://democracy/2024-05-22/eiu_2023.csv + - snapshot://democracy/2025-03-05/eiu_2024.csv data://garden/democracy/2025-03-05/eiu: - - data://garden/demography/2024-07-15/population - data://meadow/democracy/2025-03-05/eiu + - data://garden/demography/2024-07-15/population - data://garden/regions/2023-01-01/regions data://grapher/democracy/2025-03-05/eiu: - data://garden/democracy/2025-03-05/eiu From 967353c6ddfb331e5c8a4e09545f4c2037afdcbf Mon Sep 17 00:00:00 2001 From: lucasrodes Date: Thu, 6 Mar 2025 00:06:15 +0100 Subject: [PATCH 10/10] ignore only interpolated years --- etl/steps/data/garden/democracy/2025-03-05/eiu.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/etl/steps/data/garden/democracy/2025-03-05/eiu.py b/etl/steps/data/garden/democracy/2025-03-05/eiu.py index c919bd94e6f..883cfe978a2 100644 --- a/etl/steps/data/garden/democracy/2025-03-05/eiu.py +++ b/etl/steps/data/garden/democracy/2025-03-05/eiu.py @@ -31,6 +31,9 @@ # Year range YEAR_MIN = 2006 YEAR_MAX = 2025 +# Years with interpolated values +## Gapminder interpolates 2007 and 2009. We rather drop datapoints for these years. +YEAR_INTERPOLATED = [2007, 2009] def run() -> None: @@ -54,7 +57,7 @@ def run() -> None: ) # Remove years with interpolated data (2007 and 2009 are interpolated by Gapminder) - tb = tb.loc[~tb["year"].isin([2007, 2009])] + tb = tb.loc[~tb["year"].isin(YEAR_INTERPOLATED)] # Drop rank column tb = tb.drop(columns=["rank_eiu"]) @@ -292,6 +295,6 @@ def expand_observations_without_duplicates(tb: Table, ds_regions: Dataset) -> Ta tb_exp = expand_observations(tb, countries) # Limit years - tb_exp = tb_exp.loc[tb_exp["year"].isin(range(YEAR_MIN, YEAR_MAX + 1, 2))] + tb_exp = tb_exp.loc[~tb_exp["year"].isin(YEAR_INTERPOLATED)] return tb_exp