From 5fc2424adeec6f10afce22d744103521ff98f35a Mon Sep 17 00:00:00 2001
From: Pablo Rosado
Date: Tue, 4 Mar 2025 17:44:24 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=93=8A=20survey:=20Update=20dietary=20cho?=
 =?UTF-8?q?ices=20of=20Brits=20data=20(#4069)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 📊 survey: Update dietary choices of Brits data

* Add snapshot and data steps

* Archive unused steps
---
 dag/archive/main.yml                          |  1 +
 dag/archive/survey.yml                        | 10 ++
 dag/survey.yml                                | 22 +++--
 .../2025-03-04/dietary_choices_uk.meta.yml    | 96 +++++++++++++++++++
 .../survey/2025-03-04/dietary_choices_uk.py   | 80 ++++++++++++++++
 .../survey/2025-03-04/dietary_choices_uk.py   | 57 +++++++++++
 .../survey/2025-03-04/dietary_choices_uk.py   | 37 ++++++++
 .../survey/2025-03-04/dietary_choices_uk.py   | 29 ++++++
 .../2025-03-04/dietary_choices_uk.xlsx.dvc    | 25 +++++
 9 files changed, 347 insertions(+), 10 deletions(-)
 create mode 100644 dag/archive/survey.yml
 create mode 100644 etl/steps/data/garden/survey/2025-03-04/dietary_choices_uk.meta.yml
 create mode 100644 etl/steps/data/garden/survey/2025-03-04/dietary_choices_uk.py
 create mode 100644 etl/steps/data/grapher/survey/2025-03-04/dietary_choices_uk.py
 create mode 100644 etl/steps/data/meadow/survey/2025-03-04/dietary_choices_uk.py
 create mode 100644 snapshots/survey/2025-03-04/dietary_choices_uk.py
 create mode 100644 snapshots/survey/2025-03-04/dietary_choices_uk.xlsx.dvc

diff --git a/dag/archive/main.yml b/dag/archive/main.yml
index 30108c1f50d..ea1371650e7 100644
--- a/dag/archive/main.yml
+++ b/dag/archive/main.yml
@@ -423,3 +423,4 @@ include:
   - dag/archive/education.yml
   - dag/archive/biodiversity.yml
   - dag/archive/space.yml
+  - dag/archive/survey.yml
diff --git a/dag/archive/survey.yml b/dag/archive/survey.yml
new file mode 100644
index 00000000000..4150364515d
--- /dev/null
+++ b/dag/archive/survey.yml
@@ -0,0 +1,10 @@
+steps:
+  #
+  # YouGov - Dietary choices of Brits.
+  #
+  data://meadow/survey/2024-04-01/dietary_choices_uk:
+    - snapshot://survey/2024-04-01/dietary_choices_uk.xlsx
+  data://garden/survey/2024-04-01/dietary_choices_uk:
+    - data://meadow/survey/2024-04-01/dietary_choices_uk
+  data://grapher/survey/2024-04-01/dietary_choices_uk:
+    - data://garden/survey/2024-04-01/dietary_choices_uk
diff --git a/dag/survey.yml b/dag/survey.yml
index 6113cbc72c4..b57678e7fbf 100644
--- a/dag/survey.yml
+++ b/dag/survey.yml
@@ -2,13 +2,15 @@ steps:
   #
   # YouGov - Dietary choices of Brits.
   #
-  data://meadow/survey/2024-04-01/dietary_choices_uk:
-    - snapshot://survey/2024-04-01/dietary_choices_uk.xlsx
-  data://garden/survey/2024-04-01/dietary_choices_uk:
-    - data://meadow/survey/2024-04-01/dietary_choices_uk
-  data://grapher/survey/2024-04-01/dietary_choices_uk:
-    - data://garden/survey/2024-04-01/dietary_choices_uk
-
-  ######################################################################################################################
-  # Older versions to be archived once they are not used by any other steps.
-  ######################################################################################################################
+  data://meadow/survey/2025-03-04/dietary_choices_uk:
+    - snapshot://survey/2025-03-04/dietary_choices_uk.xlsx
+  #
+  # YouGov - Dietary choices of Brits.
+  #
+  data://garden/survey/2025-03-04/dietary_choices_uk:
+    - data://meadow/survey/2025-03-04/dietary_choices_uk
+  #
+  # YouGov - Dietary choices of Brits.
+  #
+  data://grapher/survey/2025-03-04/dietary_choices_uk:
+    - data://garden/survey/2025-03-04/dietary_choices_uk
diff --git a/etl/steps/data/garden/survey/2025-03-04/dietary_choices_uk.meta.yml b/etl/steps/data/garden/survey/2025-03-04/dietary_choices_uk.meta.yml
new file mode 100644
index 00000000000..562673cfe12
--- /dev/null
+++ b/etl/steps/data/garden/survey/2025-03-04/dietary_choices_uk.meta.yml
@@ -0,0 +1,96 @@
+definitions:
+  common:
+    processing_level: minor
+    presentation:
+      attribution_short: YouGov
+      topic_tags:
+        - Diet Compositions
+        - Animal Welfare
+        - Food Supply
+    display: &common-display
+      numDecimalPlaces: 0
+
+dataset:
+  update_period_days: 365
+
+tables:
+  dietary_choices_uk:
+    variables:
+      base:
+        title: Weighted number of responses
+        description_short: |-
+          Number of responses, after applying weights, of a particular group (e.g. "18-24" or "Female") and date.
+        unit: ""
+        short_unit: ""
+      base_unweighted:
+        title: Unweighted number of responses
+        description_short: |-
+          Number of responses, before applying weights, of a particular group (e.g. "18-24" or "Female") and date.
+        unit: ""
+        short_unit: ""
+      meat_eater:
+        title: Percentage of meat eaters
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Meat eater: eat meat and/or poultry".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Meat eater
+          <<: *common-display
+        presentation:
+          title_public: Percentage of meat eaters
+      flexitarian:
+        title: Percentage of flexitarians
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Flexitarian: mainly vegetarian, but occasionally eat meat or fish".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Flexitarian
+          <<: *common-display
+        presentation:
+          title_public: Percentage of flexitarians
+      pescetarian:
+        title: Percentage of pescetarians
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Pescetarian: eat fish but do not eat meat or poultry".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Pescetarian
+          <<: *common-display
+        presentation:
+          title_public: Percentage of pescetarians
+      vegetarian:
+        title: Percentage of vegetarians
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Vegetarian: do not eat any meat, poultry, game, fish, or shellfish".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Vegetarian
+          <<: *common-display
+        presentation:
+          title_public: Percentage of vegetarians
+      vegan:
+        title: Percentage of vegans
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "Plant-based / Vegan: do not eat dairy products, eggs, or any other animal product".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: Vegan
+          <<: *common-display
+        presentation:
+          title_public: Percentage of vegans
+      none:
+        title: Percentage of people with other diets
+        description_short: |-
+          Percentage of participants who responded to the question "Which of these best describes your diet?" with "None of these".
+        unit: "%"
+        short_unit: "%"
+        display:
+          name: None of these
+          <<: *common-display
+        presentation:
+          title_public: Percentage of people with other diets
diff --git a/etl/steps/data/garden/survey/2025-03-04/dietary_choices_uk.py b/etl/steps/data/garden/survey/2025-03-04/dietary_choices_uk.py
new file mode 100644
index 00000000000..6ab1c00588b
--- /dev/null
+++ b/etl/steps/data/garden/survey/2025-03-04/dietary_choices_uk.py
@@ -0,0 +1,80 @@
+"""Load a meadow dataset and create a garden dataset."""
+
+from owid.catalog import Table
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+COLUMNS = {
+    "Base": "base",
+    "Unweighted base": "base_unweighted",
+    "Flexitarian (mainly vegetarian, but occasionally eat meat or fish)": "flexitarian",
+    "Meat eater (eat meat and/or poultry)": "meat_eater",
+    "None of these": "none",
+    "Pescetarian (eat fish but do not eat meat or poultry)": "pescetarian",
+    "Plant-based / Vegan (do not eat dairy products, eggs, or any other animal product)": "vegan",
+    "Vegetarian (do not eat any meat, poultry, game, fish or shellfish)": "vegetarian",
+}
+
+
+def run_sanity_checks(tb: Table) -> None:
+    """Sanity-check percentages, non-negativity, and weighted vs. unweighted bases."""
+    error = "Percentages do not add up to 100% for some of the surveyed dates (within 2.5%)."
+    assert (abs(tb.drop(columns=["base", "base_unweighted"]).sum(axis=1) - 100) <= 2.5).all(), error
+
+    error = "Negative values found in the table."
+    assert (tb >= 0).all().all(), error
+
+    error = "Base and unweighted base, on a given date, should add up to the same number (or at least within 1%)."
+    _tb = tb.groupby(["date"]).agg({"base": "sum", "base_unweighted": "sum"})
+    assert ((100 * abs(_tb["base"] - _tb["base_unweighted"]) / _tb["base_unweighted"]) < 1).all(), error
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load meadow dataset and read its main table.
+    ds_meadow = paths.load_dataset("dietary_choices_uk")
+    tb = ds_meadow.read("dietary_choices_uk")
+
+    #
+    # Process data.
+    #
+    # Rename diet column name for convenience.
+    tb = tb.rename(columns={"which_of_these_best_describes_your_diet": "diet"}, errors="raise")
+
+    # Rename diets.
+    tb["diet"] = tb["diet"].map(COLUMNS)
+
+    # Transform the table to long format.
+    tb = tb.melt(id_vars=["diet", "group"], var_name="date", value_name="value")
+
+    # Format date column.
+    tb["date"] = tb["date"].str[1:].str.replace("_", "-")
+
+    # Transform the table to wide format.
+    tb = tb.pivot(index=["group", "date"], columns="diet", values="value", join_column_levels_with="_")
+
+    # Convert fractions into percentages.
+    tb[tb.drop(columns=["group", "date", "base", "base_unweighted"]).columns] *= 100
+
+    # Ensure columns have the right type.
+    tb = tb.astype({"base": int, "base_unweighted": int})
+
+    # Improve table format.
+    tb = tb.format(keys=["group", "date"], sort_columns=True)
+
+    # Sanity checks on outputs.
+    run_sanity_checks(tb=tb)
+
+    #
+    # Save outputs.
+    #
+    # Initialize a new garden dataset.
+    ds_garden = create_dataset(dest_dir, tables=[tb])
+
+    # Save garden dataset.
+    ds_garden.save()
diff --git a/etl/steps/data/grapher/survey/2025-03-04/dietary_choices_uk.py b/etl/steps/data/grapher/survey/2025-03-04/dietary_choices_uk.py
new file mode 100644
index 00000000000..acec47be0c6
--- /dev/null
+++ b/etl/steps/data/grapher/survey/2025-03-04/dietary_choices_uk.py
@@ -0,0 +1,57 @@
+"""Load a garden dataset and create a grapher dataset."""
+
+import pandas as pd
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+# Groups to select from the survey data.
+SELECTED_GROUPS = ["All adults", "18-24", "25-49", "50-64", "65+"]
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load garden dataset and read its main table.
+    ds_garden = paths.load_dataset("dietary_choices_uk")
+    tb = ds_garden.read("dietary_choices_uk", safe_types=False)
+
+    #
+    # Process data.
+    #
+    # Adapt table format to grapher requirements.
+    tb = tb.rename(columns={"group": "country", "date": "year"}, errors="raise").drop(
+        columns=["base", "base_unweighted"], errors="raise"
+    )
+
+    # Select only the groups that are going to be displayed in grapher.
+    tb = tb[tb["country"].isin(SELECTED_GROUPS)].reset_index(drop=True)
+
+    # Sanity check.
+    error = "A survey group may have been renamed."
+    assert set(tb["country"]) == set(SELECTED_GROUPS), error
+
+    # Prepare display metadata.
+    date_earliest = tb["year"].astype(str).min()
+    for column in tb.drop(columns=["country", "year"]).columns:
+        tb[column].metadata.display["yearIsDay"] = True
+        tb[column].metadata.display["zeroDay"] = date_earliest
+
+    # Convert year column into a number of days since the earliest date in the table.
+    tb["year"] = tb["year"].astype("datetime64")
+    tb["year"] = (tb["year"] - pd.to_datetime(date_earliest)).dt.days
+
+    # Improve table format.
+    tb = tb.format()
+
+    #
+    # Save outputs.
+    #
+    # Initialize a new grapher dataset.
+    ds_grapher = create_dataset(dest_dir, tables=[tb])
+
+    # Save grapher dataset.
+    ds_grapher.save()
diff --git a/etl/steps/data/meadow/survey/2025-03-04/dietary_choices_uk.py b/etl/steps/data/meadow/survey/2025-03-04/dietary_choices_uk.py
new file mode 100644
index 00000000000..1243ad42a20
--- /dev/null
+++ b/etl/steps/data/meadow/survey/2025-03-04/dietary_choices_uk.py
@@ -0,0 +1,37 @@
+"""Load a snapshot and create a meadow dataset."""
+
+import owid.catalog.processing as pr
+
+from etl.helpers import PathFinder, create_dataset
+
+# Get paths and naming conventions for current step.
+paths = PathFinder(__file__)
+
+
+def run(dest_dir: str) -> None:
+    #
+    # Load inputs.
+    #
+    # Load snapshot and read its main table.
+    snap = paths.load_snapshot("dietary_choices_uk.xlsx")
+    data = snap.ExcelFile()
+
+    #
+    # Process data.
+    #
+    # Combine all sheets into a single table.
+    tb = pr.concat(
+        [data.parse(sheet_name=sheet_name).assign(**{"group": sheet_name}) for sheet_name in data.sheet_names]
+    )
+
+    # Improve table format.
+    tb = tb.format(["which_of_these_best_describes_your_diet", "group"])
+
+    #
+    # Save outputs.
+    #
+    # Initialize a new meadow dataset.
+    ds_meadow = create_dataset(dest_dir, tables=[tb])
+
+    # Save meadow dataset.
+    ds_meadow.save()
diff --git a/snapshots/survey/2025-03-04/dietary_choices_uk.py b/snapshots/survey/2025-03-04/dietary_choices_uk.py
new file mode 100644
index 00000000000..bbc1bd50354
--- /dev/null
+++ b/snapshots/survey/2025-03-04/dietary_choices_uk.py
@@ -0,0 +1,29 @@
+"""Script to create a snapshot of dataset.
+
+NOTE: The date_published is assumed to be the latest date in the spreadsheet.
+In the future, consider extracting this date programmatically (the name of the latest column in the "All adults" sheet).
+
+"""
+
+from pathlib import Path
+
+import click
+
+from etl.snapshot import Snapshot
+
+# Version for current snapshot dataset.
+SNAPSHOT_VERSION = Path(__file__).parent.name
+
+
+@click.command()
+@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
+def main(upload: bool) -> None:
+    # Initialize a new snapshot.
+    snap = Snapshot(f"survey/{SNAPSHOT_VERSION}/dietary_choices_uk.xlsx")
+
+    # Save snapshot.
+    snap.create_snapshot(upload=upload)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/snapshots/survey/2025-03-04/dietary_choices_uk.xlsx.dvc b/snapshots/survey/2025-03-04/dietary_choices_uk.xlsx.dvc
new file mode 100644
index 00000000000..93aae996b93
--- /dev/null
+++ b/snapshots/survey/2025-03-04/dietary_choices_uk.xlsx.dvc
@@ -0,0 +1,25 @@
+meta:
+  origin:
+    producer: YouGov
+    title: Dietary choices of Brits
+    description: |-
+      This dataset contains the result of YouGov surveys asking the question: "Which of these best describes your diet?". The available responses were:
+      - Meat eater: eat meat and/or poultry.
+      - Flexitarian: mainly vegetarian, but occasionally eat meat or fish.
+      - Pescetarian: eat fish but do not eat meat or poultry.
+      - Vegetarian: do not eat any meat, poultry, game, fish, or shellfish.
+      - Plant-based / Vegan: do not eat dairy products, eggs, or any other animal product.
+      - None of these.
+    citation_full: YouGov (2024) - Dietary choices of Brits (e.g. vegeterian, flexitarian, meat-eater etc)?
+    attribution_short: YouGov
+    url_main: https://yougov.co.uk/topics/society/trackers/dietery-choices-of-brits-eg-vegeterian-flexitarian-meat-eater-etc
+    url_download: https://yougov.co.uk/_pubapis/v5/uk/trackers/dietery-choices-of-brits-eg-vegeterian-flexitarian-meat-eater-etc/download/
+    date_accessed: '2025-03-04'
+    date_published: '2025-01-01'
+    license:
+      name: Copyright © 2024 YouGov PLC
+      url: https://yougov.co.uk/about/terms
+outs:
+  - md5: 94e71e7772be4b5230c58d76c0e46646
+    size: 24693
+    path: dietary_choices_uk.xlsx