Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor(data): folder structure #19

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file removed data-preparation/data/_raw.7z
Binary file not shown.
Empty file.
Empty file.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,23 +1,40 @@
import sys

sys.path.append("../../")
from src.sdp_data.transformation.demographic.population import GapMinderPerZoneAndCountryProcessor, PopulationPerZoneAndCountryProcessor
from src.sdp_data.transformation.demographic.population import StatisticsPerCapitaJoiner
from src.sdp_data.transformation.co2_consumption_based_accounting import EoraCo2TradePerZoneAndCountryProcessor
from src.sdp_data.transformation.footprint_vs_territorial import FootprintVsTerrotorialProcessor
from src.sdp_data.transformation.demographic.worldbank_scrap import WorldBankScrapper
from src.sdp_data.transformation.demographic.gdp import GdpMaddissonPerZoneAndCountryProcessor, GdpWorldBankPerZoneAndCountryProcessor
from src.sdp_data.transformation.eia import EiaConsumptionGasBySectorProcessor, EiaConsumptionOilPerProductProcessor, EiaFinalEnergyConsumptionProcessor, EiaFinalEnergyPerSectorPerEnergyProcessor, EiaElectricityGenerationByEnergyProcessor, EiaConsumptionOilsPerSectorProcessor, EiaFinalEnergyConsumptionPerSectorProcessor
from src.sdp_data.utils.format import StatisticsDataframeFormatter
from src.sdp_data.transformation.ghg.pik import PikCleaner
from src.sdp_data.transformation.ghg.edgar import EdgarCleaner
from src.sdp_data.transformation.ghg.ghg import GhgPikEdgarCombinator, PikUnfcccAnnexesCombinator, EdgarUnfcccAnnexesCombinator, GhgMultiSourcesCombinator
from src.sdp_data.transformation.ghg.unfcc import UnfcccAnnexesCleaner, UnfccProcessor
from src.sdp_data.transformation.ghg.fao import FaoDataProcessor
from src.sdp_data.transformation.ghg.cait import CaitProcessor
import pandas as pd
import os

import pandas as pd
import requests
from pandas import json_normalize
from src.source_aggregations.co2_consumption_based_accounting import \
EoraCo2TradePerZoneAndCountryProcessor
from src.source_aggregations.demographic.gdp import (
GdpMaddissonPerZoneAndCountryProcessor,
GdpWorldBankPerZoneAndCountryProcessor)
from src.source_aggregations.footprint_vs_territorial import \
FootprintVsTerrotorialProcessor
from src.source_aggregations.ghg.cait import CaitProcessor
from src.source_aggregations.ghg.edgar import EdgarCleaner
from src.source_aggregations.ghg.fao import FaoDataProcessor
from src.source_aggregations.ghg.ghg import (EdgarUnfcccAnnexesCombinator,
GhgMultiSourcesCombinator,
GhgPikEdgarCombinator,
PikUnfcccAnnexesCombinator)
from src.source_aggregations.ghg.pik import PikCleaner
from src.source_aggregations.ghg.unfcc import (UnfcccAnnexesCleaner,
UnfccProcessor)
from src.sources.eia.eia import (EiaConsumptionGasBySectorProcessor,
EiaConsumptionOilPerProductProcessor,
EiaConsumptionOilsPerSectorProcessor,
EiaElectricityGenerationByEnergyProcessor,
EiaFinalEnergyConsumptionPerSectorProcessor,
EiaFinalEnergyConsumptionProcessor,
EiaFinalEnergyPerSectorPerEnergyProcessor)
from src.sources.gapminder.population import (
GapMinderPerZoneAndCountryProcessor, PopulationPerZoneAndCountryProcessor,
StatisticsPerCapitaJoiner)
from src.sources.worldbank.worldbank_scrap import WorldBankScrapper
from src.utils.format import StatisticsDataframeFormatter

RAW_DATA_DIR = os.path.join(os.path.dirname(__file__), "../../results/raw_new_data")
RESULTS_DIR = os.path.join(os.path.dirname(__file__), "../../results/new_prod_data")
Expand All @@ -39,7 +56,7 @@ def process_population_data(self, df_country):
df_population_raw = WorldBankScrapper().run("population")
df_population = PopulationPerZoneAndCountryProcessor().run(df_population_raw, df_country)
df_population.to_csv(f"{RESULTS_DIR}/DEMOGRAPHIC_POPULATION_WORLDBANK_prod.csv", index=False)

# update GapMinder data (source GapMinder)
df_population_gapmidner_raw = pd.read_excel(f"{RAW_DATA_DIR}/population/GM-Population - Dataset - v7.xlsx", sheet_name="data-pop-gmv6-in-columns")
df_gapminder = GapMinderPerZoneAndCountryProcessor().run(df_population_gapmidner_raw, df_country)
Expand All @@ -56,23 +73,23 @@ def process_footprint_vs_territorial_data(self, df_country, df_population):

df_footprint_vs_territorial_per_capita = StatisticsPerCapitaJoiner().run_footprint_vs_territorial_per_capita(df_footprint_vs_territorial, df_population)
df_footprint_vs_territorial_per_capita.to_csv(f"{RESULTS_DIR}/CO2_CBA_PER_CAPITA_eora_cba_zones_per_capita_prod.csv", index=False)

def process_iea_data(self, df_country):

# gas products
df_gas_cons_by_sector = EiaConsumptionGasBySectorProcessor().prepare_data(df_country)
df_gas_cons_by_sector.to_csv(f"{RESULTS_DIR}/FINAL_CONS_GAS_BY_SECTOR_prod.csv", index=False)
df_original = pd.read_csv(f"{CURRENT_DATA_DIR}/final_cons_gas_by_sector_prod.csv", sep=',')
df_original = StatisticsDataframeFormatter.select_and_sort_values(df_original, "final_energy", round_statistics=4)
df_original.to_csv(f"{CURRENT_PROD_DATA}/FINAL_CONS_GAS_BY_SECTOR_prod.csv", index=False)

# oil products
# oil products
df_oil_cons_per_product = EiaConsumptionOilPerProductProcessor().prepare_data(df_country)
df_oil_cons_per_product.to_csv(f"{RESULTS_DIR}/FINAL_CONS_OIL_BY_PRODUCT_prod.csv", index=False)
df_original = pd.read_csv(f"{CURRENT_DATA_DIR}/final_cons_oil_products_by_product.csv", sep=',')
df_original = StatisticsDataframeFormatter.select_and_sort_values(df_original, "final_energy", round_statistics=4)
df_original.to_csv(f"{CURRENT_PROD_DATA}/FINAL_CONS_OIL_BY_PRODUCT_prod.csv", index=False)

df_oil_cons_per_sector = EiaConsumptionOilsPerSectorProcessor().prepare_data(df_country)
df_oil_cons_per_sector.to_csv(f"{RESULTS_DIR}/FINAL_CONS_OIL_BY_SECTOR_prod.csv", index=False)
df_original = pd.read_csv(f"{CURRENT_DATA_DIR}/final_cons_oil_products_by_sector_prod.csv", sep=',')
Expand Down Expand Up @@ -164,7 +181,7 @@ def process_ghg_data(self, df_country):
df_unfccc_annex_1 = pd.read_excel(os.path.join(os.path.dirname(__file__), "../../data/thibaud/ghg/" + "unfccc_annex1.xlsx"))
df_unfccc_annex_2 = pd.read_excel(os.path.join(os.path.dirname(__file__), "../../data/thibaud/ghg/" + "unfccc_annex2.xlsx"))
df_unfccc_annex_clean = UnfcccAnnexesCleaner().run(df_unfccc_annex_1, df_unfccc_annex_2)

# combine PIK and UNFCCC annexes data
df_pik_unfccc_annexes = PikUnfcccAnnexesCombinator().run(df_pik_cleaned, df_unfccc_annex_clean)
df_pik_unfccc_annexes.to_csv(f"{RESULTS_DIR}/GHG_PIK_UNFCCC_prod.csv", index=False)
Expand Down
File renamed without changes.
Binary file removed data-preparation/src/sdp_data/data/_raw.7z
Binary file not shown.
10 changes: 0 additions & 10 deletions data-preparation/src/sdp_data/sources/raw_owid_TODO.py

This file was deleted.

Empty file.
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd
from src.sdp_data.utils.translation import CountryTranslatorFrenchToEnglish
from src.sdp_data.utils.iso3166 import countries_by_alpha3
from src.utils.iso3166 import countries_by_alpha3
from src.utils.translation import CountryTranslatorFrenchToEnglish


class EoraCo2TradePerZoneAndCountryProcessor:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pandas as pd
from src.sdp_data.utils.translation import CountryTranslatorFrenchToEnglish
from src.sdp_data.transformation.demographic.countries import StatisticsPerCountriesAndZonesJoiner
from src.sdp_data.utils.iso3166 import countries_by_alpha3
from src.source_aggregations.demographic.countries import \
StatisticsPerCountriesAndZonesJoiner
from src.utils.iso3166 import countries_by_alpha3
from src.utils.translation import CountryTranslatorFrenchToEnglish


class GdpMaddissonPerZoneAndCountryProcessor:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,9 @@
Footprint versus territorial emissions
"""
import pandas as pd
from src.sdp_data.utils.translation import CountryTranslatorFrenchToEnglish
from src.sdp_data.transformation.demographic.countries import StatisticsPerCountriesAndZonesJoiner
from src.source_aggregations.demographic.countries import \
StatisticsPerCountriesAndZonesJoiner
from src.utils.translation import CountryTranslatorFrenchToEnglish


class EoraCbaPerZoneAndCountryProcessor:
Expand Down Expand Up @@ -74,7 +75,7 @@ def run(self, df_eora_cba: pd.DataFrame, df_country: pd.DataFrame):
# filter on sectors of interest
list_scope_to_filter = ["Territorial Emissions", "CO2 Footprint"]
df_eora_cba = df_eora_cba[df_eora_cba["scope"].isin(list_scope_to_filter)]

# join with countries
list_cols_group_by = ['group_type', 'group_name', 'year', 'scope', 'co2_unit', 'source']
dict_agg = {'co2': "sum"}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas as pd
from sdp_data.utils.translation import CountryTranslatorFrenchToEnglish
from src.utils.translation import CountryTranslatorFrenchToEnglish


class CaitProcessor:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pandas as pd
import numpy as np
from src.sdp_data.utils.translation import CountryTranslatorFrenchToEnglish, SectorTranslator
from src.sdp_data.utils.format import StatisticsDataframeFormatter
import pandas as pd
from src.utils.format import StatisticsDataframeFormatter
from src.utils.translation import (CountryTranslatorFrenchToEnglish,
SectorTranslator)


class EdgarCleaner:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import pandas as pd
from sdp_data.utils.translation import CountryTranslatorFrenchToEnglish
from sdp_data.transformation.demographic.countries import StatisticsPerCountriesAndZonesJoiner
from sdp_data.utils.format import StatisticsDataframeFormatter
from src.source_aggregations.demographic.countries import \
StatisticsPerCountriesAndZonesJoiner
from src.utils.format import StatisticsDataframeFormatter
from src.utils.translation import CountryTranslatorFrenchToEnglish


class FaoDataProcessor:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import pandas as pd
from src.sdp_data.transformation.demographic.countries import (
StatisticsPerCountriesAndZonesJoiner,
)
import numpy as np
from src.sdp_data.utils.format import StatisticsDataframeFormatter
import pandas as pd
from src.source_aggregations.demographic.countries import \
StatisticsPerCountriesAndZonesJoiner
from src.utils.format import StatisticsDataframeFormatter


class GhgPikEdgarCombinator:
Expand Down Expand Up @@ -72,7 +71,7 @@ def compute_pik_edgar_energy_ratio(self, df_pik_clean, df_edgar_clean):
"ghg_pik", "ghg_edgar"])

# concatenate with EDGAR and PIK transport and energy
df_edgar_transport_energy = df_edgar_clean[df_edgar_clean["sector"].isin(["Transport", "Electricity & Heat", "Other Energy"])]
df_edgar_transport_energy = df_edgar_clean[df_edgar_clean["sector"].isin(["Transport", "Electricity & Heat", "Other Energy"])]
df_pik_edgar_diff_industry_transport = pd.concat([df_pik_edgar_diff_industry, df_edgar_transport_energy], axis=0)

# merge on PIK energy
Expand All @@ -92,7 +91,7 @@ def compute_pik_edgar_energy_ratio(self, df_pik_clean, df_edgar_clean):
df_pik_edgar_ratio["ratio"] = df_pik_edgar_ratio["ghg_edgar"] / df_pik_edgar_ratio["ghg_pik"]

return df_pik_edgar_ratio

def compute_pik_edgar_extrapolated_glued(self, df_pik_clean, df_edgar_clean): # TODO - revoir complètement cette méthode. Dette technique monstrueuse...
# compute the energy ratio between PIK and Edgar
print("\n----- Combine PIK and EDGAR extrapolated")
Expand Down Expand Up @@ -199,8 +198,8 @@ def run(self, df_pik_clean, df_edgar_clean, df_fao_clean, df_cait_sector_stacked
df_multi_sources_sum_per_country["group_type"] = "country"
df_multi_sources_sum_per_country = df_multi_sources_sum_per_country.rename(columns={"group_name": "country"})
df_ghg_multi_with_zones = pd.concat([df_multi_sources, df_multi_sources_sum_per_country, df_fao_clean], axis=0)
# group by GAS and merge with CAIT

# group by GAS and merge with CAIT
list_group_by_gas = ["source", "group_type", "group_name", "year", "gas"]
df_ghg_multi_by_gas = df_ghg_multi_with_zones.groupby(list_group_by_gas).agg(ghg=("ghg", "sum"), ghg_unit=("ghg_unit", "first")).reset_index()
df_cait_gas_stacked["source"] = "CAIT"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pandas as pd
from src.sdp_data.utils.translation import CountryTranslatorFrenchToEnglish
from src.sdp_data.utils.format import StatisticsDataframeFormatter
from src.utils.format import StatisticsDataframeFormatter
from src.utils.translation import CountryTranslatorFrenchToEnglish


class PikCleaner:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
from src.sdp_data.utils.translation import CountryTranslatorFrenchToEnglish, SectorTranslator
from src.sdp_data.utils.iso3166 import countries_by_alpha3
from src.utils.iso3166 import countries_by_alpha3
from src.utils.translation import (CountryTranslatorFrenchToEnglish,
SectorTranslator)


class UnfccProcessor:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import pandas as pd
from sdp_data.utils.translation import CountryTranslatorFrenchToEnglish
from sdp_data.transformation.demographic.countries import StatisticsPerCountriesAndZonesJoiner
from src.source_aggregations.demographic.countries import \
StatisticsPerCountriesAndZonesJoiner
from src.utils.translation import CountryTranslatorFrenchToEnglish


class HistoricalCo2PerZoneAndCountryProcessor:
Expand All @@ -18,7 +19,7 @@ def retrieve_urss_countries(self, df_pik_cleaned):
condition_urss = (df_pik_cleaned['country'].isin(self.list_urss_countries) & (df_pik_cleaned['year'] >= 1922) & (df_pik_cleaned['year'] < 1992))
df_pik_cleaned.loc[condition_urss, 'country'] = 'Russian Federation & USSR'
return df_pik_cleaned

@staticmethod
def melt_years(df: pd.DataFrame):
return pd.melt(df, id_vars=["type", "country", "unit"],
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import pandas as pd
import numpy as np
import sys

import numpy as np
import pandas as pd

sys.path.insert(0, r'C:\Users\HP\Desktop\shiftdataportal_data')
from src.sdp_data.utils.translation import CountryTranslatorFrenchToEnglish
from src.utils.translation import CountryTranslatorFrenchToEnglish


class CoalReservesConsolidatedProdGenerator:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import pandas as pd
from src.countries.multi_selection_country_groups import \
from src.sources.countries.multi_selection_country_groups import \
process_multi_selection_country_groups

df = pd.read_csv(
"src/countries/data/multiselect_groups.csv",
"src/sources/countries/data/multiselect_groups.csv",
sep=",",
)
multi_selection_country_groups = process_multi_selection_country_groups(df)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
from src.countries.new_country_group_member import add_new_members_to_group
from src.sources.countries.new_country_group_member import \
add_new_members_to_group


def process_multi_selection_country_groups(raw_multi_selection_country_groups: pd.DataFrame) -> pd.DataFrame:
Expand Down
Loading