diff --git a/README.md b/README.md index 9b99762..72c063f 100644 --- a/README.md +++ b/README.md @@ -18,12 +18,14 @@ The remainder of the charts in the response can be produced from code in the rep - Install packages listed in `requirements.txt` - Activate conda environment: `conda activate asf_welsh_energy_consultation` - Run `make inputs-pull` to pull the zipped static data from S3 and put it in `/inputs` - - Run `python asf_welsh_energy_consultation/analysis/produce_plots.py --local_data_dir `. You need to specify the path to the local directory where your local copy of the EPC data is/will be saved by replacing `` with the path to your "ASF_data" directory or equivalent. - If you don't have a local directory for ASF core data, you can create a folder called "ASF_data" in your home directory. You can specify which - batch of EPC data to download and MCS data to load from S3 by passing the `--epc_batch` and `--mcs_batch` arguments, both - default to downloading/loading the newest data from S3, respectively. Run `python asf_welsh_energy_consultation/analysis/produce_plots.py -h` for more info. + If you don't have a local directory for ASF core data, you can create a folder called "ASF_data" in your home directory. + +- You can specify which batch of EPC data to download and MCS data to load from S3 by passing the `--epc_batch` and `--mcs_batch` arguments, both + default to downloading/loading the newest data from S3, respectively. You can also specify which set of supplementary data should be used by passing + the `--supp_data` argument followed by the name of the directory, e.g. data_202310. See the `Historical analyses` section below to see which version was used for each analysis. + Run `python asf_welsh_energy_consultation/analysis/produce_plots.py -h` for more info. 
The script should generate the following six plots which will be saved in your local repo in `outputs/figures`: @@ -34,6 +36,16 @@ The script should generate the following six plots which will be saved in your l - `new_build_hp_cumulative.html` - `new_build_hp_proportion.html` +It should generate a further 10 plots, five in English and five in Welsh, saved in `outputs/figures/english` and `outputs/figures/welsh`, respectively: + +- `age_prop[_welsh].png` +- `epc_all[_welsh].html` +- `epc_hp_private_retrofit[_welsh].html` +- `epc_hp_private[_welsh].html` +- `hp_tenure[_welsh].html` + +An additional figure, `hp_map.html`, should be saved in `outputs/figures/english`. + ## Skeleton folder structure ``` @@ -57,8 +69,18 @@ outputs/ Versions/batches of data used for previous analysis are listed below. +October 2023 analysis: + +- Supplementary data: data_202310 +- EPC: 2023_Q2_complete (preprocessed) +- mcs_installations_231009.csv +- mcs_installations_epc_full_231009.csv +- off-gas-live-postcodes-2022.xlsx - check [here](https://www.xoserve.com/a-to-z/) for updates +- rurality.ods - 2011 Rural Urban Classification for small area geographies, see [here](https://www.ons.gov.uk/methodology/geography/geographicalproducts/ruralurbanclassifications) + April 2023 analysis: +- Supplementary data: data_202304 - EPC: 2022_Q4_complete (preprocessed) - mcs_installations_230315.csv - mcs_installations_epc_full_230315.csv diff --git a/asf_welsh_energy_consultation/__init__.py b/asf_welsh_energy_consultation/__init__.py index 811053a..1f5e26a 100644 --- a/asf_welsh_energy_consultation/__init__.py +++ b/asf_welsh_energy_consultation/__init__.py @@ -32,4 +32,4 @@ def get_yaml_config(file_path: Path) -> Optional[dict]: # base/global config _base_config_path = Path(__file__).parent.resolve() / "config/base.yaml" -config = get_yaml_config(_base_config_path) +config_file = get_yaml_config(_base_config_path) diff --git a/asf_welsh_energy_consultation/analysis/produce_plots.py 
b/asf_welsh_energy_consultation/analysis/produce_plots.py index f2843d3..20e35e4 100644 --- a/asf_welsh_energy_consultation/analysis/produce_plots.py +++ b/asf_welsh_energy_consultation/analysis/produce_plots.py @@ -4,79 +4,43 @@ """ import altair as alt +import os +from asf_welsh_energy_consultation import config_file from asf_welsh_energy_consultation.getters.get_data import get_electric_tenure -from asf_welsh_energy_consultation.pipeline.process_data import * -from asf_welsh_energy_consultation.utils.formatting import format_number +from asf_welsh_energy_consultation.getters.get_data import ( + load_wales_df, + load_wales_hp, + pc_to_coords_df, +) +from asf_welsh_energy_consultation.pipeline import process_data +from asf_core_data.getters.data_getters import logger +from asf_welsh_energy_consultation.pipeline.plotting import ( + proportions_bar_chart, + age_prop_chart, + time_series_comparison, + plot_kepler_graph, +) from nesta_ds_utils.viz.altair.formatting import setup_theme alt.data_transformers.disable_max_rows() setup_theme() - output_folder = "outputs/figures/" +time_series_min = config_file["plots"]["time_series_min_default"] if not os.path.isdir(output_folder): os.makedirs(output_folder) -def time_series_comparison( - data, - title, - y_var, - y_title, - color_var, - x_var="date:T", - x_title="Date", - domain_min="2015-01-01", - domain_max="2023-01-01", - width=600, - height=300, -): - """Generic function for plotting a line chart by category (represented by color_var). - - Args: - data (pd.DataFrame): Base data. Needs to be structured as a column of consecutive dates, - a column indicating categories and a column with cumulative values. - title (str/list): Chart title. - y_var (str): y variable. - y_title (str): y axis title. - color_var (str): Variable to split by. - x_var (str, optional): x variable. Defaults to "date:T". - x_title (str, optional): x axis title. Defaults to "Date". - domain_min (str, optional): x axis minimum. 
Defaults to "2015-01-01". - domain_max (str, optional): x axis maximum. Defaults to "2023-01-01". - width (int, optional): Chart width. Defaults to 600. - height (int, optional): Chart height. Defaults to 300. - - Returns: - alt.Chart: Base altair chart. - """ - chart = ( - alt.Chart( - data, - title=title, - ) - .mark_line() - .encode( - x=alt.X( - x_var, title=x_title, scale=alt.Scale(domain=[domain_min, domain_max]) - ), - y=alt.Y(y_var, title=y_title), - color=color_var, - ) - .properties(width=width, height=height) - ) - - return chart - - if __name__ == "__main__": # ====================================================== # MCS installations, by off-gas status - installations_by_gas_status = cumsums_by_variable("off_gas", "Gas status") + installations_by_gas_status = process_data.cumsums_by_variable( + "off_gas", "Gas status" + ) installations_by_gas_status_chart = time_series_comparison( data=installations_by_gas_status, @@ -96,7 +60,9 @@ def time_series_comparison( # ====================================================== # MCS installations, by rurality - installations_by_rurality = cumsums_by_variable("rurality_2_label", "Rurality") + installations_by_rurality = process_data.cumsums_by_variable( + "rurality_2_label", "Rurality" + ) installations_by_rurality_chart = time_series_comparison( data=installations_by_rurality, @@ -107,6 +73,7 @@ def time_series_comparison( y_var="Number of heat pumps:Q", y_title="Number of heat pump installations", color_var="Rurality:N", + domain_max=installations_by_rurality.date.max(), ) installations_by_rurality_chart.save( @@ -116,7 +83,7 @@ def time_series_comparison( # ====================================================== # Proportions of new builds that have heat pumps - new_build_hp_proportion = get_new_hp_counts() + new_build_hp_proportion = process_data.get_new_hp_counts() new_build_hp_proportion_chart = ( alt.Chart( @@ -129,7 +96,7 @@ def time_series_comparison( # domain ensures good margin at left/right of chart 
"year", title="Year", - scale=alt.Scale(domain=["2007-07-01", "2022-06-01"]), + scale=alt.Scale(domain=["2007-07-01", "2023-01-01"]), ), y=alt.Y("sum(value)", title="Number of EPCs"), # want heat pumps to be at the bottom of each bar - hacky but works @@ -144,7 +111,7 @@ def time_series_comparison( # ====================================================== # Cumulative number of new builds with heat pumps - new_build_hp_cumulative = get_new_hp_cumsums() + new_build_hp_cumulative = process_data.get_new_hp_cumsums() new_build_hp_cumulative_chart = ( alt.Chart( @@ -167,8 +134,8 @@ def time_series_comparison( # ====================================================== # Cumulative MCS retrofits - ret = get_mcs_retrofits() - ret_cumsums = cumsums_by_variable("country", "wales_col", data=ret) + ret = process_data.get_mcs_retrofits() + ret_cumsums = process_data.cumsums_by_variable("country", "wales_col", data=ret) # this function works without separating by category - 'wales_col' is a whole column of "Wales" (not used) cumulative_retrofits_chart = ( @@ -181,7 +148,7 @@ def time_series_comparison( x=alt.X( "date", title="Date", - scale=alt.Scale(domain=["2015-01-01", "2023-01-01"]), + scale=alt.Scale(domain=[time_series_min, ret_cumsums.date.max()]), ), y="Number of heat pumps", ) @@ -200,7 +167,7 @@ def time_series_comparison( alt.Chart( electric_tenure, title="Fig. 
2: Properties in Wales with only electric heating, split by tenure (N = " - + format_number(N) + + "{:,}".format(N) + ")", ) .mark_bar() @@ -213,3 +180,208 @@ def time_series_comparison( ).configure_title(fontSize=20) electric_tenure_chart.save(output_folder + "electric_tenure.html") + + # ====================================================== + # Original plots and stats + + wales_df = load_wales_df(from_csv=False) + wales_hp = load_wales_hp(wales_df) + + # English plots + + # Key statistics + print("Number of heat pumps:", len(wales_hp)) + print("Number of properties in EPC:", len(wales_df)) + print( + "Estimated percentage of properties with a heat pump:", + "{:.2%}".format(len(wales_hp) / len(wales_df)), + ) + print(wales_hp.TENURE.value_counts(normalize=True)) + + epc_c_or_above_and_good_walls = wales_df.loc[ + wales_df["CURRENT_ENERGY_RATING"].isin(["A", "B", "C"]) + & wales_df["WALLS_ENERGY_EFF"].isin(["Good", "Very Good"]) + ] + + epc_c_or_above_and_good_walls_and_roof = epc_c_or_above_and_good_walls.loc[ + epc_c_or_above_and_good_walls["ROOF_ENERGY_EFF"].isin(["Good", "Very Good"]) + ] + + print( + "Number of EPC C+ properties with good or very good wall insulation:", + len(epc_c_or_above_and_good_walls), + ) + print( + "As a proportion of properties in EPC:", + len(epc_c_or_above_and_good_walls) / len(wales_df), + ) + + print( + "\nNumber of EPC C+ properties with good or very good wall and roof insulation:", + len(epc_c_or_above_and_good_walls_and_roof), + ) + print( + "As a proportion of properties in EPC:", + len(epc_c_or_above_and_good_walls_and_roof) / len(wales_df), + ) + + # Tenure of Welsh HPs + proportions_bar_chart( + wales_hp, + "TENURE", + "Fig. 
3: Tenure of Welsh properties with heat pumps", + "Tenure", + "Percentage of properties", + filename="hp_tenure", + x_type="tenure", + expand_y=True, + ) + + # EPC, all + unknown_vals = len(wales_df.loc[wales_df.CURRENT_ENERGY_RATING == "unknown"]) + if unknown_vals > 0: + logger.warning( + f"{unknown_vals} properties with unknown EPC ratings. These records will be removed from the count." + ) + proportions_bar_chart( + # only one unknown EPC property so fine to just remove it + wales_df.loc[wales_df.CURRENT_ENERGY_RATING != "unknown"], + "CURRENT_ENERGY_RATING", + "Fig. 5: EPC ratings of all Welsh properties", + "Energy efficiency rating", + "Percentage of properties", + filename="epc_all", + x_type="other", + ) + + # EPC, private sector with HPs + proportions_bar_chart( + wales_hp.loc[wales_hp.TENURE.isin(["Owner-occupied", "Privately rented"])], + "CURRENT_ENERGY_RATING", + [ + "Fig. 6: EPC ratings of owner-occupied and privately rented", + "Welsh properties with heat pumps", + ], + "Energy efficiency rating", + "Percentage of properties", + filename="epc_hp_private", + x_type="other", + ) + + # EPCs, private sector with retrofitted HPs + proportions_bar_chart( + wales_hp.loc[ + wales_hp.TENURE.isin(["Owner-occupied", "Privately rented"]) + & (wales_hp.CONSTRUCTION_AGE_BAND != "2007 onwards") + ], + "CURRENT_ENERGY_RATING", + [ + "Fig. 7: EPC ratings of owner-occupied and privately rented", + "Welsh properties with heat pumps, built pre-2007", + ], + "Energy efficiency rating", + "Percentage of properties", + filename="epc_hp_private_retrofit", + x_type="other", + ) + + age_data = process_data.generate_age_data(wales_df) + age_prop_chart( + age_data, "Fig. 
9: Construction age bands and energy efficiencies", "age_prop" + ) + + ## Welsh plots + + welsh_replacements = { + "TENURE": { + "Owner-occupied": "Perchen-feddiannaeth", + "Socially rented": "Rhentu cymdeithasol", + "Privately rented": "Rhentu preifat", + "Unknown": "Anhysbys", + }, + "CONSTRUCTION_AGE_BAND": { + "England and Wales: before 1900": "Cyn 1900", + "Pre-1900": "Cyn 1900", + "2007 onwards": "2007 ymlaen", + "unknown": "Anhysbys", + }, + } + + for df in [wales_df, wales_hp, age_data]: + for col in ["TENURE", "CONSTRUCTION_AGE_BAND"]: + if col in df.columns: + df[col] = df[col].replace(welsh_replacements[col]) + + # Tenure of Welsh HPs + proportions_bar_chart( + wales_hp, + "TENURE", + "Ffig. 4: Deiliadaeth eiddo â phympiau gwres yng Nghymru", + "Deiliadaeth", + "Canran yr eiddo", + filename="hp_tenure_welsh", + x_type="tenure", + expand_y=True, + language="welsh", + ) + + # EPC, all + proportions_bar_chart( + wales_df.loc[wales_df.CURRENT_ENERGY_RATING != "unknown"], + "CURRENT_ENERGY_RATING", + "Ffig. 6: Sgoriau EPC holl eiddo Cymru", + "Sgôr effeithlonrwydd ynni", + "Canran yr eiddo", + filename="epc_all_welsh", + x_type="other", + language="welsh", + ) + + # EPC, private sector with HPs + proportions_bar_chart( + wales_hp.loc[wales_hp.TENURE.isin(["Perchen-feddiannaeth", "Rhentu preifat"])], + "CURRENT_ENERGY_RATING", + [ + "Ffig. 7: Sgoriau EPC eiddo perchen-feddiannaeth a", + "rhentu preifat Cymru sydd â phympiau gwres", + ], + "Sgôr effeithlonrwydd ynni", + "Canran yr eiddo", + filename="epc_hp_private_welsh", + x_type="other", + language="welsh", + ) + + # EPCs, private sector with retrofitted HPs + proportions_bar_chart( + wales_hp.loc[ + wales_hp.TENURE.isin(["Perchen-feddiannaeth", "Rhentu preifat"]) + & (wales_hp.CONSTRUCTION_AGE_BAND != "2007 ymlaen") + ], + "CURRENT_ENERGY_RATING", + [ + "Ffig. 
8: Sgoriau EPC eiddo perchen-feddiannaeth a rhentu prifat", + "Cymru sydd â phympiau gwres, a adeiladwyd cyn 2007", + ], + "Sgôr effeithlonrwydd ynni", + "Canran yr eiddo", + filename="epc_hp_private_retrofit_welsh", + x_type="other", + language="welsh", + ) + + # Ages and EPC ratings + age_prop_chart( + age_data, + "Ffig. 9: Bandiau oedran adeiladu ac effeithlonrwydd ynni", + "age_prop_welsh", + language="welsh", + ) + + # Map of Welsh HPs + wales_df = load_wales_df(from_csv=False) + pc_df = pc_to_coords_df() + + hp_hex_counts = process_data.generate_hex_counts(wales_df, pc_df) + + plot_kepler_graph(hp_hex_counts, "hp_map") diff --git a/asf_welsh_energy_consultation/config/base.yaml b/asf_welsh_energy_consultation/config/base.yaml index 855fb1c..1dff292 100644 --- a/asf_welsh_energy_consultation/config/base.yaml +++ b/asf_welsh_energy_consultation/config/base.yaml @@ -1,3 +1,5 @@ epc_data_config: epc_processing_version: "preprocessed" download_core_data_epc_version: "epc_preprocessed" +plots: + time_series_min_default: "2015-01-01" diff --git a/asf_welsh_energy_consultation/config/translation_config.py b/asf_welsh_energy_consultation/config/translation_config.py new file mode 100644 index 0000000..d2d25ae --- /dev/null +++ b/asf_welsh_energy_consultation/config/translation_config.py @@ -0,0 +1,26 @@ +quality_list = { + "english": ["Very Poor", "Poor", "Average", "Good", "Very Good"], + "welsh": ["Gwael Iawn", "Gwael", "Cymedrig", "Da", "Da Iawn"], +} + +tenure_list = { + "english": ["Owner-occupied", "Socially rented", "Privately rented", "Unknown"], + "welsh": [ + "Perchen-feddiannaeth", + "Rhentu cymdeithasol", + "Rhentu preifat", + "Anhysbys", + ], +} + +energy_efficiency_text = { + "english": "Mean energy efficiency: ", + "welsh": "Effeithlonrwydd ynni cymedrig: ", +} + +housing_stock_text = { + "english": "Percentage of Welsh housing stock", + "welsh": "Canran stoc tai Cymru", +} + +age_band_text = {"english": "Age band", "welsh": "Band oedran"} diff --git 
a/asf_welsh_energy_consultation/getters/get_data.py b/asf_welsh_energy_consultation/getters/get_data.py index a7b77d4..f35f900 100644 --- a/asf_welsh_energy_consultation/getters/get_data.py +++ b/asf_welsh_energy_consultation/getters/get_data.py @@ -4,15 +4,16 @@ """ from asf_welsh_energy_consultation import PROJECT_DIR -from asf_welsh_energy_consultation import config +from asf_welsh_energy_consultation import config_file from asf_core_data import load_preprocessed_epc_data, get_mcs_installations from asf_core_data.getters.mcs_getters.get_mcs_installations import ( get_processed_installations_data_by_batch, ) + from asf_core_data.getters.epc.data_batches import get_batch_path from asf_core_data.config import base_config -from asf_core_data.getters.data_getters import download_core_data, logger +from asf_core_data.getters.data_getters import download_core_data, logger, load_data import pandas as pd import numpy as np @@ -20,18 +21,6 @@ from argparse import ArgumentParser -epc_processing_version = config["epc_data_config"]["epc_processing_version"] -download_core_data_epc_version = config["epc_data_config"][ - "download_core_data_epc_version" -] - -postcode_path = "inputs/data/postcodes" -regions_path = "inputs/data/regions.csv" -off_gas_path = "inputs/data/off-gas-live-postcodes-2022.xlsx" -oa_path = "inputs/data/postcode_to_output_area.csv" -rurality_path = "inputs/data/rurality.ods" -tenure_path = "inputs/data/tenure.csv" - def create_argparser(): """ @@ -51,6 +40,13 @@ def create_argparser(): type=str, ) + parser.add_argument( + "--supp_data", + help="Name of directory where supplementary data is stored", + default="newest", + type=str, + ) + parser.add_argument( "--epc_batch", help='Specifies which EPC data batch to use in the form `YYYY_[Quarter]_complete`. 
Defaults to "newest"', @@ -78,10 +74,28 @@ def get_args(): """ parser = create_argparser() - return parser.parse_args() + args = parser.parse_args() + + if args.supp_data == "newest": + subdirs = [subdir for subdir in os.listdir("inputs")] + args.supp_data = max(subdirs) + + return args arguments = get_args() + +input_data_path = f"inputs/{arguments.supp_data}/" + +wales_epc_path = "wales_epc.csv" + +postcode_path = f"inputs/{arguments.supp_data}/postcodes" +regions_path = f"inputs/{arguments.supp_data}/regions.csv" +off_gas_path = f"inputs/{arguments.supp_data}/off-gas-live-postcodes-2022.xlsx" +oa_path = f"inputs/{arguments.supp_data}/postcode_to_output_area.csv" +rurality_path = f"inputs/{arguments.supp_data}/rurality.ods" +tenure_path = f"inputs/{arguments.supp_data}/tenure.csv" + LOCAL_DATA_DIR = arguments.local_data_dir @@ -232,10 +246,9 @@ def get_rurality(): return oa_rural -def check_local_epc(): +def check_local_epc(epc_processing_version=None, download_core_data_epc_version=None): """ Checks local directory for relevant EPC batch and downloads relevant EPC batch from S3 to local directory if not found. - """ epc_batch = arguments.epc_batch @@ -269,20 +282,23 @@ def check_local_epc(): ) -def get_wales_epc(): +def get_wales_processed_epc(): """Get Welsh EPC data (processed but not deduplicated). Returns: pd.DataFrame: Welsh preprocessed EPC data. """ - check_local_epc() + check_local_epc( + epc_processing_version="preprocessed", + download_core_data_epc_version="epc_preprocessed", + ) epc_batch = arguments.epc_batch wales_epc = load_preprocessed_epc_data( data_path=LOCAL_DATA_DIR, usecols=None, - version=epc_processing_version, + version="preprocessed", subset="Wales", batch=epc_batch, ) @@ -351,3 +367,97 @@ def get_electric_tenure(): ) return data + + +def load_wales_df(from_csv=True): + """Load preprocessed and deduplicated EPC dataset for Wales. + If data is loaded from all-GB file, the filtered version is saved to csv + for easier future loading. 
+ + Args: + from_csv (bool, optional): Whether to load from saved CSV. Defaults to True. + + Returns: + pd.DataFrame: EPC data. + """ + if from_csv: + wales_epc = pd.read_csv(wales_epc_path) + else: + check_local_epc( + epc_processing_version="preprocessed_and_deduplicated", + download_core_data_epc_version="epc_preprocessed_dedupl", + ) + batch = arguments.epc_batch + wales_epc = load_preprocessed_epc_data( + data_path=LOCAL_DATA_DIR, + subset="Wales", + batch=batch, + version="preprocessed_dedupl", + usecols=[ + "LMK_KEY", + "INSPECTION_DATE", + "UPRN", + "POSTCODE", + "CURRENT_ENERGY_EFFICIENCY", + "CURRENT_ENERGY_RATING", + "WALLS_ENERGY_EFF", + "FLOOR_ENERGY_EFF", + "ROOF_ENERGY_EFF", + "CONSTRUCTION_AGE_BAND", + "TENURE", + "TRANSACTION_TYPE", + "HP_INSTALLED", + ], + ) + + wales_epc.TENURE = wales_epc.TENURE.replace( + { + "owner-occupied": "Owner-occupied", + "rental (social)": "Socially rented", + "rental (private)": "Privately rented", + "unknown": "Unknown", + } + ) + # if CONSTRUCTION_AGE_BAND is unknown and TRANSACTION_TYPE is new dwelling, + # assume construction age is >2007 because EPCs started in 2008 + wales_epc["CONSTRUCTION_AGE_BAND"].loc[ + (wales_epc.CONSTRUCTION_AGE_BAND == "unknown") + & (wales_epc.TRANSACTION_TYPE == "new dwelling") + ] = "2007 onwards" + + if not os.path.isdir(input_data_path): + os.makedirs(input_data_path) + + wales_epc.to_csv(input_data_path + wales_epc_path) + + return wales_epc + + +def load_wales_hp(wales_epc): + """Load Welsh EPC data filtered to properties with heat pumps. + + Args: + wales_epc (pd.DataFrame): Wales EPC data. + + Returns: + pd.DataFrame: EPC data filtered to properties with heat pumps. 
+ """ + wales_hp = wales_epc.loc[wales_epc.HP_INSTALLED].reset_index(drop=True) + + return wales_hp + + +def pc_to_coords_df(): + pc_df = load_data( + data_path="S3", + file_path="inputs/supplementary_data/geospatial/ukpostcodes_to_coordindates.csv", + ) + pc_df = pc_df.rename( + columns={ + "postcode": "POSTCODE", + "latitude": "LATITUDE", + "longitude": "LONGITUDE", + } + ) + + return pc_df diff --git a/asf_welsh_energy_consultation/pipeline/plotting.py b/asf_welsh_energy_consultation/pipeline/plotting.py new file mode 100644 index 0000000..4b79be1 --- /dev/null +++ b/asf_welsh_energy_consultation/pipeline/plotting.py @@ -0,0 +1,295 @@ +# File: asf_welsh_energy_consultation/pipeline/plotting.py +""" +Defines plotting functions. +""" + +import pandas as pd +import altair as alt +import matplotlib.pyplot as plt +import matplotlib.ticker as mtick +import os +from keplergl import KeplerGl + +from asf_welsh_energy_consultation.config import translation_config +from asf_welsh_energy_consultation import config_file +from asf_core_data.getters.data_getters import logger +from nesta_ds_utils.viz.altair.formatting import setup_theme + +from asf_welsh_energy_consultation.utils.utils import arial + +alt.themes.register("arial", arial) +alt.themes.enable("arial") + +plt.rc("font", family="Arial") + +fig_output_path = { + "english": "outputs/figures/english/", + "welsh": "outputs/figures/welsh/", +} + +for file_path in fig_output_path.values(): + if not os.path.isdir(file_path): + os.makedirs(file_path) + +setup_theme() + + +def time_series_comparison( + data, + title, + y_var, + y_title, + color_var, + x_var="date:T", + x_title="Date", + domain_min=None, + domain_max=None, + width=600, + height=300, +): + """Generic function for plotting a line chart by category (represented by color_var). + + Args: + data (pd.DataFrame): Base data. Needs to be structured as a column of consecutive dates, + a column indicating categories and a column with cumulative values. 
+ title (str/list): Chart title. + y_var (str): y variable. + y_title (str): y-axis title. + color_var (str): Variable to split by. + x_var (str, optional): x variable. Defaults to "date:T". + x_title (str, optional): x-axis title. Defaults to "Date". + domain_min (str, optional): x-axis minimum. Defaults to "2015-01-01". + domain_max (str, optional): x-axis maximum. Defaults to max date of series. + width (int, optional): Chart width. Defaults to 600. + height (int, optional): Chart height. Defaults to 300. + + Returns: + alt.Chart: Base altair chart. + """ + if domain_min is None: + domain_min = config_file["plots"]["time_series_min_default"] + logger.info(f"Time series comparison using {domain_min} as min date") + if domain_max is None: + domain_max = data.date.max() + logger.info(f"Time series comparison using {domain_max} as max date") + chart = ( + alt.Chart( + data, + title=title, + ) + .mark_line() + .encode( + x=alt.X( + x_var, title=x_title, scale=alt.Scale(domain=[domain_min, domain_max]) + ), + y=alt.Y(y_var, title=y_title), + color=color_var, + ) + .properties(width=width, height=height) + ) + + return chart + + +def proportions_bar_chart( + base_data, + field, + title, + x_label, + y_label, + filename, + expand_y=False, + x_type="good", + language="english", +): + """Create a generic bar chart of proportions of properties in a given category. + + Args: + base_data (pd.DataFrame): EPC data. + field (str): Feature name. + title (str): Chart title. + x_label (str): x axis label. + y_label (str): y axis label. + filename (str): Filename. + expand_y (bool, optional): Whether to extend the y axis beyond altair's default. Defaults to False. + x_type (str, optional): Type of x variable (to control formatting). + Can be "good" (insulation quality), "tenure", or otherwise assumed to be A-G energy efficiencies. + Defaults to "good". + language (str, optional): Language of chart text. Defaults to "english". 
+ """ + source = pd.DataFrame({"count": base_data[field].value_counts()}).reset_index() + + if x_type == "good": + order = translation_config.quality_list[language] + elif x_type == "tenure": + order = translation_config.tenure_list[language] + else: + order = ["A", "B", "C", "D", "E", "F", "G"] + + N_count = "{:,}".format(len(base_data)) + + chart = ( + alt.Chart(source) + .transform_joinaggregate( + Total="sum(count)", + ) + .transform_calculate(PercentOfTotal="datum.count / datum.Total") + .mark_bar() + .encode( + x=alt.X("index", sort=order, title=x_label, axis=alt.Axis(labelAngle=0)), + y=alt.Y( + shorthand="PercentOfTotal:Q", + axis=alt.Axis(format=".0%"), + title=y_label, + scale=alt.Scale(domain=[0, 0.5]) if expand_y is True else alt.Scale(), + ), + ) + .properties( + width=500, + height=300, + # add N to title (just append to end if string, otherwise append to last in list of strings) + title=title + " (N = " + N_count + ")" + if type(title) == str + else title[:-1] + [title[-1] + " (N = " + N_count + ")"], + ) + ).configure_title(fontSize=20) + + chart.save(fig_output_path[language] + filename + ".html") + + print("Saved: " + filename + ".html") + + +# matplotlib only cycles through 10 colours, so manually defining 11 here to cover all age categories +colors = [ + "#000000", + "#1f77b4", + "#ff7f0e", + "#2ca02c", + "#d62728", + "#9467bd", + "#8c564b", + "#e377c2", + "#7f7f7f", + "#bcbd22", + "#17becf", +] + + +def age_prop_chart(base_data, title, filename, language="english"): + """Create single-column bar chart with property ages, proportions and average energy efficiencies. + + Args: + base_data (pd.DataFrame): EPC data. + title (str): Chart title. + filename (str): Filename. + language (str, optional): Language of chart text. Defaults to "english". 
+ """ + + text_labels = [ + translation_config.energy_efficiency_text[language] + str(val) + for val in base_data["CURRENT_ENERGY_EFFICIENCY"] + ] + prop_labels = [str(round(val, 1)) + "%" for val in base_data["percentage"]] + width = 1 + + fig, ax = plt.subplots() + fig.set_figheight(10) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + ax.spines["bottom"].set_visible(False) + + # create initial bar + ax.bar( + x=" ", + height=base_data.loc[0, "percentage"], + width=width, + label=base_data.loc[0, "CONSTRUCTION_AGE_BAND"], + color=colors[0], + ) + + # plot remaining bars on top + for i in range(1, len(colors)): + ax.bar( + x=" ", + height=base_data.loc[i, "percentage"], + width=width, + bottom=base_data.loc[i - 1, "cumul_prop"], + label=base_data.loc[i, "CONSTRUCTION_AGE_BAND"], + color=colors[i], + ) + + # format y axis + ax.set_ylim(0, 100) + ax.yaxis.set_major_formatter(mtick.PercentFormatter(100)) + ax.set_ylabel( + translation_config.housing_stock_text[language], fontweight="bold", fontsize=12 + ) + ax.set_title(title, fontweight="bold", fontsize=14, pad=20) + + # put legend in top right + box = ax.get_position() + ax.set_position([box.x0, box.y0, box.width * 0.8, box.height]) + + handles, labels = ax.get_legend_handles_labels() + ax.legend( + reversed(handles), + reversed(labels), + loc="upper right", + bbox_to_anchor=(1.6, 1), + fontsize=10, + title=translation_config.age_band_text[language], + title_fontproperties={"weight": "bold"}, + ) + + # put text in centres of bars + rects = ax.patches + + for rect, label in zip(rects, text_labels): + height = rect.get_height() + ax.text( + rect.get_x() + rect.get_width() / 2, + rect.get_y() + height / 2, + label, + ha="center", + va="center", + color="white", + fontsize=12, + ) + + for rect, label in zip(rects, prop_labels): + height = rect.get_height() + ax.text( + rect.get_x() + rect.get_width() + 0.01, + rect.get_y() + height / 2, + label, + ha="left", + va="center", + fontsize=12, + 
) + + # format axes + plt.tick_params( + axis="x", # changes apply to the x-axis + which="both", # both major and minor ticks are affected + bottom=False, # ticks along the bottom edge are off + top=False, # ticks along the top edge are off + labelbottom=False, + ) # labels along the bottom edge are off + + plt.tight_layout() + + plt.savefig(fig_output_path[language] + filename + ".png", bbox_inches="tight") + + print("Saved: " + filename + ".png") + + +def plot_kepler_graph(base_data, filename): + hex_map = KeplerGl(height=500) + hex_map.add_data( + data=base_data[["perc_true", "hex_id"]], name="Heat pump proportions" + ) + hex_map.save_to_html( + file_name=os.path.join(fig_output_path["english"], f"{filename}.html") + ) + + print("Saved: " + filename + ".html") diff --git a/asf_welsh_energy_consultation/pipeline/process_data.py b/asf_welsh_energy_consultation/pipeline/process_data.py index d434c29..b7d2e2f 100644 --- a/asf_welsh_energy_consultation/pipeline/process_data.py +++ b/asf_welsh_energy_consultation/pipeline/process_data.py @@ -5,7 +5,9 @@ import pandas as pd -from asf_welsh_energy_consultation.getters.get_data import * +from asf_core_data.utils.geospatial.data_agglomeration import add_hex_id +from asf_welsh_energy_consultation.getters import get_data +from asf_core_data.getters.data_getters import logger # PROCESSING MCS @@ -17,10 +19,10 @@ def get_enhanced_mcs(): Returns: pd.DataFrame: Dataset as described above. 
""" - mcs = get_mcs_domestic() - og = get_offgas() - countries = get_countries() - rural = get_rurality() + mcs = get_data.get_mcs_domestic() + og = get_data.get_offgas() + countries = get_data.get_countries() + rural = get_data.get_rurality() # join with off-gas data mcs = mcs.merge(og, on="postcode", how="left") @@ -28,15 +30,23 @@ def get_enhanced_mcs(): # join with regions in order to filter to Wales mcs = mcs.merge(countries, on="postcode", how="left") + if mcs.country.isna().sum() > 0: + logger.warning( + f"{mcs.country.isna().sum()} MCS installation records have no country match." + f"Potential loss of data when filtering for Wales." + ) mcs = mcs.loc[mcs["country"] == "Wales"].reset_index(drop=True) - # 1203 records with no match - 273 are Northern Ireland which leaves 918 + # There will be records with no match # Some will be new postcodes (new build developments) # and some may be expired postcodes # In future, implement new solution that uses outward codes # join with rurality data mcs = mcs.merge(rural, on="postcode", how="left") - # only 13 postcodes lost in this merge + if mcs.rurality_10_code.isna().sum() > 0: + logger.warning( + f"Loss of data: {mcs.rurality_10_code.isna().sum()} Welsh MCS installation records have no rurality code match." + ) # add custom rurality column (rurality "type 7": all different types of urban mapped to Urban) mcs["rurality_7"] = mcs["rurality_10_label"].replace( @@ -102,7 +112,7 @@ def cumsums_by_variable(variable, new_var_name, data=enhanced_mcs): # PROCESSING EPC -wales_epc = get_wales_epc() +wales_epc = get_data.get_wales_processed_epc() def get_wales_epc_new(): @@ -134,8 +144,13 @@ def get_new_hp_counts(): pd.DataFrame: New build HP counts. 
""" wales_epc_new = get_wales_epc_new() - # 2023 not yet complete so drop any post-2022 data - wales_epc_new = wales_epc_new.loc[wales_epc_new["INSPECTION_DATE"] < "2023-01-01"] + # Requires full year of data so remove most recent year if it doesn't have 12 months of data + max_date = wales_epc_new["INSPECTION_DATE"].max() + max_year = max_date.year + if max_date != pd.to_datetime(f"{max_year}-12-31"): + wales_epc_new = wales_epc_new.loc[ + wales_epc_new["INSPECTION_DATE"] < f"{max_year}-01-01" + ] new_hp_counts = ( wales_epc_new.groupby(["year", "HP_INSTALLED"]) @@ -196,10 +211,15 @@ def mcs_epc_first_records(): Returns: pd.DataFrame: MCS records joined with first EPC. """ - mcs_epc = get_mcs_epc_domestic() - regions = get_countries() + mcs_epc = get_data.get_mcs_epc_domestic() + regions = get_data.get_countries() mcs_epc = mcs_epc.merge(regions, on="postcode", how="left") + if mcs_epc.country.isna().sum() > 0: + logger.warning( + f"{mcs_epc.country.isna().sum()} joined MCS-EPC records have no country match. " + f"Potential loss of data when filtering for Wales." 
def generate_hex_counts(wales_df, pc_df):
    """
    Merge EPC records with a postcode lookup on "POSTCODE" and compute, for each H3 hexagon
    (https://h3geo.org/docs/), the percentage of properties with a heat pump installed.

    Args:
        wales_df (pandas.DataFrame): Processed EPC data for Wales; must contain "POSTCODE" and
            "HP_INSTALLED" columns.
        pc_df (pandas.DataFrame): Postcode lookup with a "POSTCODE" column. Presumably it also
            carries the lat/lon coordinates that add_hex_id needs - confirm against that helper.

    Returns:
        pandas.DataFrame: One row per "hex_id" with the counts in the False and True columns,
            their sum in "total" and the percentage of True in "perc_true".
    """
    # Passed straight to add_hex_id; presumably the H3 resolution - confirm against that helper.
    hex_resolution = 6

    wales_df_coords = pd.merge(wales_df, pc_df, on=["POSTCODE"])  # inner join on postcode
    wales_df_hex = add_hex_id(wales_df_coords, hex_resolution)

    hp_hex_counts = (
        wales_df_hex.groupby(["hex_id", "HP_INSTALLED"])
        .size()
        .unstack(fill_value=0)
        # Make sure both outcome columns exist even if the data only contains one of them,
        # otherwise the lookups below raise KeyError.
        .reindex(columns=[False, True], fill_value=0)
    )
    hp_hex_counts["total"] = hp_hex_counts[True] + hp_hex_counts[False]
    hp_hex_counts["perc_true"] = hp_hex_counts[True] / hp_hex_counts["total"] * 100

    return hp_hex_counts.reset_index()


def generate_age_data(wales_df):
    """Generate table of proportion of properties in each age band.
    Also includes average energy efficiency for each age band.

    Records whose age band is "unknown" are excluded from the proportions. The result only
    contains the eleven bands listed in the function body, in that order; a KeyError is raised
    if any of them is absent from the data.

    Args:
        wales_df (pd.DataFrame): EPC data with "CONSTRUCTION_AGE_BAND" and
            "CURRENT_ENERGY_EFFICIENCY" columns.

    Returns:
        pd.DataFrame: Columns "CONSTRUCTION_AGE_BAND", "percentage",
            "CURRENT_ENERGY_EFFICIENCY" (mean, rounded to 1 d.p.) and "cumul_prop"
            (running sum of "percentage").
    """
    band_order = [
        "Pre-1900",
        "1900-1929",
        "1930-1949",
        "1950-1966",
        "1965-1975",
        "1976-1983",
        "1983-1991",
        "1991-1998",
        "1996-2002",
        "2003-2007",
        "2007 onwards",
    ]

    known_bands = wales_df.loc[
        wales_df.CONSTRUCTION_AGE_BAND != "unknown", "CONSTRUCTION_AGE_BAND"
    ]
    # Name the index and the values explicitly: the labels produced by
    # value_counts().reset_index() differ between pandas 1.x ("index"/<column name>) and
    # pandas 2.x (<column name>/"proportion"), so renaming by the old labels is not portable.
    age_props = (
        (known_bands.value_counts(normalize=True) * 100)
        .rename_axis("CONSTRUCTION_AGE_BAND")
        .reset_index(name="percentage")
    )

    ages_efficiencies = (
        wales_df.groupby("CONSTRUCTION_AGE_BAND")["CURRENT_ENERGY_EFFICIENCY"]
        .mean()
        .reset_index()
    )

    # Inner merge: the "unknown" efficiency row has no counterpart in age_props and drops out.
    age_data = age_props.merge(ages_efficiencies, on="CONSTRUCTION_AGE_BAND")
    age_data["CONSTRUCTION_AGE_BAND"] = age_data["CONSTRUCTION_AGE_BAND"].replace(
        {"England and Wales: before 1900": "Pre-1900"}
    )

    # Put the bands in chronological order (value_counts sorts by frequency).
    age_data = age_data.set_index("CONSTRUCTION_AGE_BAND").loc[band_order].reset_index()

    age_data["CURRENT_ENERGY_EFFICIENCY"] = age_data["CURRENT_ENERGY_EFFICIENCY"].round(
        1
    )
    age_data["cumul_prop"] = age_data["percentage"].cumsum()

    return age_data
def arial():
    """Return a chart configuration dictionary that sets every font entry to Arial.

    Returns:
        dict: A nested dictionary with a single "config" key, holding the title font
            and the axis label/title fonts, all set to "Arial".
    """
    typeface = "Arial"
    title_settings = {"font": typeface}
    axis_settings = dict.fromkeys(("labelFont", "titleFont"), typeface)

    return {"config": {"title": title_settings, "axis": axis_settings}}