From df4f1b396f5ec8856bf2b043ee2c9a512239ea96 Mon Sep 17 00:00:00 2001 From: Evan Ray Date: Tue, 22 Oct 2024 20:01:59 -0400 Subject: [PATCH] rename self.df, add training set model outputs as argument to fit method --- src/postpredict/dependence.py | 118 ++++++++++++------ .../dependence/test_build_train_X_Y.py | 12 +- .../test_schaake_build_templates.py | 8 +- .../postpredict/dependence/test_transform.py | 15 ++- 4 files changed, 94 insertions(+), 59 deletions(-) diff --git a/src/postpredict/dependence.py b/src/postpredict/dependence.py index 799f7f5..84d685e 100644 --- a/src/postpredict/dependence.py +++ b/src/postpredict/dependence.py @@ -43,9 +43,6 @@ def _build_templates(self, wide_model_out): def transform(self, model_out: pl.DataFrame, - reference_time_col: str = "reference_date", - horizon_col: str = "horizon", pred_col: str = "value", - idx_col: str = "output_type_id", obs_mask: np.ndarray | None = None, pit_templates: bool = False, return_long_format: bool = True): @@ -58,21 +55,12 @@ def transform(self, model_out: pl.DataFrame, model_out: pl.DataFrame polars dataframe with sample predictions that do not necessarily capture temporal dependence. - reference_time_col: str - name of column in model_out that records the reference time for - predictions - horizon_col: str - name of column in model_out that records the prediction horizon - pred_col: str - name of column in model_out with predicted values (samples) - idx_col: str - name of column in model_out with sample indices obs_mask: np.ndarray | None mask to use for observed data. The primary use case is to support cross-validation. If None, all observed data are used to form dependence templates. Otherwise, `obs_mask` should be a boolean - array of shape (self.df.shape[0], ). Rows of self.df where obs_mask - is True will be used, while rows of self.df where obs_mask is False + array of shape (self.target_data_train.shape[0], ). Rows of self.target_data_train where obs_mask + is True will be used, while rows of self.target_data_train where obs_mask is False will not be used. pit_templates: bool If False (default), templates are based on observed values. If True, @@ -87,19 +75,31 @@ def transform(self, model_out: pl.DataFrame, they reflect the estimated temporal dependence structure. """ # pivot model_out from long to wide format - wide_model_out = self._pivot_horizon(model_out, reference_time_col, - horizon_col, idx_col, pred_col) - min_horizon = model_out[horizon_col].min() - max_horizon = model_out[horizon_col].max() + wide_model_out = self._pivot_horizon( + model_out, self.reference_time_col, + self.horizon_col, self.idx_col, self.pred_col + ) + min_horizon = model_out[self.horizon_col].min() + max_horizon = model_out[self.horizon_col].max() - # extract train_X and train_Y from observed data (self.df) + if self.model_out_train is not None: + wide_model_out_train = self._pivot_horizon( + self.model_out_train, self.reference_time_col, + self.horizon_col, self.idx_col, self.pred_col + ) + else: + wide_model_out_train = None + + # extract train_X and train_Y from observed data (self.target_data_train) + # and/or past forecasts (wide_model_out_train) self._build_train_X_Y(min_horizon, max_horizon, obs_mask, - wide_model_out, reference_time_col, pit_templates) + wide_model_out_train, self.reference_time_col, + pit_templates) # perform the transformation, one group at a time transformed_wide_model_out = ( wide_model_out - .group_by(*(self.key_cols + [reference_time_col])) + .group_by(*(self.key_cols + [self.reference_time_col])) .map_groups(self._transform_one_group) ) @@ -107,21 +107,21 @@ def transform(self, model_out: pl.DataFrame, return transformed_wide_model_out # unpivot back to long format - pivot_index = [c for c in model_out.columns if c not in [horizon_col, pred_col]] + pivot_index = [c for c in model_out.columns if c not in [self.horizon_col, self.pred_col]] transformed_model_out = ( transformed_wide_model_out .unpivot( index = pivot_index, on = self.wide_horizon_cols, - variable_name = horizon_col, - value_name = pred_col + variable_name = self.horizon_col, + value_name = self.pred_col ) .with_columns( # convert horizon columns back to original values and data type # this is inverting an operation that was done in _pivot_horizon just before the pivot - pl.col(horizon_col) - .str.slice(len("postpredict_") + len(horizon_col), None) # keep everything after f"postpredict_{horizon_col}" prefix - .cast(model_out[horizon_col].dtype) + pl.col(self.horizon_col) + .str.slice(len("postpredict_") + len(self.horizon_col), None) # keep everything after f"postpredict_{horizon_col}" prefix + .cast(model_out[self.horizon_col].dtype) ) ) @@ -206,8 +206,8 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int, mask to use for observed data. The primary use case is to support cross-validation. If None, all observed data are used to form dependence templates. Otherwise, `obs_mask` should be a boolean - array of shape (self.df.shape[0], ). Rows of self.df where obs_mask - is True will be used, while rows of self.df where obs_mask is False + array of shape (self.target_data_train.shape[0], ). Rows of self.target_data_train where obs_mask + is True will be used, while rows of self.target_data_train where obs_mask is False will not be used. wide_model_out: pl.DataFrame polars dataframe with sample predictions that do not necessarily @@ -227,9 +227,9 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int, Notes ----- This method sets self.shift_varnames, self.train_X, and self.train_Y, - and it updates self.df to have new columns. + and it updates self.target_data_train to have new columns. - It expects the object to have the properties self.df, self.key_cols, + It expects the object to have the properties self.target_data_train, self.key_cols, self.time_col, self.obs_col, and self.feat_cols set already. """ self.shift_varnames = [] @@ -241,7 +241,7 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int, if shift_varname not in self.shift_varnames: self.shift_varnames.append(shift_varname) - self.df = self.df.with_columns( + self.target_data_train = self.target_data_train.with_columns( pl.col(self.obs_col) .shift(-h) .over(self.key_cols, order_by=self.time_col) @@ -250,7 +250,9 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int, if obs_mask is None: obs_mask = True - df_mask_and_dropnull = self.df.filter(obs_mask).drop_nulls() + df_mask_and_dropnull = self.target_data_train.filter(obs_mask).drop_nulls() + print("build train X_Y, df mask and dropnull") + print(df_mask_and_dropnull) if pit_templates: pit_values = ( @@ -267,6 +269,8 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int, how="left" ) ) + print("pit values, in _build_train_X_Y") + print(pit_values) train_X_Y_source = pit_values train_Y_cols = [f"pit_{pred_c}" for pred_c in self.wide_horizon_cols] else: @@ -370,7 +374,17 @@ def __init__(self, weighter=weighters.EqualWeighter(), super().__init__(rng) - def fit(self, df, key_cols=None, time_col="date", obs_col="value", feat_cols=["date"]): + def fit(self, + target_data_train: pl.DataFrame, + model_out_train: pl.DataFrame, + key_cols: list[str] | None = None, + time_col: str = "date", + obs_col: str = "value", + reference_time_col: str = "reference_date", + horizon_col: str = "horizon", + pred_col: str = "value", + idx_col: str = "output_type_id", + feat_cols: list[str] = ["date"]) -> None: """ Fit a Schaake shuffle model for temporal dependence across prediction horizons. In practice this just involves saving the input arguments for @@ -378,21 +392,43 @@ def fit(self, df, key_cols=None, time_col="date", obs_col="value", feat_cols=["d Parameters ---------- - df: polars dataframe with training set observations. - key_cols: names of columns in `df` used to identify observational units, - e.g. location or age group. - time_col: name of column in `df` that contains the time index. - obs_col: name of column in `df` that contains observed values. - feat_cols: names of columns in `df` with features + target_data_train: pl.DataFrame + training set observations of target data. + model_out_train: pl.DataFrame + training set predictions + key_cols: list[str] | None + names of columns in `target_data_train` and `model_out_train` used + to identify observational units, e.g. location or age group. + time_col: str + name of column in `target_data_train` that contains the time index. + obs_col: str + name of column in `target_data_train` that contains observed values. + reference_time_col: str + name of column in `model_out_train` that contains the reference time + for model predictions + horizon_col: str + name of column in `model_out_train` that contains the prediction + horizon relative to the reference time + pred_col: str + name of column in model_out with predicted values (samples) + idx_col: str + name of column in model_out with sample indices + feat_cols: list[str] + names of columns in `target_data_train` and `model_out_train` with features Returns ------- None """ - self.df = df + self.target_data_train = target_data_train + self.model_out_train = model_out_train self.key_cols = key_cols self.time_col = time_col self.obs_col = obs_col + self.reference_time_col = reference_time_col + self.horizon_col = horizon_col + self.pred_col = pred_col + self.idx_col = idx_col self.feat_cols = feat_cols diff --git a/tests/postpredict/dependence/test_build_train_X_Y.py b/tests/postpredict/dependence/test_build_train_X_Y.py index 88b82e8..27ef03c 100644 --- a/tests/postpredict/dependence/test_build_train_X_Y.py +++ b/tests/postpredict/dependence/test_build_train_X_Y.py @@ -15,7 +15,7 @@ def test_build_train_X_Y_positive_horizons(obs_data, monkeypatch): # See https://stackoverflow.com/a/77748100 monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set()) tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42)) - tdp.df = obs_data + tdp.target_data_train = obs_data tdp.key_cols = ["location", "age_group"] tdp.time_col = "date", tdp.obs_col = "value" @@ -47,7 +47,7 @@ def test_build_train_X_Y_nonnegative_horizons(obs_data, monkeypatch): # See https://stackoverflow.com/a/77748100 monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set()) tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42)) - tdp.df = obs_data + tdp.target_data_train = obs_data tdp.key_cols = ["location", "age_group"] tdp.time_col = "date", tdp.obs_col = "value" @@ -80,7 +80,7 @@ def test_build_train_X_Y_negative_horizons(obs_data, monkeypatch): # See https://stackoverflow.com/a/77748100 monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set()) tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42)) - tdp.df = obs_data + tdp.target_data_train = obs_data tdp.key_cols = ["location", "age_group"] tdp.time_col = "date", tdp.obs_col = "value" @@ -113,7 +113,7 @@ def test_build_train_X_Y_mask(obs_data, monkeypatch): # See https://stackoverflow.com/a/77748100 monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set()) tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42)) - tdp.df = obs_data + tdp.target_data_train = obs_data tdp.key_cols = ["location", "age_group"] tdp.time_col = "date", tdp.obs_col = "value" @@ -147,7 +147,7 @@ def test_build_train_X_Y_pit_templates(obs_data, wide_model_out, monkeypatch): # See https://stackoverflow.com/a/77748100 monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set()) tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42)) - tdp.df = obs_data + tdp.target_data_train = obs_data tdp.key_cols = ["location", "age_group"] tdp.time_col = "date", tdp.obs_col = "value" @@ -193,7 +193,7 @@ def test_build_train_X_Y_pit_templates_mask(obs_data, wide_model_out, monkeypatc # See https://stackoverflow.com/a/77748100 monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set()) tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42)) - tdp.df = obs_data + tdp.target_data_train = obs_data tdp.key_cols = ["location", "age_group"] tdp.time_col = "date", tdp.obs_col = "value" diff --git a/tests/postpredict/dependence/test_schaake_build_templates.py b/tests/postpredict/dependence/test_schaake_build_templates.py index aef8acc..89ebd27 100644 --- a/tests/postpredict/dependence/test_schaake_build_templates.py +++ b/tests/postpredict/dependence/test_schaake_build_templates.py @@ -9,7 +9,7 @@ def test_schaake_build_templates_equal_weights(obs_data, wide_model_out): ss = Schaake() - ss.df = obs_data + ss.target_data_train = obs_data ss.key_cols = ["location", "age_group"] ss.time_col = "date", ss.obs_col = "value" @@ -50,7 +50,7 @@ def get_weights(self, train_X, test_X): return weights ss = Schaake(weighter = PopSizeWeighter()) - ss.df = obs_data + ss.target_data_train = obs_data ss.key_cols = ["location", "age_group"] ss.time_col = "date", ss.obs_col = "value" @@ -109,7 +109,7 @@ def get_weights(self, train_X, test_X): def test_schaake_build_templates_reproducible(obs_data, wide_model_out): ss = Schaake(rng = np.random.default_rng(42)) - ss.df = obs_data + ss.target_data_train = obs_data ss.key_cols = ["location", "age_group"] ss.time_col = "date", ss.obs_col = "value" @@ -119,7 +119,7 @@ def test_schaake_build_templates_reproducible(obs_data, wide_model_out): templates_1 = ss._build_templates(pl.concat([wide_model_out] * n_times)) ss = Schaake(rng = np.random.default_rng(42)) - ss.df = obs_data + ss.target_data_train = obs_data ss.key_cols = ["location", "age_group"] ss.time_col = "date", ss.obs_col = "value" diff --git a/tests/postpredict/dependence/test_transform.py b/tests/postpredict/dependence/test_transform.py index 29e5896..dabacb2 100644 --- a/tests/postpredict/dependence/test_transform.py +++ b/tests/postpredict/dependence/test_transform.py @@ -25,20 +25,19 @@ def _build_templates(self, wide_model_out): return pl.DataFrame(templates) tdp = TestPostprocessor(rng = np.random.default_rng(42)) - tdp.df = obs_data + tdp.target_data_train = obs_data + tdp.model_out_train = None tdp.key_cols = ["location", "age_group"] tdp.time_col = "date", tdp.obs_col = "value" tdp.feat_cols = ["location", "age_group", "population"] + tdp.reference_time_col="reference_date" + tdp.horizon_col="horizon" + tdp.pred_col="value" + tdp.idx_col="output_type_id" # perform the transform operation - actual_final = tdp.transform( - model_out=long_model_out, - reference_time_col="reference_date", - horizon_col="horizon", - pred_col="value", - idx_col="output_type_id" - ) + actual_final = tdp.transform(model_out=long_model_out) # Challenge: tdp.transform performs the transformation on the # groups defined by combinations of location and age_group in a random