rename self.df, add training set model outputs as argument to fit method
elray1 committed Oct 23, 2024
1 parent 24b4d8b commit df4f1b3
Showing 4 changed files with 94 additions and 59 deletions.
118 changes: 77 additions & 41 deletions src/postpredict/dependence.py
@@ -43,9 +43,6 @@ def _build_templates(self, wide_model_out):


def transform(self, model_out: pl.DataFrame,
reference_time_col: str = "reference_date",
horizon_col: str = "horizon", pred_col: str = "value",
idx_col: str = "output_type_id",
obs_mask: np.ndarray | None = None,
pit_templates: bool = False,
return_long_format: bool = True):
@@ -58,21 +55,12 @@ def transform(self, model_out: pl.DataFrame,
model_out: pl.DataFrame
polars dataframe with sample predictions that do not necessarily
capture temporal dependence.
reference_time_col: str
name of column in model_out that records the reference time for
predictions
horizon_col: str
name of column in model_out that records the prediction horizon
pred_col: str
name of column in model_out with predicted values (samples)
idx_col: str
name of column in model_out with sample indices
obs_mask: np.ndarray | None
mask to use for observed data. The primary use case is to support
cross-validation. If None, all observed data are used to form
dependence templates. Otherwise, `obs_mask` should be a boolean
array of shape (self.df.shape[0], ). Rows of self.df where obs_mask
is True will be used, while rows of self.df where obs_mask is False
array of shape (self.target_data_train.shape[0], ). Rows of self.target_data_train where obs_mask
is True will be used, while rows of self.target_data_train where obs_mask is False
will not be used.
pit_templates: bool
If False (default), templates are based on observed values. If True,
@@ -87,41 +75,53 @@ def transform(self, model_out: pl.DataFrame,
they reflect the estimated temporal dependence structure.
"""
# pivot model_out from long to wide format
wide_model_out = self._pivot_horizon(model_out, reference_time_col,
horizon_col, idx_col, pred_col)
min_horizon = model_out[horizon_col].min()
max_horizon = model_out[horizon_col].max()
wide_model_out = self._pivot_horizon(
model_out, self.reference_time_col,
self.horizon_col, self.idx_col, self.pred_col
)
min_horizon = model_out[self.horizon_col].min()
max_horizon = model_out[self.horizon_col].max()

# extract train_X and train_Y from observed data (self.df)
if self.model_out_train is not None:
wide_model_out_train = self._pivot_horizon(
self.model_out_train, self.reference_time_col,
self.horizon_col, self.idx_col, self.pred_col
)
else:
wide_model_out_train = None

# extract train_X and train_Y from observed data (self.target_data_train)
# and/or past forecasts (wide_model_out_train)
self._build_train_X_Y(min_horizon, max_horizon, obs_mask,
wide_model_out, reference_time_col, pit_templates)
wide_model_out_train, self.reference_time_col,
pit_templates)

# perform the transformation, one group at a time
transformed_wide_model_out = (
wide_model_out
.group_by(*(self.key_cols + [reference_time_col]))
.group_by(*(self.key_cols + [self.reference_time_col]))
.map_groups(self._transform_one_group)
)

if not return_long_format:
return transformed_wide_model_out

# unpivot back to long format
pivot_index = [c for c in model_out.columns if c not in [horizon_col, pred_col]]
pivot_index = [c for c in model_out.columns if c not in [self.horizon_col, self.pred_col]]
transformed_model_out = (
transformed_wide_model_out
.unpivot(
index = pivot_index,
on = self.wide_horizon_cols,
variable_name = horizon_col,
value_name = pred_col
variable_name = self.horizon_col,
value_name = self.pred_col
)
.with_columns(
# convert horizon columns back to original values and data type
# this is inverting an operation that was done in _pivot_horizon just before the pivot
pl.col(horizon_col)
.str.slice(len("postpredict_") + len(horizon_col), None) # keep everything after f"postpredict_{horizon_col}" prefix
.cast(model_out[horizon_col].dtype)
pl.col(self.horizon_col)
.str.slice(len("postpredict_") + len(self.horizon_col), None) # keep everything after f"postpredict_{horizon_col}" prefix
.cast(model_out[self.horizon_col].dtype)
)
)
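
The horizon-column round trip above can be hard to follow from the diff alone. Below is a standalone sketch (not code from this repository) assuming `_pivot_horizon` names wide columns `f"postpredict_{horizon_col}{h}"`, which is the naming the slice-and-cast step inverts:

```python
import polars as pl

# Assumed wide column naming: f"postpredict_{horizon_col}{h}"
horizon_col = "horizon"
wide = pl.DataFrame({
    "location": ["A", "A"],
    "postpredict_horizon1": [1.2, 3.4],
    "postpredict_horizon2": [2.3, 4.5],
})
long = (
    wide
    .unpivot(
        index=["location"],
        on=["postpredict_horizon1", "postpredict_horizon2"],
        variable_name=horizon_col,
        value_name="value",
    )
    .with_columns(
        pl.col(horizon_col)
        .str.slice(len("postpredict_") + len(horizon_col), None)  # drop the prefix, keeping "1", "2"
        .cast(pl.Int64)  # restore integer horizon values
    )
)
print(long)
```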

@@ -206,8 +206,8 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,
mask to use for observed data. The primary use case is to support
cross-validation. If None, all observed data are used to form
dependence templates. Otherwise, `obs_mask` should be a boolean
array of shape (self.df.shape[0], ). Rows of self.df where obs_mask
is True will be used, while rows of self.df where obs_mask is False
array of shape (self.target_data_train.shape[0], ). Rows of self.target_data_train where obs_mask
is True will be used, while rows of self.target_data_train where obs_mask is False
will not be used.
wide_model_out: pl.DataFrame
polars dataframe with sample predictions that do not necessarily
@@ -227,9 +227,9 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,
Notes
-----
This method sets self.shift_varnames, self.train_X, and self.train_Y,
and it updates self.df to have new columns.
and it updates self.target_data_train to have new columns.
It expects the object to have the properties self.df, self.key_cols,
It expects the object to have the properties self.target_data_train, self.key_cols,
self.time_col, self.obs_col, and self.feat_cols set already.
"""
self.shift_varnames = []
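
For reference, one way the `obs_mask` argument documented above might be built for cross-validation. This is a hypothetical sketch, not code from this commit; the toy frame and cutoff date are invented:

```python
from datetime import date

import polars as pl

# Toy data standing in for self.target_data_train
target_data_train = pl.DataFrame({
    "location": ["A", "A", "A"],
    "date": [date(2024, 1, 6), date(2024, 1, 13), date(2024, 1, 20)],
    "value": [10.0, 11.0, 12.0],
})

# Use only observations before a held-out period when forming dependence templates
cutoff = date(2024, 1, 15)
obs_mask = (target_data_train["date"] < cutoff).to_numpy()  # boolean array, shape (n_rows,)
```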
@@ -241,7 +241,7 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,

if shift_varname not in self.shift_varnames:
self.shift_varnames.append(shift_varname)
self.df = self.df.with_columns(
self.target_data_train = self.target_data_train.with_columns(
pl.col(self.obs_col)
.shift(-h)
.over(self.key_cols, order_by=self.time_col)
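
To illustrate the grouped leading shift used here, a self-contained toy example (not taken from the package; the result column name is invented for illustration):

```python
import polars as pl

# Toy data: one observed series per location
df = pl.DataFrame({
    "location": ["A", "A", "A", "B", "B", "B"],
    "date": [1, 2, 3, 1, 2, 3],
    "value": [10.0, 11.0, 12.0, 20.0, 21.0, 22.0],
})

h = 1  # horizon
df = df.with_columns(
    pl.col("value")
    .shift(-h)                            # value observed h steps ahead
    .over(["location"], order_by="date")  # computed within each location, ordered by date
    .alias(f"value_lead_{h}")             # hypothetical column name
)
print(df)
```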
@@ -250,7 +250,9 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,

if obs_mask is None:
obs_mask = True
df_mask_and_dropnull = self.df.filter(obs_mask).drop_nulls()
df_mask_and_dropnull = self.target_data_train.filter(obs_mask).drop_nulls()
print("build train X_Y, df mask and dropnull")
print(df_mask_and_dropnull)

if pit_templates:
pit_values = (
@@ -267,6 +269,8 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,
how="left"
)
)
print("pit values, in _build_train_X_Y")
print(pit_values)
train_X_Y_source = pit_values
train_Y_cols = [f"pit_{pred_c}" for pred_c in self.wide_horizon_cols]
else:
@@ -370,29 +374,61 @@ def __init__(self, weighter=weighters.EqualWeighter(),
super().__init__(rng)


def fit(self, df, key_cols=None, time_col="date", obs_col="value", feat_cols=["date"]):
def fit(self,
target_data_train: pl.DataFrame,
model_out_train: pl.DataFrame,
key_cols: list[str] | None = None,
time_col: str = "date",
obs_col: str = "value",
reference_time_col: str = "reference_date",
horizon_col: str = "horizon",
pred_col: str = "value",
idx_col: str = "output_type_id",
feat_cols: list[str] = ["date"]) -> None:
"""
Fit a Schaake shuffle model for temporal dependence across prediction
horizons. In practice this just involves saving the input arguments for
later use; the Schaake shuffle does not require any parameter estimation.
Parameters
----------
df: polars dataframe with training set observations.
key_cols: names of columns in `df` used to identify observational units,
e.g. location or age group.
time_col: name of column in `df` that contains the time index.
obs_col: name of column in `df` that contains observed values.
feat_cols: names of columns in `df` with features
target_data_train: pl.DataFrame
training set observations of target data.
model_out_train: pl.DataFrame
training set predictions
key_cols: list[str] | None
names of columns in `target_data_train` and `model_out_train` used
to identify observational units, e.g. location or age group.
time_col: str
name of column in `target_data_train` that contains the time index.
obs_col: str
name of column in `target_data_train` that contains observed values.
reference_time_col: str
name of column in `model_out_train` that contains the reference time
for model predictions
horizon_col: str
name of column in `model_out_train` that contains the prediction
horizon relative to the reference time
pred_col: str
name of column in `model_out_train` with predicted values (samples)
idx_col: str
name of column in `model_out_train` with sample indices
feat_cols: list[str]
names of columns in `target_data_train` and `model_out_train` with features
Returns
-------
None
"""
self.df = df
self.target_data_train = target_data_train
self.model_out_train = model_out_train
self.key_cols = key_cols
self.time_col = time_col
self.obs_col = obs_col
self.reference_time_col = reference_time_col
self.horizon_col = horizon_col
self.pred_col = pred_col
self.idx_col = idx_col
self.feat_cols = feat_cols
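
A minimal sketch of the new calling convention after this change. The toy frames below are invented for illustration; since `fit` only stores its arguments, any consistently named data works:

```python
import polars as pl
from postpredict.dependence import Schaake

# Toy stand-ins for real training data; column names match the defaults above
target_data_train = pl.DataFrame({
    "location": ["A", "A", "A", "A"],
    "date": [1, 2, 3, 4],
    "value": [10.0, 11.0, 12.0, 13.0],
})
model_out_train = pl.DataFrame({
    "location": ["A", "A"],
    "reference_date": [4, 4],
    "horizon": [1, 2],
    "output_type_id": [0, 0],
    "value": [13.5, 14.0],
})

ss = Schaake()
ss.fit(
    target_data_train,
    model_out_train,
    key_cols=["location"],
    time_col="date",
    obs_col="value",
    reference_time_col="reference_date",
    horizon_col="horizon",
    pred_col="value",
    idx_col="output_type_id",
    feat_cols=["location"],
)
# Column names are now stored on the fitted object, so a later call only needs
# the new model output: ss.transform(model_out=new_model_out)
```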


12 changes: 6 additions & 6 deletions tests/postpredict/dependence/test_build_train_X_Y.py
@@ -15,7 +15,7 @@ def test_build_train_X_Y_positive_horizons(obs_data, monkeypatch):
# See https://stackoverflow.com/a/77748100
monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
tdp.df = obs_data
tdp.target_data_train = obs_data
tdp.key_cols = ["location", "age_group"]
tdp.time_col = "date",
tdp.obs_col = "value"
@@ -47,7 +47,7 @@ def test_build_train_X_Y_nonnegative_horizons(obs_data, monkeypatch):
# See https://stackoverflow.com/a/77748100
monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
tdp.df = obs_data
tdp.target_data_train = obs_data
tdp.key_cols = ["location", "age_group"]
tdp.time_col = "date",
tdp.obs_col = "value"
@@ -80,7 +80,7 @@ def test_build_train_X_Y_negative_horizons(obs_data, monkeypatch):
# See https://stackoverflow.com/a/77748100
monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
tdp.df = obs_data
tdp.target_data_train = obs_data
tdp.key_cols = ["location", "age_group"]
tdp.time_col = "date",
tdp.obs_col = "value"
@@ -113,7 +113,7 @@ def test_build_train_X_Y_mask(obs_data, monkeypatch):
# See https://stackoverflow.com/a/77748100
monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
tdp.df = obs_data
tdp.target_data_train = obs_data
tdp.key_cols = ["location", "age_group"]
tdp.time_col = "date",
tdp.obs_col = "value"
@@ -147,7 +147,7 @@ def test_build_train_X_Y_pit_templates(obs_data, wide_model_out, monkeypatch):
# See https://stackoverflow.com/a/77748100
monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
tdp.df = obs_data
tdp.target_data_train = obs_data
tdp.key_cols = ["location", "age_group"]
tdp.time_col = "date",
tdp.obs_col = "value"
@@ -193,7 +193,7 @@ def test_build_train_X_Y_pit_templates_mask(obs_data, wide_model_out, monkeypatch):
# See https://stackoverflow.com/a/77748100
monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
tdp.df = obs_data
tdp.target_data_train = obs_data
tdp.key_cols = ["location", "age_group"]
tdp.time_col = "date",
tdp.obs_col = "value"
8 changes: 4 additions & 4 deletions tests/postpredict/dependence/test_schaake_build_templates.py
@@ -9,7 +9,7 @@

def test_schaake_build_templates_equal_weights(obs_data, wide_model_out):
ss = Schaake()
ss.df = obs_data
ss.target_data_train = obs_data
ss.key_cols = ["location", "age_group"]
ss.time_col = "date",
ss.obs_col = "value"
@@ -50,7 +50,7 @@ def get_weights(self, train_X, test_X):
return weights

ss = Schaake(weighter = PopSizeWeighter())
ss.df = obs_data
ss.target_data_train = obs_data
ss.key_cols = ["location", "age_group"]
ss.time_col = "date",
ss.obs_col = "value"
@@ -109,7 +109,7 @@ def get_weights(self, train_X, test_X):

def test_schaake_build_templates_reproducible(obs_data, wide_model_out):
ss = Schaake(rng = np.random.default_rng(42))
ss.df = obs_data
ss.target_data_train = obs_data
ss.key_cols = ["location", "age_group"]
ss.time_col = "date",
ss.obs_col = "value"
@@ -119,7 +119,7 @@ def test_schaake_build_templates_reproducible(obs_data, wide_model_out):
templates_1 = ss._build_templates(pl.concat([wide_model_out] * n_times))

ss = Schaake(rng = np.random.default_rng(42))
ss.df = obs_data
ss.target_data_train = obs_data
ss.key_cols = ["location", "age_group"]
ss.time_col = "date",
ss.obs_col = "value"
15 changes: 7 additions & 8 deletions tests/postpredict/dependence/test_transform.py
@@ -25,20 +25,19 @@ def _build_templates(self, wide_model_out):
return pl.DataFrame(templates)

tdp = TestPostprocessor(rng = np.random.default_rng(42))
tdp.df = obs_data
tdp.target_data_train = obs_data
tdp.model_out_train = None
tdp.key_cols = ["location", "age_group"]
tdp.time_col = "date",
tdp.obs_col = "value"
tdp.feat_cols = ["location", "age_group", "population"]
tdp.reference_time_col="reference_date"
tdp.horizon_col="horizon"
tdp.pred_col="value"
tdp.idx_col="output_type_id"

# perform the transform operation
actual_final = tdp.transform(
model_out=long_model_out,
reference_time_col="reference_date",
horizon_col="horizon",
pred_col="value",
idx_col="output_type_id"
)
actual_final = tdp.transform(model_out=long_model_out)

# Challenge: tdp.transform performs the transformation on the
# groups defined by combinations of location and age_group in a random
