From df4f1b396f5ec8856bf2b043ee2c9a512239ea96 Mon Sep 17 00:00:00 2001
From: Evan Ray <elray@umass.edu>
Date: Tue, 22 Oct 2024 20:01:59 -0400
Subject: [PATCH] rename self.df to self.target_data_train, add training set
 model outputs as argument to fit method

---
 src/postpredict/dependence.py                 | 118 ++++++++++++------
 .../dependence/test_build_train_X_Y.py        |  12 +-
 .../test_schaake_build_templates.py           |   8 +-
 .../postpredict/dependence/test_transform.py  |  15 ++-
 4 files changed, 94 insertions(+), 59 deletions(-)

diff --git a/src/postpredict/dependence.py b/src/postpredict/dependence.py
index 799f7f5..84d685e 100644
--- a/src/postpredict/dependence.py
+++ b/src/postpredict/dependence.py
@@ -43,9 +43,6 @@ def _build_templates(self, wide_model_out):
 
 
     def transform(self, model_out: pl.DataFrame,
-                  reference_time_col: str = "reference_date",
-                  horizon_col: str = "horizon", pred_col: str = "value",
-                  idx_col: str = "output_type_id",
                   obs_mask: np.ndarray | None = None,
                   pit_templates: bool = False,
                   return_long_format: bool = True):
@@ -58,21 +55,12 @@ def transform(self, model_out: pl.DataFrame,
         model_out: pl.DataFrame
             polars dataframe with sample predictions that do not necessarily
             capture temporal dependence.
-        reference_time_col: str
-            name of column in model_out that records the reference time for
-            predictions
-        horizon_col: str
-            name of column in model_out that records the prediction horizon
-        pred_col: str
-            name of column in model_out with predicted values (samples)
-        idx_col: str
-            name of column in model_out with sample indices
         obs_mask: np.ndarray | None
             mask to use for observed data. The primary use case is to support
             cross-validation. If None, all observed data are used to form
             dependence templates. Otherwise, `obs_mask` should be a boolean
-            array of shape (self.df.shape[0], ). Rows of self.df where obs_mask
-            is True will be used, while rows of self.df where obs_mask is False
+            array of shape (self.target_data_train.shape[0], ). Rows of self.target_data_train where obs_mask
+            is True will be used, while rows of self.target_data_train where obs_mask is False
             will not be used.
         pit_templates: bool
             If False (default), templates are based on observed values. If True,
@@ -87,19 +75,31 @@ def transform(self, model_out: pl.DataFrame,
         they reflect the estimated temporal dependence structure.
         """
         # pivot model_out from long to wide format
-        wide_model_out = self._pivot_horizon(model_out, reference_time_col,
-                                             horizon_col, idx_col, pred_col)
-        min_horizon = model_out[horizon_col].min()
-        max_horizon = model_out[horizon_col].max()
+        wide_model_out = self._pivot_horizon(
+            model_out, self.reference_time_col,
+            self.horizon_col, self.idx_col, self.pred_col
+        )
+        min_horizon = model_out[self.horizon_col].min()
+        max_horizon = model_out[self.horizon_col].max()
         
-        # extract train_X and train_Y from observed data (self.df)
+        if self.model_out_train is not None:
+            wide_model_out_train = self._pivot_horizon(
+                self.model_out_train, self.reference_time_col,
+                self.horizon_col, self.idx_col, self.pred_col
+            )
+        else:
+            wide_model_out_train = None
+        
+        # extract train_X and train_Y from observed data (self.target_data_train)
+        # and/or past forecasts (wide_model_out_train)
         self._build_train_X_Y(min_horizon, max_horizon, obs_mask,
-                              wide_model_out, reference_time_col, pit_templates)
+                              wide_model_out_train, self.reference_time_col,
+                              pit_templates)
         
         # perform the transformation, one group at a time
         transformed_wide_model_out = (
             wide_model_out
-            .group_by(*(self.key_cols + [reference_time_col]))
+            .group_by(*(self.key_cols + [self.reference_time_col]))
             .map_groups(self._transform_one_group)
         )
         
@@ -107,21 +107,21 @@ def transform(self, model_out: pl.DataFrame,
             return transformed_wide_model_out
         
         # unpivot back to long format
-        pivot_index = [c for c in model_out.columns if c not in [horizon_col, pred_col]]
+        pivot_index = [c for c in model_out.columns if c not in [self.horizon_col, self.pred_col]]
         transformed_model_out = (
             transformed_wide_model_out
             .unpivot(
                 index = pivot_index,
                 on = self.wide_horizon_cols,
-                variable_name = horizon_col,
-                value_name = pred_col
+                variable_name = self.horizon_col,
+                value_name = self.pred_col
             )
             .with_columns(
                 # convert horizon columns back to original values and data type
                 # this is inverting an operation that was done in _pivot_horizon just before the pivot
-                pl.col(horizon_col)
-                .str.slice(len("postpredict_") + len(horizon_col), None) # keep everything after f"postpredict_{horizon_col}" prefix
-                .cast(model_out[horizon_col].dtype)
+                pl.col(self.horizon_col)
+                .str.slice(len("postpredict_") + len(self.horizon_col), None) # keep everything after f"postpredict_{horizon_col}" prefix
+                .cast(model_out[self.horizon_col].dtype)
             )
         )
         
@@ -206,8 +206,8 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,
             mask to use for observed data. The primary use case is to support
             cross-validation. If None, all observed data are used to form
             dependence templates. Otherwise, `obs_mask` should be a boolean
-            array of shape (self.df.shape[0], ). Rows of self.df where obs_mask
-            is True will be used, while rows of self.df where obs_mask is False
+            array of shape (self.target_data_train.shape[0], ). Rows of self.target_data_train where obs_mask
+            is True will be used, while rows of self.target_data_train where obs_mask is False
             will not be used.
         wide_model_out: pl.DataFrame
             polars dataframe with sample predictions that do not necessarily
@@ -227,9 +227,9 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,
         Notes
         -----
         This method sets self.shift_varnames, self.train_X, and self.train_Y,
-        and it updates self.df to have new columns.
+        and it updates self.target_data_train to have new columns.
         
-        It expects the object to have the properties self.df, self.key_cols,
+        It expects the object to have the properties self.target_data_train, self.key_cols,
         self.time_col, self.obs_col, and self.feat_cols set already.
         """
         self.shift_varnames = []
@@ -241,7 +241,7 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,
             
             if shift_varname not in self.shift_varnames:
                 self.shift_varnames.append(shift_varname)
-                self.df = self.df.with_columns(
+                self.target_data_train = self.target_data_train.with_columns(
                     pl.col(self.obs_col)
                     .shift(-h)
                     .over(self.key_cols, order_by=self.time_col)
@@ -250,7 +250,9 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,
         
         if obs_mask is None:
             obs_mask = True
-        df_mask_and_dropnull = self.df.filter(obs_mask).drop_nulls()
+        df_mask_and_dropnull = self.target_data_train.filter(obs_mask).drop_nulls()
+        print("build train X_Y, df mask and dropnull")
+        print(df_mask_and_dropnull)
 
         if pit_templates:
             pit_values = (
@@ -267,6 +269,8 @@ def _build_train_X_Y(self, min_horizon: int, max_horizon: int,
                     how="left"
                 )
             )
+            print("pit values, in _build_train_X_Y")
+            print(pit_values)
             train_X_Y_source = pit_values
             train_Y_cols = [f"pit_{pred_c}" for pred_c in self.wide_horizon_cols]
         else:
@@ -370,7 +374,17 @@ def __init__(self, weighter=weighters.EqualWeighter(),
         super().__init__(rng)
 
 
-    def fit(self, df, key_cols=None, time_col="date", obs_col="value", feat_cols=["date"]):
+    def fit(self,
+            target_data_train: pl.DataFrame,
+            model_out_train: pl.DataFrame,
+            key_cols: list[str] | None = None,
+            time_col: str = "date",
+            obs_col: str = "value",
+            reference_time_col: str = "reference_date",
+            horizon_col: str = "horizon",
+            pred_col: str = "value",
+            idx_col: str = "output_type_id",
+            feat_cols: list[str] = ["date"]) -> None:
         """
         Fit a Schaake shuffle model for temporal dependence across prediction
         horizons. In practice this just involves saving the input arguments for
@@ -378,21 +392,43 @@ def fit(self, df, key_cols=None, time_col="date", obs_col="value", feat_cols=["d
         
         Parameters
         ----------
-        df: polars dataframe with training set observations.
-        key_cols: names of columns in `df` used to identify observational units,
-        e.g. location or age group.
-        time_col: name of column in `df` that contains the time index.
-        obs_col: name of column in `df` that contains observed values.
-        feat_cols: names of columns in `df` with features
+        target_data_train: pl.DataFrame
+            training set observations of target data.
+        model_out_train: pl.DataFrame
+            training set predictions
+        key_cols: list[str] | None
+            names of columns in `target_data_train` and `model_out_train` used
+            to identify observational units, e.g. location or age group.
+        time_col: str
+            name of column in `target_data_train` that contains the time index.
+        obs_col: str
+            name of column in `target_data_train` that contains observed values.
+        reference_time_col: str
+            name of column in `model_out_train` that contains the reference time
+            for model predictions
+        horizon_col: str
+            name of column in `model_out_train` that contains the prediction
+            horizon relative to the reference time
+        pred_col: str
+            name of column in `model_out_train` with predicted values (samples)
+        idx_col: str
+            name of column in `model_out_train` with sample indices
+        feat_cols: list[str]
+            names of columns in `target_data_train` and `model_out_train` with features
         
         Returns
         -------
         None
         """
-        self.df = df
+        self.target_data_train = target_data_train
+        self.model_out_train = model_out_train
         self.key_cols = key_cols
         self.time_col = time_col
         self.obs_col = obs_col
+        self.reference_time_col = reference_time_col
+        self.horizon_col = horizon_col
+        self.pred_col = pred_col
+        self.idx_col = idx_col
         self.feat_cols = feat_cols
     
     
diff --git a/tests/postpredict/dependence/test_build_train_X_Y.py b/tests/postpredict/dependence/test_build_train_X_Y.py
index 88b82e8..27ef03c 100644
--- a/tests/postpredict/dependence/test_build_train_X_Y.py
+++ b/tests/postpredict/dependence/test_build_train_X_Y.py
@@ -15,7 +15,7 @@ def test_build_train_X_Y_positive_horizons(obs_data, monkeypatch):
     # See https://stackoverflow.com/a/77748100
     monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
     tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
-    tdp.df = obs_data
+    tdp.target_data_train = obs_data
     tdp.key_cols = ["location", "age_group"]
     tdp.time_col = "date",
     tdp.obs_col = "value"
@@ -47,7 +47,7 @@ def test_build_train_X_Y_nonnegative_horizons(obs_data, monkeypatch):
     # See https://stackoverflow.com/a/77748100
     monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
     tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
-    tdp.df = obs_data
+    tdp.target_data_train = obs_data
     tdp.key_cols = ["location", "age_group"]
     tdp.time_col = "date",
     tdp.obs_col = "value"
@@ -80,7 +80,7 @@ def test_build_train_X_Y_negative_horizons(obs_data, monkeypatch):
     # See https://stackoverflow.com/a/77748100
     monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
     tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
-    tdp.df = obs_data
+    tdp.target_data_train = obs_data
     tdp.key_cols = ["location", "age_group"]
     tdp.time_col = "date",
     tdp.obs_col = "value"
@@ -113,7 +113,7 @@ def test_build_train_X_Y_mask(obs_data, monkeypatch):
     # See https://stackoverflow.com/a/77748100
     monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
     tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
-    tdp.df = obs_data
+    tdp.target_data_train = obs_data
     tdp.key_cols = ["location", "age_group"]
     tdp.time_col = "date",
     tdp.obs_col = "value"
@@ -147,7 +147,7 @@ def test_build_train_X_Y_pit_templates(obs_data, wide_model_out, monkeypatch):
     # See https://stackoverflow.com/a/77748100
     monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
     tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
-    tdp.df = obs_data
+    tdp.target_data_train = obs_data
     tdp.key_cols = ["location", "age_group"]
     tdp.time_col = "date",
     tdp.obs_col = "value"
@@ -193,7 +193,7 @@ def test_build_train_X_Y_pit_templates_mask(obs_data, wide_model_out, monkeypatc
     # See https://stackoverflow.com/a/77748100
     monkeypatch.setattr(TimeDependencePostprocessor, "__abstractmethods__", set())
     tdp = TimeDependencePostprocessor(rng = np.random.default_rng(42))
-    tdp.df = obs_data
+    tdp.target_data_train = obs_data
     tdp.key_cols = ["location", "age_group"]
     tdp.time_col = "date",
     tdp.obs_col = "value"
diff --git a/tests/postpredict/dependence/test_schaake_build_templates.py b/tests/postpredict/dependence/test_schaake_build_templates.py
index aef8acc..89ebd27 100644
--- a/tests/postpredict/dependence/test_schaake_build_templates.py
+++ b/tests/postpredict/dependence/test_schaake_build_templates.py
@@ -9,7 +9,7 @@
 
 def test_schaake_build_templates_equal_weights(obs_data, wide_model_out):
     ss = Schaake()
-    ss.df = obs_data
+    ss.target_data_train = obs_data
     ss.key_cols = ["location", "age_group"]
     ss.time_col = "date",
     ss.obs_col = "value"
@@ -50,7 +50,7 @@ def get_weights(self, train_X, test_X):
             return weights
     
     ss = Schaake(weighter = PopSizeWeighter())
-    ss.df = obs_data
+    ss.target_data_train = obs_data
     ss.key_cols = ["location", "age_group"]
     ss.time_col = "date",
     ss.obs_col = "value"
@@ -109,7 +109,7 @@ def get_weights(self, train_X, test_X):
 
 def test_schaake_build_templates_reproducible(obs_data, wide_model_out):
     ss = Schaake(rng = np.random.default_rng(42))
-    ss.df = obs_data
+    ss.target_data_train = obs_data
     ss.key_cols = ["location", "age_group"]
     ss.time_col = "date",
     ss.obs_col = "value"
@@ -119,7 +119,7 @@ def test_schaake_build_templates_reproducible(obs_data, wide_model_out):
     templates_1 = ss._build_templates(pl.concat([wide_model_out] * n_times))
 
     ss = Schaake(rng = np.random.default_rng(42))
-    ss.df = obs_data
+    ss.target_data_train = obs_data
     ss.key_cols = ["location", "age_group"]
     ss.time_col = "date",
     ss.obs_col = "value"
diff --git a/tests/postpredict/dependence/test_transform.py b/tests/postpredict/dependence/test_transform.py
index 29e5896..dabacb2 100644
--- a/tests/postpredict/dependence/test_transform.py
+++ b/tests/postpredict/dependence/test_transform.py
@@ -25,20 +25,19 @@ def _build_templates(self, wide_model_out):
             return pl.DataFrame(templates)
     
     tdp = TestPostprocessor(rng = np.random.default_rng(42))
-    tdp.df = obs_data
+    tdp.target_data_train = obs_data
+    tdp.model_out_train = None
     tdp.key_cols = ["location", "age_group"]
     tdp.time_col = "date",
     tdp.obs_col = "value"
     tdp.feat_cols = ["location", "age_group", "population"]
+    tdp.reference_time_col="reference_date"
+    tdp.horizon_col="horizon"
+    tdp.pred_col="value"
+    tdp.idx_col="output_type_id"
     
     # perform the transform operation
-    actual_final = tdp.transform(
-        model_out=long_model_out,
-        reference_time_col="reference_date",
-        horizon_col="horizon",
-        pred_col="value",
-        idx_col="output_type_id"
-    )
+    actual_final = tdp.transform(model_out=long_model_out)
     
     # Challenge: tdp.transform performs the transformation on the
     # groups defined by combinations of location and age_group in a random