
Updating residuals #27

Open · wants to merge 6 commits into base: develop
79 changes: 76 additions & 3 deletions src/elexsolver/LinearSolver.py
@@ -29,21 +29,32 @@ class LinearSolver(ABC):

def __init__(self):
self.coefficients = None
self.rng = np.random.default_rng(seed=0)

@classmethod
def fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, lambda_: float = 0.0, *kwargs):
def fit(
self,
x: np.ndarray,
y: np.ndarray,
weights: np.ndarray | None = None,
lambda_: float = 0.0,
cache: bool = True,
Contributor:

Can you help me understand how the cache is used? It seems like there are cases where we want to (not) keep track of certain things computed during the k-fold residuals process, but I'm not sure I understand why or why not. Plus, this looks like it's only used in the subclasses, so I'd suggest we either (a) remove it from here, or (b) add a method in this super-class, something like def cache_things() (bad method name), that holds this logic and can be shared by all subclasses 🤔
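
If it helps the discussion, here is a minimal sketch of what option (b) could look like, assuming each subclass passes in the attributes it wants stored; the name _cache_fit_results and its keyword-argument interface are hypothetical, not part of this PR:

def _cache_fit_results(self, cache: bool, **fitted) -> None:
    # hypothetical helper: only store fitted quantities on the instance when cache=True,
    # so every subclass shares the same caching logic
    if not cache:
        return
    for name, value in fitted.items():
        setattr(self, name, value)

A subclass fit() would then end with something like self._cache_fit_results(cache, normal_eqs=normal_eqs, hat_vals=hat_vals, coefficients=coefficients) and still return coefficients either way.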

**kwargs,
):
"""
Fits model
"""
raise NotImplementedError

def predict(self, x: np.ndarray) -> np.ndarray:
def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray:
Contributor:

How would you feel about leaving this (public method) like:

def predict(self, x: np.ndarray) -> np.ndarray:
    return self._predict(x, self.coefficients)

and adding:

def _predict(self, x: np.ndarray, coefficients: np.ndarray) -> np.ndarray:
    return x @ coefficients

and then in residuals() you call self._predict(x_test, coefficients_k)? The way it's written now invites users to pass in any arbitrary coefficients, which might not be a good idea 🤔

"""
Use coefficients to predict
"""
self._check_any_element_nan_or_inf(x)

return x @ self.coefficients
if coefficients is None:
return x @ self.coefficients
return x @ coefficients

def get_coefficients(self) -> np.ndarray:
"""
@@ -77,3 +88,65 @@ def _check_intercept(self, x):
"""
if ~np.all(x[:, 0] == 1):
warnings.warn("Warning: fit_intercept=True and not all elements of the first columns are 1s")

def residuals(
self,
x: np.ndarray,
y: np.ndarray,
weights: np.ndarray | None = None,
K: int | None = None,
center: bool = True,
**kwargs,
) -> np.ndarray:
"""
Computes residuals for the model
"""
if K is None:
# when K is None we are just using the training residuals
y_hat = self.predict(x).reshape(*y.shape)
residuals = y - y_hat
else:
if weights is None:
weights = np.ones_like(y)

# create indices for k-fold cross-fitting
indices = np.arange(x.shape[0])
# shuffle for random order of datapoints
self.rng.shuffle(indices)
x_shuffled, y_shuffled, weights_shuffled = x[indices], y[indices], weights[indices]

# create folds
x_folds = np.array_split(x_shuffled, K)
y_folds = np.array_split(y_shuffled, K)
weights_folds = np.array_split(weights_shuffled, K)

residuals = []
for k in range(K):
# extract test points
x_test, y_test, _, = (
x_folds[k],
y_folds[k],
weights_folds[k],
)

# extract training points
x_train = np.concatenate([x_folds[j] for j in range(K) if j != k])
y_train = np.concatenate([y_folds[j] for j in range(K) if j != k])
weights_train = np.concatenate([weights_folds[j] for j in range(K) if j != k])

# fit k-th model
coefficients_k = self.fit(x_train, y_train, weights=weights_train, cache=False, **kwargs)
y_hat_k = self.predict(x_test, coefficients=coefficients_k)

# k-th residuals
residuals_k = y_test - y_hat_k
residuals.append(residuals_k)

residuals = np.concatenate(residuals)
# undo shuffling of residuals, to put them in the original dataset order
residuals = residuals[np.argsort(indices)]

if center:
residuals -= np.mean(residuals, axis=0)

return residuals
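
For reviewers, a small usage sketch of the new method, mirroring the new test in tests/test_linear_solver.py (the data here is made up):

import numpy as np
from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 5))
y = x @ rng.normal(size=(5, 1))

reg = QuantileRegressionSolver()
reg.fit(x, y, fit_intercept=False)

# K=None -> plain training residuals; K=10 -> 10-fold cross-fitted residuals,
# returned in the original row order because the shuffle is undone via argsort(indices)
train_resid = reg.residuals(x, y, K=None, center=False)
cv_resid = reg.residuals(x, y, K=10, center=False)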
51 changes: 32 additions & 19 deletions src/elexsolver/OLSRegressionSolver.py
@@ -92,6 +92,7 @@ def fit(
fit_intercept: bool = True,
regularize_intercept: bool = False,
n_feat_ignore_reg: int = 0,
cache: bool = True,
):
self._check_any_element_nan_or_inf(x)
self._check_any_element_nan_or_inf(y)
@@ -111,33 +112,45 @@ def fit(
# in the bootstrap setting we can now pass in the normal equations and can
# save time re-computing them
if normal_eqs is None:
self.normal_eqs = self._compute_normal_equations(
normal_eqs = self._compute_normal_equations(
x, L, lambda_, fit_intercept, regularize_intercept, n_feat_ignore_reg
)
else:
self.normal_eqs = normal_eqs

# compute hat matrix: X (X^T X)^{-1} X^T
self.hat_vals = np.diag(x @ self.normal_eqs @ L)

hat_vals = np.diag(x @ normal_eqs @ L)
# compute coefficients: (X^T X)^{-1} X^T y
self.coefficients = self.normal_eqs @ L @ y
coefficients = normal_eqs @ L @ y

def residuals(self, y: np.ndarray, y_hat: np.ndarray, loo: bool = True, center: bool = True) -> np.ndarray:
"""
Computes residuals for the model
"""
# compute standard residuals
residuals = y - y_hat
if cache:
self.normal_eqs = normal_eqs
self.hat_vals = hat_vals
self.coefficients = coefficients

return coefficients

# if leave one out is True, inflate by (1 - P)
# in OLS setting inflating by (1 - P) is the same as computing the leave one out residuals
# the un-inflated training residuals are too small, since training covariates were observed during fitting
if loo:
def residuals(
self,
x: np.ndarray,
y: np.ndarray,
weights: np.ndarray | None = None,
K: int | None = None,
center: bool = True,
**kwargs
) -> np.ndarray:
if K == x.shape[0]:
Contributor:

Is it possible that any other subclasses would benefit from this logic? 🤔

# compute standard residuals
y_hat = self.predict(x)
residuals = y - y_hat

# inflate by (1 - P)
# in OLS setting inflating by (1 - P) is the same as computing the leave one out residuals
# the un-inflated training residuals are too small, since training covariates were observed during fitting
residuals /= (1 - self.hat_vals).reshape(-1, 1)

# centering removes the column mean
if center:
residuals -= np.mean(residuals, axis=0)
# centering removes the column mean
if center:
residuals -= np.mean(residuals, axis=0)
else:
residuals = super().residuals(x, y, weights, K, center, **kwargs)

return residuals
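
For anyone reviewing the K == x.shape[0] branch: it relies on the standard OLS identity that the leave-one-out residual equals the training residual divided by (1 - h_ii), where h_ii is the i-th hat value, so no refitting is needed. A quick self-contained NumPy check of that identity (independent of this class):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(20, 3))
y = x @ rng.normal(size=3) + rng.normal(scale=0.1, size=20)

hat = x @ np.linalg.solve(x.T @ x, x.T)      # hat matrix X (X^T X)^{-1} X^T
resid = y - hat @ y                          # training residuals
loo_shortcut = resid / (1 - np.diag(hat))    # inflate by (1 - h_ii)

# brute-force leave-one-out residuals for comparison
loo_direct = np.empty_like(y)
for i in range(len(y)):
    mask = np.arange(len(y)) != i
    beta_i = np.linalg.lstsq(x[mask], y[mask], rcond=None)[0]
    loo_direct[i] = y[i] - x[i] @ beta_i

assert np.allclose(loo_shortcut, loo_direct)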
41 changes: 30 additions & 11 deletions src/elexsolver/QuantileRegressionSolver.py
@@ -31,6 +31,7 @@ def _fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray, tau: float) ->
# b_eq is the equality constraint vector (ie. A_eq @ x = b_eq)
# bounds are the (min, max) possible values of every element of x
res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds, method="highs", options={"presolve": False})

# marginal are the dual values, since we are solving the dual this is equivalent to the primal
return -1 * res.eqlin.marginals

@@ -84,6 +85,7 @@ def fit(
regularize_intercept: bool = False,
n_feat_ignore_reg: int = 0,
normalize_weights: bool = True,
cache: bool = True,
):
"""
Fits quantile regression
@@ -105,23 +107,40 @@ def fit(
raise ZeroDivisionError
weights = weights / weights_sum

if y.ndim == 1: # code expects 2-dim array
y = y.reshape(-1, 1)

# _fit assumes that taus is list, so if we want to do one value of tau then turn into a list
if isinstance(taus, float):
taus = [taus]

else:
assert y.shape[1] == 1 # you can either have multiple taus or multiple ys
coefficients_array = []
for tau in taus:
if lambda_ > 0:
coefficients = self._fit_with_regularization(
x, y, weights, tau, lambda_, regularize_intercept, n_feat_ignore_reg
)
else:
coefficients = self._fit(x, y, weights, tau)
self.coefficients.append(coefficients)

def predict(self, x: np.ndarray) -> np.ndarray:
for y_arr in y.T:
if lambda_ > 0:
coefficients = self._fit_with_regularization(
x, y_arr, weights, tau, lambda_, regularize_intercept, n_feat_ignore_reg
)
else:
coefficients = self._fit(x, y_arr, weights, tau)
coefficients_array.append(coefficients)

coefficients_array = np.asarray(coefficients_array).T
if cache:
self.coefficients = coefficients_array

return coefficients_array

def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray:
"""
Use coefficients to predict
"""
self._check_any_element_nan_or_inf(x)

return self.coefficients @ x.T
if coefficients is None:
preds = x @ self.coefficients
else:
preds = x @ coefficients

return preds
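
A small usage sketch of the new shape convention in fit()/predict() as I read this diff (made-up data; the shapes in the comments are my understanding and worth double-checking):

import numpy as np
from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 3))
y = x @ rng.normal(size=3) + rng.normal(size=200)

qr = QuantileRegressionSolver()
coefs = qr.fit(x, y, taus=[0.1, 0.5, 0.9], fit_intercept=False)

print(coefs.shape)          # (3, 3): one column of coefficients per tau
print(qr.predict(x).shape)  # (200, 3): x @ coefficients, one prediction column per tau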
7 changes: 7 additions & 0 deletions tests/conftest.py
@@ -3,6 +3,7 @@
import os
import sys

import numpy as np
Contributor:

I know you said you're struggling to add unit tests for all these changes and I'm curious what you think is missing. These look good to me although I'll keep thinking about it 🤔 🎉

import pandas as pd
import pytest

@@ -40,3 +41,9 @@ def random_data_no_weights(get_fixture):
@pytest.fixture(scope="session")
def random_data_weights(get_fixture):
return get_fixture("random_data_n100_p5_12549_weights.csv")


@pytest.fixture(scope="session")
def rng():
seed = 8232
return np.random.default_rng(seed=seed)
18 changes: 18 additions & 0 deletions tests/test_linear_solver.py
@@ -2,9 +2,27 @@
import pytest

from elexsolver.LinearSolver import LinearSolver
from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver


def test_fit():
solver = LinearSolver()
with pytest.raises(NotImplementedError):
solver.fit(np.ndarray((5, 3)), np.ndarray((1, 3)))


##################
# Test residuals #
##################
def test_residuals_without_weights(rng):
x = rng.normal(size=(100, 5))
beta = rng.normal(size=(5, 1))
y = x @ beta

# we need a subclass of LinearSolver to actually run a fit
reg = QuantileRegressionSolver()
reg.fit(x, y, fit_intercept=False)
reg.predict(x)

reg.residuals(x, y, K=None, center=False)
reg.residuals(x, y, K=10, center=False)
34 changes: 25 additions & 9 deletions tests/test_ols.py
@@ -31,6 +31,22 @@ def test_basic2():
assert all(np.abs(preds - [6.666667, 6.666667, 6.666667, 15]) <= TOL)


def test_cache():
lm = OLSRegressionSolver()
x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]])
y = np.asarray([3, 8, 9, 15])
lm.fit(x, y, fit_intercept=True, cache=False)

assert lm.normal_eqs is None
assert lm.hat_vals is None

lm.fit(x, y, fit_intercept=True, cache=True)

assert lm.normal_eqs is not None
assert lm.hat_vals is not None
assert lm.coefficients is not None


######################
# Intermediate tests #
######################
@@ -209,19 +225,19 @@ def test_residuals_no_weights(random_data_no_weights):
x = random_data_no_weights[["x0", "x1", "x2", "x3", "x4"]].values
y = random_data_no_weights["y"].values.reshape(-1, 1)
lm.fit(x, y, fit_intercept=False)
predictions = lm.predict(x)

residuals = lm.residuals(y, predictions, loo=False, center=False)
lm.predict(x)

residuals = lm.residuals(x, y, K=None, center=False)
assert residuals[0] == pytest.approx(0.885973530)
assert residuals[-1] == pytest.approx(0.841996302)

residuals = lm.residuals(y, predictions, loo=True, center=False)
# equivalent of leave-one-out residuals
residuals = lm.residuals(x, y, K=100, center=False)

assert residuals[0] == pytest.approx(0.920112164)
assert residuals[-1] == pytest.approx(0.875896477)

residuals = lm.residuals(y, predictions, loo=True, center=True)
residuals = lm.residuals(x, y, K=100, center=True)
assert np.sum(residuals) == pytest.approx(0)


@@ -232,19 +248,19 @@ def test_residuals_weights(random_data_weights):
weights = random_data_weights["weights"].values

lm.fit(x, y, weights=weights, fit_intercept=False)
predictions = lm.predict(x)
lm.predict(x)

residuals = lm.residuals(y, predictions, loo=False, center=False)
residuals = lm.residuals(x, y, weights=weights, K=None, center=False)

assert residuals[0] == pytest.approx(-1.971798590)
assert residuals[-1] == pytest.approx(-1.373951578)

residuals = lm.residuals(y, predictions, loo=True, center=False)
residuals = lm.residuals(x, y, weights=weights, K=100, center=False)

assert residuals[0] == pytest.approx(-1.999718445)
assert residuals[-1] == pytest.approx(-1.438563033)

residuals = lm.residuals(y, predictions, loo=True, center=True)
residuals = lm.residuals(x, y, weights=weights, K=100, center=True)
assert np.sum(residuals) == pytest.approx(0)

