diff --git a/src/elexsolver/LinearSolver.py b/src/elexsolver/LinearSolver.py
index 949cb5b..fe39321 100644
--- a/src/elexsolver/LinearSolver.py
+++ b/src/elexsolver/LinearSolver.py
@@ -29,21 +29,32 @@ class LinearSolver(ABC):
     def __init__(self):
         self.coefficients = None
+        self.rng = np.random.default_rng(seed=0)
 
     @classmethod
-    def fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, lambda_: float = 0.0, *kwargs):
+    def fit(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        weights: np.ndarray | None = None,
+        lambda_: float = 0.0,
+        cache: bool = True,
+        **kwargs,
+    ):
         """
         Fits model
         """
         raise NotImplementedError
 
-    def predict(self, x: np.ndarray) -> np.ndarray:
+    def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray:
         """
         Use coefficients to predict
         """
         self._check_any_element_nan_or_inf(x)
 
-        return x @ self.coefficients
+        if coefficients is None:
+            return x @ self.coefficients
+        return x @ coefficients
 
     def get_coefficients(self) -> np.ndarray:
         """
@@ -77,3 +88,65 @@ def _check_intercept(self, x):
         """
         if ~np.all(x[:, 0] == 1):
             warnings.warn("Warning: fit_intercept=True and not all elements of the first columns are 1s")
+
+    def residuals(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        weights: np.ndarray | None = None,
+        K: int | None = None,
+        center: bool = True,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Computes residuals for the model: in-sample training residuals when K is None,
+        out-of-sample K-fold cross-fitted residuals otherwise
+        """
+        if K is None:
+            # when K is None we are just using the training residuals
+            y_hat = self.predict(x).reshape(*y.shape)
+            residuals = y - y_hat
+        else:
+            if weights is None:
+                weights = np.ones_like(y)
+
+            # create indices for k-fold cross-fitting
+            indices = np.arange(x.shape[0])
+            # shuffle for random order of datapoints
+            self.rng.shuffle(indices)
+            x_shuffled, y_shuffled, weights_shuffled = x[indices], y[indices], weights[indices]
+
+            # create folds
+            x_folds = np.array_split(x_shuffled, K)
+            y_folds = np.array_split(y_shuffled, K)
+            weights_folds = np.array_split(weights_shuffled, K)
+
+            residuals = []
+            for k in range(K):
+                # extract test points
+                x_test, y_test, _ = (
+                    x_folds[k],
+                    y_folds[k],
+                    weights_folds[k],
+                )
+
+                # extract training points
+                x_train = np.concatenate([x_folds[j] for j in range(K) if j != k])
+                y_train = np.concatenate([y_folds[j] for j in range(K) if j != k])
+                weights_train = np.concatenate([weights_folds[j] for j in range(K) if j != k])
+
+                # fit k-th model
+                coefficients_k = self.fit(x_train, y_train, weights=weights_train, cache=False, **kwargs)
+                y_hat_k = self.predict(x_test, coefficients=coefficients_k)
+
+                # k-th residuals
+                residuals_k = y_test - y_hat_k
+                residuals.append(residuals_k)
+
+            residuals = np.concatenate(residuals)
+            # undo shuffling of residuals, to put them in the original dataset order
+            residuals = residuals[np.argsort(indices)]
+
+        if center:
+            residuals -= np.mean(residuals, axis=0)
+
+        return residuals
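The cross-fitting loop above is easiest to see end to end. A minimal sketch of the new API (synthetic data and seed are illustrative; it mirrors the new `test_residuals_without_weights` test further down): with `K=None` the residuals are in-sample, while with `K=10` every residual comes from a fold model that never saw that row, and the `cache=False` refits inside `residuals` leave the solver's cached coefficients untouched.

```python
import numpy as np

from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver

# illustrative synthetic data; LinearSolver itself raises NotImplementedError
# on fit, so we exercise the base-class residuals through a concrete subclass
rng = np.random.default_rng(seed=0)
x = rng.normal(size=(100, 5))
y = x @ rng.normal(size=(5, 1))

reg = QuantileRegressionSolver()
reg.fit(x, y, fit_intercept=False)

# K=None: in-sample training residuals
resid_train = reg.residuals(x, y, K=None, center=False)

# K=10: each residual comes from a fold model that never saw that row;
# extra kwargs are forwarded to the internal cache=False refits
resid_cv = reg.residuals(x, y, K=10, center=False, fit_intercept=False)

assert resid_train.shape == resid_cv.shape == (100, 1)
```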
diff --git a/src/elexsolver/OLSRegressionSolver.py b/src/elexsolver/OLSRegressionSolver.py
index ca05535..68d6820 100644
--- a/src/elexsolver/OLSRegressionSolver.py
+++ b/src/elexsolver/OLSRegressionSolver.py
@@ -92,6 +92,7 @@ def fit(
         fit_intercept: bool = True,
         regularize_intercept: bool = False,
         n_feat_ignore_reg: int = 0,
+        cache: bool = True,
     ):
         self._check_any_element_nan_or_inf(x)
         self._check_any_element_nan_or_inf(y)
@@ -111,33 +112,45 @@ def fit(
         # in the bootstrap setting we can now pass in the normal equations and can
         # save time re-computing them
         if normal_eqs is None:
-            self.normal_eqs = self._compute_normal_equations(
+            normal_eqs = self._compute_normal_equations(
                 x, L, lambda_, fit_intercept, regularize_intercept, n_feat_ignore_reg
             )
-        else:
-            self.normal_eqs = normal_eqs
 
         # compute hat matrix: X (X^T X)^{-1} X^T
-        self.hat_vals = np.diag(x @ self.normal_eqs @ L)
-
+        hat_vals = np.diag(x @ normal_eqs @ L)
         # compute coefficients: (X^T X)^{-1} X^T y
-        self.coefficients = self.normal_eqs @ L @ y
+        coefficients = normal_eqs @ L @ y
 
-    def residuals(self, y: np.ndarray, y_hat: np.ndarray, loo: bool = True, center: bool = True) -> np.ndarray:
-        """
-        Computes residuals for the model
-        """
-        # compute standard residuals
-        residuals = y - y_hat
+        if cache:
+            self.normal_eqs = normal_eqs
+            self.hat_vals = hat_vals
+            self.coefficients = coefficients
+
+        return coefficients
 
-        # if leave one out is True, inflate by (1 - P)
-        # in OLS setting inflating by (1 - P) is the same as computing the leave one out residuals
-        # the un-inflated training residuals are too small, since training covariates were observed during fitting
-        if loo:
+    def residuals(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        weights: np.ndarray | None = None,
+        K: int | None = None,
+        center: bool = True,
+        **kwargs
+    ) -> np.ndarray:
+        """
+        Computes residuals; when K equals the number of rows, the leave-one-out
+        residuals are available in closed form via the hat values, with no refitting
+        """
+        if K == x.shape[0]:
+            # compute standard (training) residuals
+            y_hat = self.predict(x)
+            residuals = y - y_hat
+
+            # inflate by 1 / (1 - P)
+            # in the OLS setting, inflating by 1 / (1 - P) is the same as computing the leave-one-out residuals
+            # the un-inflated training residuals are too small, since training covariates were observed during fitting
             residuals /= (1 - self.hat_vals).reshape(-1, 1)
 
-        # centering removes the column mean
-        if center:
-            residuals -= np.mean(residuals, axis=0)
+            # centering removes the column mean
+            if center:
+                residuals -= np.mean(residuals, axis=0)
+        else:
+            residuals = super().residuals(x, y, weights, K, center, **kwargs)
 
         return residuals
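The `K == x.shape[0]` branch above relies on the standard leave-one-out identity for OLS: dividing the training residual by `1 - h_ii` gives exactly the residual from a model refit without row `i`. A self-contained plain-numpy check of that identity (a sanity sketch under unweighted, unregularized OLS, not the library API):

```python
import numpy as np

# sanity check: e_i / (1 - h_ii) equals the true leave-one-out residual
rng = np.random.default_rng(seed=0)
n, p = 20, 3
x = rng.normal(size=(n, p))
y = x @ rng.normal(size=p) + rng.normal(size=n)

# fit once, then apply the hat-value shortcut
beta = np.linalg.lstsq(x, y, rcond=None)[0]
hat_vals = np.diag(x @ np.linalg.inv(x.T @ x) @ x.T)
loo_shortcut = (y - x @ beta) / (1 - hat_vals)

# refit n times, dropping one row each time
loo_explicit = np.empty(n)
for i in range(n):
    mask = np.arange(n) != i
    beta_minus_i = np.linalg.lstsq(x[mask], y[mask], rcond=None)[0]
    loo_explicit[i] = y[i] - x[i] @ beta_minus_i

assert np.allclose(loo_shortcut, loo_explicit)
```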
diff --git a/src/elexsolver/QuantileRegressionSolver.py b/src/elexsolver/QuantileRegressionSolver.py
index 6654607..ce4476e 100644
--- a/src/elexsolver/QuantileRegressionSolver.py
+++ b/src/elexsolver/QuantileRegressionSolver.py
@@ -31,6 +31,7 @@ def _fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray, tau: float) ->
         # b_eq is the equality constraint vector (ie. A_eq @ x = b_eq)
         # bounds are the (min, max) possible values of every element of x
         res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds, method="highs", options={"presolve": False})
+        # the marginals are the dual values; since we are solving the dual program, they recover the primal solution
         return -1 * res.eqlin.marginals
@@ -84,6 +85,7 @@ def fit(
         regularize_intercept: bool = False,
         n_feat_ignore_reg: int = 0,
         normalize_weights: bool = True,
+        cache: bool = True,
     ):
         """
         Fits quantile regression
@@ -105,23 +107,40 @@ def fit(
             raise ZeroDivisionError
         weights = weights / weights_sum
 
+        if y.ndim == 1:  # code expects 2-dim array
+            y = y.reshape(-1, 1)
+
         # _fit assumes that taus is list, so if we want to do one value of tau then turn into a list
         if isinstance(taus, float):
             taus = [taus]
-
+        else:
+            assert y.shape[1] == 1  # you can either have multiple taus or multiple ys
+        coefficients_array = []
         for tau in taus:
-            if lambda_ > 0:
-                coefficients = self._fit_with_regularization(
-                    x, y, weights, tau, lambda_, regularize_intercept, n_feat_ignore_reg
-                )
-            else:
-                coefficients = self._fit(x, y, weights, tau)
-            self.coefficients.append(coefficients)
-
-    def predict(self, x: np.ndarray) -> np.ndarray:
+            for y_arr in y.T:
+                if lambda_ > 0:
+                    coefficients = self._fit_with_regularization(
+                        x, y_arr, weights, tau, lambda_, regularize_intercept, n_feat_ignore_reg
+                    )
+                else:
+                    coefficients = self._fit(x, y_arr, weights, tau)
+                coefficients_array.append(coefficients)
+
+        coefficients_array = np.asarray(coefficients_array).T
+        if cache:
+            self.coefficients = coefficients_array
+
+        return coefficients_array
+
+    def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray:
         """
         Use coefficients to predict
         """
         self._check_any_element_nan_or_inf(x)
 
-        return self.coefficients @ x.T
+        if coefficients is None:
+            preds = x @ self.coefficients
+        else:
+            preds = x @ coefficients
+
+        return preds
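With this change the coefficients are stacked column-wise, one column per tau, so `predict` now returns one column of quantile predictions per tau (the reshaped `test_multiple` below asserts the same layout). A short illustrative sketch of the resulting shapes:

```python
import numpy as np

from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver

# illustrative synthetic data
rng = np.random.default_rng(seed=0)
x = rng.normal(size=(100, 5))
y = x @ rng.normal(size=(5, 1))

quantreg = QuantileRegressionSolver()
quantreg.fit(x, y, [0.1, 0.5, 0.9], fit_intercept=False)

# one column of coefficients per tau ...
assert quantreg.coefficients.shape == (5, 3)

# ... and one column of quantile predictions per tau
assert quantreg.predict(x).shape == (100, 3)
```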
diff --git a/tests/conftest.py b/tests/conftest.py
index ca27468..7c81115 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -3,6 +3,7 @@
 import os
 import sys
 
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -40,3 +41,9 @@ def random_data_no_weights(get_fixture):
 @pytest.fixture(scope="session")
 def random_data_weights(get_fixture):
     return get_fixture("random_data_n100_p5_12549_weights.csv")
+
+
+@pytest.fixture(scope="session")
+def rng():
+    seed = 8232
+    return np.random.default_rng(seed=seed)
diff --git a/tests/test_linear_solver.py b/tests/test_linear_solver.py
index 98e1f16..f0994a3 100644
--- a/tests/test_linear_solver.py
+++ b/tests/test_linear_solver.py
@@ -2,9 +2,27 @@
 import pytest
 
 from elexsolver.LinearSolver import LinearSolver
+from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver
 
 
 def test_fit():
     solver = LinearSolver()
     with pytest.raises(NotImplementedError):
         solver.fit(np.ndarray((5, 3)), np.ndarray((1, 3)))
+
+
+##################
+# Test residuals #
+##################
+def test_residuals_without_weights(rng):
+    x = rng.normal(size=(100, 5))
+    beta = rng.normal(size=(5, 1))
+    y = x @ beta
+
+    # we need a subclass of LinearSolver to actually run a fit
+    reg = QuantileRegressionSolver()
+    reg.fit(x, y, fit_intercept=False)
+    reg.predict(x)
+
+    reg.residuals(x, y, K=None, center=False)
+    reg.residuals(x, y, K=10, center=False)
diff --git a/tests/test_ols.py b/tests/test_ols.py
index 935b2e6..66adf62 100644
--- a/tests/test_ols.py
+++ b/tests/test_ols.py
@@ -31,6 +31,22 @@ def test_basic2():
     assert all(np.abs(preds - [6.666667, 6.666667, 6.666667, 15]) <= TOL)
 
 
+def test_cache():
+    lm = OLSRegressionSolver()
+    x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]])
+    y = np.asarray([3, 8, 9, 15])
+    lm.fit(x, y, fit_intercept=True, cache=False)
+
+    assert lm.normal_eqs is None
+    assert lm.hat_vals is None
+
+    lm.fit(x, y, fit_intercept=True, cache=True)
+
+    assert lm.normal_eqs is not None
+    assert lm.hat_vals is not None
+    assert lm.coefficients is not None
+
+
 ######################
 # Intermediate tests #
 ######################
@@ -209,19 +225,19 @@ def test_residuals_no_weights(random_data_no_weights):
     x = random_data_no_weights[["x0", "x1", "x2", "x3", "x4"]].values
     y = random_data_no_weights["y"].values.reshape(-1, 1)
     lm.fit(x, y, fit_intercept=False)
-    predictions = lm.predict(x)
-
-    residuals = lm.residuals(y, predictions, loo=False, center=False)
+    lm.predict(x)
+    residuals = lm.residuals(x, y, K=None, center=False)
 
     assert residuals[0] == pytest.approx(0.885973530)
     assert residuals[-1] == pytest.approx(0.841996302)
 
-    residuals = lm.residuals(y, predictions, loo=True, center=False)
+    # equivalent of leave-one-out residuals
+    residuals = lm.residuals(x, y, K=100, center=False)
 
     assert residuals[0] == pytest.approx(0.920112164)
     assert residuals[-1] == pytest.approx(0.875896477)
 
-    residuals = lm.residuals(y, predictions, loo=True, center=True)
+    residuals = lm.residuals(x, y, K=100, center=True)
     assert np.sum(residuals) == pytest.approx(0)
 
 
@@ -232,19 +248,19 @@ def test_residuals_weights(random_data_weights):
     x = random_data_weights[["x0", "x1", "x2", "x3", "x4"]].values
     y = random_data_weights["y"].values.reshape(-1, 1)
     weights = random_data_weights["weights"].values
     lm.fit(x, y, weights=weights, fit_intercept=False)
-    predictions = lm.predict(x)
+    lm.predict(x)
 
-    residuals = lm.residuals(y, predictions, loo=False, center=False)
+    residuals = lm.residuals(x, y, weights=weights, K=None, center=False)
 
     assert residuals[0] == pytest.approx(-1.971798590)
     assert residuals[-1] == pytest.approx(-1.373951578)
 
-    residuals = lm.residuals(y, predictions, loo=True, center=False)
+    residuals = lm.residuals(x, y, weights=weights, K=100, center=False)
 
     assert residuals[0] == pytest.approx(-1.999718445)
     assert residuals[-1] == pytest.approx(-1.438563033)
 
-    residuals = lm.residuals(y, predictions, loo=True, center=True)
+    residuals = lm.residuals(x, y, weights=weights, K=100, center=True)
     assert np.sum(residuals) == pytest.approx(0)
diff --git a/tests/test_quantile.py b/tests/test_quantile.py
index 792c6bd..c39e805 100644
--- a/tests/test_quantile.py
+++ b/tests/test_quantile.py
@@ -23,7 +23,7 @@ def test_basic_median_1():
     preds = quantreg.predict(x)
     # you'd think it would be 8 instead of 7.5, but run quantreg in R to confirm
     # has to do with missing intercept
-    np.testing.assert_array_equal(preds, [[7.5, 7.5, 7.5, 15]])
+    np.testing.assert_array_equal(preds, [[7.5], [7.5], [7.5], [15]])
 
 
 def test_basic_median_2():
@@ -33,7 +33,7 @@ def test_basic_median_2():
     y = np.asarray([3, 8, 9, 15])
     quantreg.fit(x, y, tau)
     preds = quantreg.predict(x)
-    np.testing.assert_array_equal(preds, [[8, 8, 8, 15]])
+    np.testing.assert_array_equal(preds, [[8], [8], [8], [15]])
 
 
 def test_basic_lower():
@@ -43,7 +43,7 @@ def test_basic_lower():
     y = np.asarray([3, 8, 9, 15])
     quantreg.fit(x, y, tau)
     preds = quantreg.predict(x)
-    np.testing.assert_array_equal(preds, [[3, 3, 3, 15]])
+    np.testing.assert_array_equal(preds, [[3], [3], [3], [15]])
 
 
 def test_basic_upper():
@@ -53,7 +53,20 @@ def test_basic_upper():
     y = np.asarray([3, 8, 9, 15])
     quantreg.fit(x, y, tau)
     preds = quantreg.predict(x)
-    np.testing.assert_array_equal(preds, [[9, 9, 9, 15]])
+    np.testing.assert_array_equal(preds, [[9], [9], [9], [15]])
+
+
+def test_cache():
+    quantreg = QuantileRegressionSolver()
+    tau = 0.9
+    x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]])
+    y = np.asarray([3, 8, 9, 15])
+    quantreg.fit(x, y, tau, cache=False)
+
+    assert quantreg.coefficients == []
+
+    quantreg.fit(x, y, tau, cache=True)
+    assert len(quantreg.coefficients) > 0
 
 
 ######################
@@ -68,7 +81,7 @@ def test_random_median(random_data_no_weights):
     y = random_data_no_weights["y"].values
     quantreg.fit(x, y, tau, fit_intercept=False)
     quantreg.predict(x)
-    np.testing.assert_allclose(quantreg.coefficients, [[1.57699, 6.74906, 4.40175, 4.85346, 4.51814]], rtol=TOL)
+    np.testing.assert_allclose(quantreg.coefficients, [[1.57699], [6.74906], [4.40175], [4.85346], [4.51814]], rtol=TOL)
 
 
 def test_random_lower(random_data_no_weights):
@@ -78,7 +91,7 @@ def test_random_lower(random_data_no_weights):
     y = random_data_no_weights["y"].values
     quantreg.fit(x, y, tau, fit_intercept=False)
     quantreg.predict(x)
-    np.testing.assert_allclose(quantreg.coefficients, [[0.17759, 6.99588, 4.18896, 4.83906, 3.22546]], rtol=TOL)
+    np.testing.assert_allclose(quantreg.coefficients, [[0.17759], [6.99588], [4.18896], [4.83906], [3.22546]], rtol=TOL)
 
 
 def test_random_upper(random_data_no_weights):
@@ -88,7 +101,7 @@ def test_random_upper(random_data_no_weights):
     y = random_data_no_weights["y"].values
     quantreg.fit(x, y, tau, fit_intercept=False)
     quantreg.predict(x)
-    np.testing.assert_allclose(quantreg.coefficients, [[1.85617, 6.81286, 6.05586, 5.51965, 4.19864]], rtol=TOL)
+    np.testing.assert_allclose(quantreg.coefficients, [[1.85617], [6.81286], [6.05586], [5.51965], [4.19864]], rtol=TOL)
 
 
 #################
@@ -103,10 +116,10 @@ def test_multiple(random_data_no_weights):
     y = random_data_no_weights["y"].values
     quantreg.fit(x, y, taus, fit_intercept=False)
     quantreg.predict(x)
-    assert len(quantreg.coefficients) == 3
-    np.testing.assert_allclose(quantreg.coefficients[0], [0.17759, 6.99588, 4.18896, 4.83906, 3.22546], rtol=TOL)
-    np.testing.assert_allclose(quantreg.coefficients[1], [1.57699, 6.74906, 4.40175, 4.85346, 4.51814], rtol=TOL)
-    np.testing.assert_allclose(quantreg.coefficients[2], [1.85617, 6.81286, 6.05586, 5.51965, 4.19864], rtol=TOL)
+    assert quantreg.coefficients.shape == (5, 3)
+    np.testing.assert_allclose(quantreg.coefficients[:, 0], [0.17759, 6.99588, 4.18896, 4.83906, 3.22546], rtol=TOL)
+    np.testing.assert_allclose(quantreg.coefficients[:, 1], [1.57699, 6.74906, 4.40175, 4.85346, 4.51814], rtol=TOL)
+    np.testing.assert_allclose(quantreg.coefficients[:, 2], [1.85617, 6.81286, 6.05586, 5.51965, 4.19864], rtol=TOL)
 
 
 ######################
@@ -122,7 +135,7 @@ def test_basic_median_weights():
     weights = np.asarray([1, 1, 100, 3])
     quantreg.fit(x, y, tau, weights)
     preds = quantreg.predict(x)
-    np.testing.assert_array_equal(preds, [[9, 9, 9, 15]])
+    np.testing.assert_array_equal(preds, [[9], [9], [9], [15]])
 
 
 def test_random_median_weights(random_data_weights):
@@ -133,7 +146,7 @@ def test_random_median_weights(random_data_weights):
     y = random_data_weights["y"].values
     weights = random_data_weights["weights"].values
     quantreg.fit(x, y, tau, weights=weights, fit_intercept=False)
     quantreg.predict(x)
-    np.testing.assert_allclose(quantreg.coefficients, [[1.59521, 2.17864, 4.68050, 3.10920, 9.63739]], rtol=TOL)
+    np.testing.assert_allclose(quantreg.coefficients, [[1.59521], [2.17864], [4.68050], [3.10920], [9.63739]], rtol=TOL)
 
 
 def test_random_lower_weights(random_data_weights):
@@ -144,7 +157,7 @@ def test_random_lower_weights(random_data_weights):
     y = random_data_weights["y"].values
     weights = random_data_weights["weights"].values
     quantreg.fit(x, y, tau, weights=weights, fit_intercept=False)
     quantreg.predict(x)
-    np.testing.assert_allclose(quantreg.coefficients, [[0.63670, 1.27028, 4.81500, 3.08055, 8.69929]], rtol=TOL)
+    np.testing.assert_allclose(quantreg.coefficients, [[0.63670], [1.27028], [4.81500], [3.08055], [8.69929]], rtol=TOL)
 
 
 def test_random_upper_weights(random_data_weights):
@@ -155,7 +168,7 @@ def test_random_upper_weights(random_data_weights):
     y = random_data_weights["y"].values
     weights = random_data_weights["weights"].values
     quantreg.fit(x, y, tau, weights=weights, fit_intercept=False)
     quantreg.predict(x)
-    np.testing.assert_allclose(quantreg.coefficients, [[3.47742, 2.07360, 4.51754, 4.15237, 9.58856]], rtol=TOL)
+    np.testing.assert_allclose(quantreg.coefficients, [[3.47742], [2.07360], [4.51754], [4.15237], [9.58856]], rtol=TOL)
 
 
 #############################
@@ -189,12 +202,12 @@ def test_weight_normalization_same_fit(random_data_weights):
     quantreg = QuantileRegressionSolver()
     quantreg.fit(x, y, tau, weights, normalize_weights=True)
     preds = quantreg.predict(x)
-    np.testing.assert_allclose(preds, [[9, 9, 9, 15]], rtol=TOL)
+    np.testing.assert_allclose(preds, [[9], [9], [9], [15]], rtol=TOL)
 
     quantreg = QuantileRegressionSolver()
     quantreg.fit(x, y, tau, weights, normalize_weights=False)
     preds = quantreg.predict(x)
-    np.testing.assert_allclose(preds, [[9, 9, 9, 15]], rtol=TOL)
+    np.testing.assert_allclose(preds, [[9], [9], [9], [15]], rtol=TOL)
 
 
 ########################
@@ -205,13 +218,13 @@ def test_weight_normalization_same_fit(random_data_weights):
 def test_regularization_without_intercept(random_data_no_weights):
     tau = 0.5
     x = random_data_no_weights[["x0", "x1", "x2", "x3", "x4"]].values
-    y = random_data_no_weights["y"].values
+    y = random_data_no_weights["y"].values.flatten()
     quantreg = QuantileRegressionSolver()
     lambda_ = 1e6
     quantreg.fit(x, y, tau, lambda_=lambda_, fit_intercept=False, regularize_intercept=True)
     np.testing.assert_allclose(
-        quantreg.coefficients, [[0, 0, 0, 0, 0]], atol=TOL
+        quantreg.coefficients, [[0], [0], [0], [0], [0]], atol=TOL
     )  # using absolute tolerance since comparing to zero
 
 
@@ -225,7 +238,7 @@ def test_regularization_with_intercept(random_data_no_weights):
     lambda_ = 1e6
     quantreg.fit(x, y, tau, lambda_=lambda_, fit_intercept=True, regularize_intercept=False)
     coefficients_w_reg = quantreg.coefficients
-    np.testing.assert_allclose(quantreg.coefficients[0][1:], [0, 0, 0, 0], atol=TOL)
+    np.testing.assert_allclose(quantreg.coefficients[1:], [[0], [0], [0], [0]], atol=TOL)
     assert np.abs(coefficients_w_reg[0][0]) > TOL
 
 
@@ -239,10 +252,10 @@ def test_regularization_with_intercept_and_features(random_data_no_weights):
     lambda_ = 1e6
     quantreg.fit(x, y, tau, lambda_=lambda_, fit_intercept=True, regularize_intercept=False, n_feat_ignore_reg=2)
     coefficients_w_reg = quantreg.coefficients
-    np.testing.assert_allclose(quantreg.coefficients[0][3:], [0, 0], atol=TOL)
-    assert np.abs(coefficients_w_reg[0][0]) > TOL
-    assert np.abs(coefficients_w_reg[0][1]) > TOL
-    assert np.abs(coefficients_w_reg[0][2]) > TOL
+    np.testing.assert_allclose(quantreg.coefficients[3:], [[0], [0]], atol=TOL)
+    assert np.abs(coefficients_w_reg[0]) > TOL
+    assert np.abs(coefficients_w_reg[1]) > TOL
+    assert np.abs(coefficients_w_reg[2]) > TOL
 
 
 ########################
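Taken together, a brief sketch of the intended `cache` semantics (mirroring the new `test_cache` tests above; data values are borrowed from the basic tests, and a freshly constructed solver is assumed):

```python
import numpy as np

from elexsolver.OLSRegressionSolver import OLSRegressionSolver

# data values borrowed from the basic OLS tests
x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]])
y = np.asarray([3, 8, 9, 15])

lm = OLSRegressionSolver()

# cache=False returns the coefficients without mutating solver state;
# this is what residuals() relies on for its temporary per-fold refits
coefficients = lm.fit(x, y, fit_intercept=True, cache=False)
assert coefficients is not None and lm.coefficients is None

# cache=True (the default) stores the solution for later predict() calls
lm.fit(x, y, fit_intercept=True)
assert lm.coefficients is not None and lm.normal_eqs is not None
preds = lm.predict(x)
```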