
Updating residuals #27

Open · wants to merge 6 commits into base: develop
79 changes: 76 additions & 3 deletions src/elexsolver/LinearSolver.py
@@ -29,21 +29,32 @@ class LinearSolver(ABC):

def __init__(self):
self.coefficients = None
self.rng = np.random.default_rng(seed=0)

@classmethod
def fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray | None = None, lambda_: float = 0.0, *kwargs):
def fit(
self,
x: np.ndarray,
y: np.ndarray,
weights: np.ndarray | None = None,
lambda_: float = 0.0,
cache: bool = True,
Contributor:

Can you help me understand how the cache is used? It seems like there are cases where we want to (not) keep track of certain things computed during the k-fold residuals process, but I'm not sure I understand why or why not. Plus, this looks like it's only used in the subclasses, so I'd suggest we either (a) remove it from here, or (b) add a method in this super-class, something like def cache_things() (bad method name), that holds this logic and can be shared by all subclasses 🤔
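
If it helps the discussion, here is a minimal sketch of what option (b) could look like, assuming each subclass passes in the attributes it wants stored; the name _cache_fit_results and its keyword-argument interface are hypothetical, not part of this PR:

def _cache_fit_results(self, cache: bool, **fitted) -> None:
    # hypothetical helper: only store fitted quantities on the instance when cache=True,
    # so every subclass shares the same caching logic
    if not cache:
        return
    for name, value in fitted.items():
        setattr(self, name, value)

A subclass fit() would then end with something like self._cache_fit_results(cache, normal_eqs=normal_eqs, hat_vals=hat_vals, coefficients=coefficients) and still return coefficients either way.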

**kwargs,
):
"""
Fits model
"""
raise NotImplementedError

def predict(self, x: np.ndarray) -> np.ndarray:
def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray:
Contributor:

How would you feel about leaving this (public method) like:

def predict(self, x: np.ndarray) -> np.ndarray:
    return self._predict(x, self.coefficients)

and adding:

def _predict(self, x: np.ndarray, coefficients: np.ndarray) -> np.ndarray:
    return x @ coefficients

and then in residuals() you call self._predict(x_test, coefficients_k)? The way it's written now invites users to pass in any arbitrary coefficients, which might not be a good idea 🤔

"""
Use coefficients to predict
"""
self._check_any_element_nan_or_inf(x)

return x @ self.coefficients
if coefficients is None:
return x @ self.coefficients
return x @ coefficients

def get_coefficients(self) -> np.ndarray:
"""
@@ -77,3 +88,65 @@ def _check_intercept(self, x):
"""
if ~np.all(x[:, 0] == 1):
warnings.warn("Warning: fit_intercept=True and not all elements of the first columns are 1s")

def residuals(
self,
x: np.ndarray,
y: np.ndarray,
weights: np.ndarray | None = None,
K: int | None = None,
center: bool = True,
**kwargs,
) -> np.ndarray:
"""
Computes residuals for the model
"""
if K is None:
# when K is None we are just using the training residuals
y_hat = self.predict(x).reshape(*y.shape)
residuals = y - y_hat
else:
if weights is None:
weights = np.ones_like(y)

# create indices for k-fold cross-fitting
indices = np.arange(x.shape[0])
# shuffle for random order of datapoints
self.rng.shuffle(indices)
x_shuffled, y_shuffled, weights_shuffled = x[indices], y[indices], weights[indices]

# create folds
x_folds = np.array_split(x_shuffled, K)
y_folds = np.array_split(y_shuffled, K)
weights_folds = np.array_split(weights_shuffled, K)

residuals = []
for k in range(K):
# extract test points
x_test, y_test, _, = (
x_folds[k],
y_folds[k],
weights_folds[k],
)

# extract training points
x_train = np.concatenate([x_folds[j] for j in range(K) if j != k])
y_train = np.concatenate([y_folds[j] for j in range(K) if j != k])
weights_train = np.concatenate([weights_folds[j] for j in range(K) if j != k])

# fit k-th model
coefficients_k = self.fit(x_train, y_train, weights=weights_train, cache=False, **kwargs)
y_hat_k = self.predict(x_test, coefficients=coefficients_k)

# k-th residuals
residuals_k = y_test - y_hat_k
residuals.append(residuals_k)

residuals = np.concatenate(residuals)
# undo shuffling of residuals, to put them in the original dataset order
residuals = residuals[np.argsort(indices)]

if center:
residuals -= np.mean(residuals, axis=0)

return residuals
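
For reviewers, a small usage sketch of the new method, mirroring the new test in tests/test_linear_solver.py (the data here is made up):

import numpy as np
from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver

rng = np.random.default_rng(0)
x = rng.normal(size=(100, 5))
y = x @ rng.normal(size=(5, 1))

reg = QuantileRegressionSolver()
reg.fit(x, y, fit_intercept=False)

# K=None -> plain training residuals; K=10 -> 10-fold cross-fitted residuals,
# returned in the original row order because the shuffle is undone via argsort(indices)
train_resid = reg.residuals(x, y, K=None, center=False)
cv_resid = reg.residuals(x, y, K=10, center=False)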
51 changes: 32 additions & 19 deletions src/elexsolver/OLSRegressionSolver.py
@@ -92,6 +92,7 @@ def fit(
fit_intercept: bool = True,
regularize_intercept: bool = False,
n_feat_ignore_reg: int = 0,
cache: bool = True,
):
self._check_any_element_nan_or_inf(x)
self._check_any_element_nan_or_inf(y)
@@ -111,33 +112,45 @@ def fit(
# in the bootstrap setting we can now pass in the normal equations and can
# save time re-computing them
if normal_eqs is None:
self.normal_eqs = self._compute_normal_equations(
normal_eqs = self._compute_normal_equations(
x, L, lambda_, fit_intercept, regularize_intercept, n_feat_ignore_reg
)
else:
self.normal_eqs = normal_eqs

# compute hat matrix: X (X^T X)^{-1} X^T
self.hat_vals = np.diag(x @ self.normal_eqs @ L)

hat_vals = np.diag(x @ normal_eqs @ L)
# compute coefficients: (X^T X)^{-1} X^T y
self.coefficients = self.normal_eqs @ L @ y
coefficients = normal_eqs @ L @ y

def residuals(self, y: np.ndarray, y_hat: np.ndarray, loo: bool = True, center: bool = True) -> np.ndarray:
"""
Computes residuals for the model
"""
# compute standard residuals
residuals = y - y_hat
if cache:
self.normal_eqs = normal_eqs
self.hat_vals = hat_vals
self.coefficients = coefficients

return coefficients

# if leave one out is True, inflate by (1 - P)
# in OLS setting inflating by (1 - P) is the same as computing the leave one out residuals
# the un-inflated training residuals are too small, since training covariates were observed during fitting
if loo:
def residuals(
self,
x: np.ndarray,
y: np.ndarray,
weights: np.ndarray | None = None,
K: int | None = None,
center: bool = True,
**kwargs
) -> np.ndarray:
if K == x.shape[0]:
Contributor:

Is it possible that any other subclasses would benefit from this logic? 🤔

# compute standard residuals
y_hat = self.predict(x)
residuals = y - y_hat

# inflate by (1 - P)
# in OLS setting inflating by (1 - P) is the same as computing the leave one out residuals
# the un-inflated training residuals are too small, since training covariates were observed during fitting
residuals /= (1 - self.hat_vals).reshape(-1, 1)

# centering removes the column mean
if center:
residuals -= np.mean(residuals, axis=0)
# centering removes the column mean
if center:
residuals -= np.mean(residuals, axis=0)
else:
residuals = super().residuals(x, y, weights, K, center, **kwargs)

return residuals
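
For anyone reviewing the K == x.shape[0] branch: it relies on the standard OLS identity that the leave-one-out residual equals the training residual divided by (1 - h_ii), where h_ii is the i-th hat value, so no refitting is needed. A quick self-contained NumPy check of that identity (independent of this class):

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(20, 3))
y = x @ rng.normal(size=3) + rng.normal(scale=0.1, size=20)

hat = x @ np.linalg.solve(x.T @ x, x.T)      # hat matrix X (X^T X)^{-1} X^T
resid = y - hat @ y                          # training residuals
loo_shortcut = resid / (1 - np.diag(hat))    # inflate by (1 - h_ii)

# brute-force leave-one-out residuals for comparison
loo_direct = np.empty_like(y)
for i in range(len(y)):
    mask = np.arange(len(y)) != i
    beta_i = np.linalg.lstsq(x[mask], y[mask], rcond=None)[0]
    loo_direct[i] = y[i] - x[i] @ beta_i

assert np.allclose(loo_shortcut, loo_direct)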
41 changes: 30 additions & 11 deletions src/elexsolver/QuantileRegressionSolver.py
@@ -31,6 +31,7 @@ def _fit(self, x: np.ndarray, y: np.ndarray, weights: np.ndarray, tau: float) ->
# b_eq is the equality constraint vector (ie. A_eq @ x = b_eq)
# bounds are the (min, max) possible values of every element of x
res = linprog(-1 * S, A_eq=Phi.T, b_eq=zeros, bounds=bounds, method="highs", options={"presolve": False})

# marginal are the dual values, since we are solving the dual this is equivalent to the primal
return -1 * res.eqlin.marginals

@@ -84,6 +85,7 @@ def fit(
regularize_intercept: bool = False,
n_feat_ignore_reg: int = 0,
normalize_weights: bool = True,
cache: bool = True,
):
"""
Fits quantile regression
@@ -105,23 +107,40 @@ def fit(
raise ZeroDivisionError
weights = weights / weights_sum

if y.ndim == 1: # code expects 2-dim array
y = y.reshape(-1, 1)

# _fit assumes that taus is list, so if we want to do one value of tau then turn into a list
if isinstance(taus, float):
taus = [taus]

else:
assert y.shape[1] == 1 # you can either have multiple taus or multiple ys
coefficients_array = []
for tau in taus:
if lambda_ > 0:
coefficients = self._fit_with_regularization(
x, y, weights, tau, lambda_, regularize_intercept, n_feat_ignore_reg
)
else:
coefficients = self._fit(x, y, weights, tau)
self.coefficients.append(coefficients)

def predict(self, x: np.ndarray) -> np.ndarray:
for y_arr in y.T:
if lambda_ > 0:
coefficients = self._fit_with_regularization(
x, y_arr, weights, tau, lambda_, regularize_intercept, n_feat_ignore_reg
)
else:
coefficients = self._fit(x, y_arr, weights, tau)
coefficients_array.append(coefficients)

coefficients_array = np.asarray(coefficients_array).T
if cache:
self.coefficients = coefficients_array

return coefficients_array

def predict(self, x: np.ndarray, coefficients: np.ndarray | None = None) -> np.ndarray:
"""
Use coefficients to predict
"""
self._check_any_element_nan_or_inf(x)

return self.coefficients @ x.T
if coefficients is None:
preds = x @ self.coefficients
else:
preds = x @ coefficients

return preds
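
A small usage sketch of the new shape convention in fit()/predict() as I read this diff (made-up data; the shapes in the comments are my understanding and worth double-checking):

import numpy as np
from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver

rng = np.random.default_rng(0)
x = rng.normal(size=(200, 3))
y = x @ rng.normal(size=3) + rng.normal(size=200)

qr = QuantileRegressionSolver()
coefs = qr.fit(x, y, taus=[0.1, 0.5, 0.9], fit_intercept=False)

print(coefs.shape)          # (3, 3): one column of coefficients per tau
print(qr.predict(x).shape)  # (200, 3): x @ coefficients, one prediction column per tau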
7 changes: 7 additions & 0 deletions tests/conftest.py
@@ -3,6 +3,7 @@
import os
import sys

import numpy as np
Contributor:

I know you said you're struggling to add unit tests for all these changes and I'm curious what you think is missing. These look good to me although I'll keep thinking about it 🤔 🎉

import pandas as pd
import pytest

@@ -40,3 +41,9 @@ def random_data_no_weights(get_fixture):
@pytest.fixture(scope="session")
def random_data_weights(get_fixture):
return get_fixture("random_data_n100_p5_12549_weights.csv")


@pytest.fixture(scope="session")
def rng():
seed = 8232
return np.random.default_rng(seed=seed)
18 changes: 18 additions & 0 deletions tests/test_linear_solver.py
@@ -2,9 +2,27 @@
import pytest

from elexsolver.LinearSolver import LinearSolver
from elexsolver.QuantileRegressionSolver import QuantileRegressionSolver


def test_fit():
solver = LinearSolver()
with pytest.raises(NotImplementedError):
solver.fit(np.ndarray((5, 3)), np.ndarray((1, 3)))


##################
# Test residuals #
##################
def test_residuals_without_weights(rng):
x = rng.normal(size=(100, 5))
beta = rng.normal(size=(5, 1))
y = x @ beta

# we need a subclass of LinearSolver to actually run a fit
reg = QuantileRegressionSolver()
reg.fit(x, y, fit_intercept=False)
reg.predict(x)

reg.residuals(x, y, K=None, center=False)
reg.residuals(x, y, K=10, center=False)
34 changes: 25 additions & 9 deletions tests/test_ols.py
@@ -31,6 +31,22 @@ def test_basic2():
assert all(np.abs(preds - [6.666667, 6.666667, 6.666667, 15]) <= TOL)


def test_cache():
lm = OLSRegressionSolver()
x = np.asarray([[1, 1], [1, 1], [1, 1], [1, 2]])
y = np.asarray([3, 8, 9, 15])
lm.fit(x, y, fit_intercept=True, cache=False)

assert lm.normal_eqs is None
assert lm.hat_vals is None

lm.fit(x, y, fit_intercept=True, cache=True)

assert lm.normal_eqs is not None
assert lm.hat_vals is not None
assert lm.coefficients is not None


######################
# Intermediate tests #
######################
@@ -209,19 +225,19 @@ def test_residuals_no_weights(random_data_no_weights):
x = random_data_no_weights[["x0", "x1", "x2", "x3", "x4"]].values
y = random_data_no_weights["y"].values.reshape(-1, 1)
lm.fit(x, y, fit_intercept=False)
predictions = lm.predict(x)

residuals = lm.residuals(y, predictions, loo=False, center=False)
lm.predict(x)

residuals = lm.residuals(x, y, K=None, center=False)
assert residuals[0] == pytest.approx(0.885973530)
assert residuals[-1] == pytest.approx(0.841996302)

residuals = lm.residuals(y, predictions, loo=True, center=False)
# equivalent of leave-one-out residuals
residuals = lm.residuals(x, y, K=100, center=False)

assert residuals[0] == pytest.approx(0.920112164)
assert residuals[-1] == pytest.approx(0.875896477)

residuals = lm.residuals(y, predictions, loo=True, center=True)
residuals = lm.residuals(x, y, K=100, center=True)
assert np.sum(residuals) == pytest.approx(0)


@@ -232,19 +248,19 @@ def test_residuals_weights(random_data_weights):
weights = random_data_weights["weights"].values

lm.fit(x, y, weights=weights, fit_intercept=False)
predictions = lm.predict(x)
lm.predict(x)

residuals = lm.residuals(y, predictions, loo=False, center=False)
residuals = lm.residuals(x, y, weights=weights, K=None, center=False)

assert residuals[0] == pytest.approx(-1.971798590)
assert residuals[-1] == pytest.approx(-1.373951578)

residuals = lm.residuals(y, predictions, loo=True, center=False)
residuals = lm.residuals(x, y, weights=weights, K=100, center=False)

assert residuals[0] == pytest.approx(-1.999718445)
assert residuals[-1] == pytest.approx(-1.438563033)

residuals = lm.residuals(y, predictions, loo=True, center=True)
residuals = lm.residuals(x, y, weights=weights, K=100, center=True)
assert np.sum(residuals) == pytest.approx(0)

