Develop #12

Open · wants to merge 18 commits into main

Changes from all commits
Binary file modified .DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions .idea/.gitignore
7 changes: 7 additions & 0 deletions .idea/PRegress_Repo.iml
6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml
7 changes: 7 additions & 0 deletions .idea/misc.xml
6 changes: 6 additions & 0 deletions .idea/vcs.xml

61 changes: 33 additions & 28 deletions build/lib/pregress/modeling/box_cox.py
@@ -1,44 +1,49 @@
 import numpy as np
 import matplotlib.pyplot as plt
 from scipy.stats import boxcox, boxcox_llf
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
 
 def box_cox(model):
     """
     Perform a Box-Cox transformation on the response variable of a given statsmodels regression results object,
     and output a plot of the log-likelihood as a function of lambda, the fitted lambda, and the 95% confidence interval.
 
     Args:
         model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model.
     """
     # Extract the response variable
     y = model.model.endog
 
     # Perform the Box-Cox transformation
     if np.any(y <= 0):
         raise ValueError("All values in the response variable must be positive for Box-Cox transformation.")
 
     y_transformed, fitted_lambda = boxcox(y)
-    # Calculate the log-likelihood for different lambda values using boxcox_llf
-    lambdas = np.linspace(-2, 2, 100)
+
+    # Calculate lambdas from -3 to 3 for better CI accuracy
+    lambdas = np.linspace(-3, 3, 100)
     log_likelihood = [boxcox_llf(lmbda, y) for lmbda in lambdas]
 
-    # Calculate the 95% confidence interval
+    # Restrict the plotted range to lambdas from -2.1 to 2.1
+    plot_lambdas = lambdas[(lambdas >= -2.1) & (lambdas <= 2.1)]
+    plot_log_likelihood = [boxcox_llf(lmbda, y) for lmbda in plot_lambdas]
+
     max_log_likelihood = boxcox_llf(fitted_lambda, y)
     ci_cutoff = max_log_likelihood - 1.92  # Chi-squared distribution cutoff for 95% CI (1 degree of freedom)
     ci_lambdas = lambdas[np.array(log_likelihood) >= ci_cutoff]
 
-    lambda_lower = ci_lambdas[0]
-    lambda_upper = ci_lambdas[-1]
-
-    # Plot the log-likelihood as a function of lambda
     plt.figure(figsize=(10, 6))
-    plt.plot(lambdas, log_likelihood, label='Log-Likelihood')
-    plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Fitted Lambda: {fitted_lambda:.4f}')
-    plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}')
-    plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}')
+
+    # Plot the restricted range of the log-likelihood from -2.1 to 2.1
+    plt.plot(plot_lambdas, plot_log_likelihood, label='Log-Likelihood Function')
+
+    # Set xlim to focus on the typical range of -2 to 2
+    plt.xlim([-2, 2])
+
+    # Set ylim from the minimum log-likelihood to the maximum plus 5% headroom
+    plt.ylim([min(plot_log_likelihood), max(plot_log_likelihood) + 0.05 * (max(plot_log_likelihood) - min(plot_log_likelihood))])
+
+    if -2 <= fitted_lambda <= 2:
+        lambda_lower = ci_lambdas[0]
+        lambda_upper = ci_lambdas[-1]
+        plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}')
+        plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Best Lambda: {fitted_lambda:.4f}')
+        plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}')
+    else:
+        print(f"The fitted lambda is {fitted_lambda:.4f}, which is outside the typical range of -2 to 2. CI lines not plotted.")
 
     plt.xlabel('Lambda')
     plt.ylabel('Log-Likelihood')
-    plt.title('Box-Cox Transformation Log-Likelihood with 95% CI')
-    plt.legend(loc='upper left')
+    plt.title('Log-Likelihood for Box-Cox Transformation')
+    plt.legend(loc='lower right')
     plt.grid(True)
     plt.show()
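For context on the 1.92 cutoff above: inside a 95% likelihood-ratio interval the log-likelihood drops from its maximum by at most half the chi-squared critical value with 1 degree of freedom, i.e. 3.841 / 2 ≈ 1.92. Below is a minimal usage sketch of the revised function; the import path and the simulated data are assumptions for illustration, not taken from this PR.

    import numpy as np
    import statsmodels.api as sm
    from pregress.modeling.box_cox import box_cox  # assumed import path

    # Simulated strictly positive response (Box-Cox requires y > 0).
    rng = np.random.default_rng(42)
    x = rng.uniform(1, 10, size=200)
    # Log-linear ground truth, so the fitted lambda should land near 0.
    y = np.exp(0.5 + 0.3 * x + rng.normal(0, 0.2, size=200))

    X = sm.add_constant(x)
    ols_results = sm.OLS(y, X).fit()

    # Plots the Box-Cox log-likelihood curve with the fitted lambda and its 95% CI.
    box_cox(ols_results)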
75 changes: 50 additions & 25 deletions build/lib/pregress/modeling/fit.py
@@ -2,52 +2,77 @@
 import statsmodels.api as sm
 import pandas as pd
 
-def fit(formula, data=None, method = "ols", dummies = True):
+def fit(formula: str, data: pd.DataFrame = None, method: str = "ols", dummies: bool = True):
     """
     Fits a statistical model based on a specified formula and data.
 
     Parameters:
     - formula (str): A string representing the statistical formula (e.g., 'Y ~ X1 + X2 - X3').
     - data (DataFrame, optional): The dataset containing the variables specified in the formula.
     - method (str, optional): The method used for fitting the model. Defaults to 'ols' (Ordinary Least Squares).
-        Other methods can be implemented, such as logistic regression, random forest, etc.
+        Supported methods: 'ols' for linear regression, 'logistic' for logistic regression.
     - dummies (bool, optional): A boolean indicating whether to automatically create dummy variables for categorical
         predictors. Defaults to True.
 
     Returns:
     - model (statsmodels object): The fitted model object, which can be used for further analysis, such as
         making predictions or evaluating model performance.
 
     Raises:
     - ValueError: If the input data is empty or the specified variables are not found in the data.
+    - NotImplementedError: If an unsupported method is specified.
 
     Notes:
-    - The function currently supports OLS (Ordinary Least Squares) regression. Additional methods like logistic
-        regression, random forest, and k-nearest neighbors can be added as needed.
+    - The function currently supports OLS (Ordinary Least Squares) and logistic regression.
+        Additional methods like random forest or k-nearest neighbors could be added as needed.
     - The 'parse_formula' function is used to parse the formula and extract the response and predictor variables
         from the dataset.
     - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator
-        variables, with the first category dropped to avoid multicollinearity. Additionally, binary variables
-        (True/False) are converted to numeric (0/1) values.
+        variables, with the first category dropped to avoid multicollinearity. Binary variables (True/False) are
+        converted to numeric (0/1) values.
     """
 
+    def process_dummies(X_out):
+        """Helper function to handle dummy variables and binary conversions."""
+        X_out = pd.get_dummies(X_out, drop_first=True)
+        # Convert binary variables (True/False) to numeric (0/1)
+        binary_columns = X_out.select_dtypes(include=['bool']).columns
+        X_out[binary_columns] = X_out[binary_columns].astype(int)
+        return X_out
+
+    def check_response_and_convert(Y_out):
+        """Convert a categorical response variable to dummies if necessary."""
+        if not pd.api.types.is_numeric_dtype(Y_out):
+            Y_out = pd.get_dummies(Y_out, drop_first=True)
+            if Y_out.shape[1] > 1:
+                raise ValueError("Response variable was converted to multiple columns, indicating it is multi-class. "
+                                 "This function currently supports binary response variables only.")
+        return Y_out
+
     Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
 
-    if method.lower() == "ols":
-        if dummies:
-            X_out = pd.get_dummies(X_out, drop_first=True)
-
-            # Convert binary variables (True/False) to numeric (0/1)
-            binary_columns = X_out.select_dtypes(include=['bool']).columns
-            X_out[binary_columns] = X_out[binary_columns].astype(int)
-
-        if X_out.empty:
-            raise ValueError("The input data is empty or the specified variables are not found in the data.")
+    # Ensure Y_out is a Series and retains its name
+    if isinstance(Y_out, (pd.Series, pd.DataFrame)):
+        Y_out.name = Y_name  # Retain the response variable's name
+    else:
+        # Convert numpy array to pandas Series and set name
+        Y_out = pd.Series(Y_out, name=Y_name)
+
+    if X_out.empty:
+        raise ValueError("The input data is empty or the specified variables are not found in the data.")
+
+    if dummies:
+        X_out = process_dummies(X_out)
+
+    if method.lower() == "ols":
         model = sm.OLS(Y_out, X_out).fit()
-
-    # if method.lower() == "logistic":
-    # if method.lower() == "rf":
-    # if method.lower() == "knn":
+    elif method.lower() == "logistic":
+        # Process the response variable to ensure it is numeric or binary
+        Y_out = check_response_and_convert(Y_out)
+        model = sm.GLM(Y_out, X_out, family=sm.families.Binomial()).fit()
+    else:
+        raise NotImplementedError(f"Method '{method}' is not implemented. Supported methods: 'ols', 'logistic'.")
 
     return model
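To see the new dispatch end to end, here is a hedged usage sketch: the DataFrame and column names are invented for illustration, the import path is an assumption based on this file's location, and intercept handling is whatever parse_formula produces.

    import pandas as pd
    from pregress.modeling.fit import fit  # assumed import path

    df = pd.DataFrame({
        "Y":  [3.1, 4.0, 5.2, 6.1, 7.3, 8.0],
        "X1": [1, 2, 3, 4, 5, 6],
        "X2": ["a", "b", "a", "b", "a", "b"],  # categorical; dummies=True expands it to a 0/1 column
    })

    # OLS path: routed through sm.OLS(Y_out, X_out).fit()
    ols_model = fit("Y ~ X1 + X2", data=df, method="ols")
    print(ols_model.summary())

    # Logistic path: a binary response routed through sm.GLM with a Binomial family
    df["Z"] = (df["Y"] > 5).astype(int)
    logistic_model = fit("Z ~ X1 + X2", data=df, method="logistic")
    print(logistic_model.summary())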
