diff --git a/.DS_Store b/.DS_Store
index 87ddc21..848990e 100644
Binary files a/.DS_Store and b/.DS_Store differ
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..26d3352
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,3 @@
+# Default ignored files
+/shelf/
+/workspace.xml
diff --git a/.idea/PRegress_Repo.iml b/.idea/PRegress_Repo.iml
new file mode 100644
index 0000000..0070e87
--- /dev/null
+++ b/.idea/PRegress_Repo.iml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..105ce2d
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..41c2a23
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,7 @@
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/build/lib/pregress/modeling/box_cox.py b/build/lib/pregress/modeling/box_cox.py
index 5ce1bd5..c41788f 100644
--- a/build/lib/pregress/modeling/box_cox.py
+++ b/build/lib/pregress/modeling/box_cox.py
@@ -1,44 +1,49 @@
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import boxcox, boxcox_llf
-import statsmodels.api as sm
-import statsmodels.formula.api as smf
def box_cox(model):
- """
- Perform a Box-Cox transformation on the response variable of a given statsmodels regression results object,
- output a plot of the log-likelihood as a function of lambda, the fitted lambda, and the 95% confidence interval.
-
- Args:
- model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model.
- """
- # Extract the response variable
y = model.model.endog
-
- # Perform the Box-Cox transformation
+ if np.any(y <= 0):
+ raise ValueError("All values in the response variable must be positive for Box-Cox transformation.")
+
y_transformed, fitted_lambda = boxcox(y)
-
- # Calculate the log-likelihood for different lambda values using boxcox_llf
- lambdas = np.linspace(-2, 2, 100)
+
+ # Evaluate the log-likelihood on a lambda grid from -3 to 3 so the CI search can extend beyond the plotted range
+ lambdas = np.linspace(-3, 3, 100)
log_likelihood = [boxcox_llf(lmbda, y) for lmbda in lambdas]
-
- # Calculate the 95% confidence interval
+
+ # Plot lambdas from -2.1 to 2.1
+ plot_lambdas = lambdas[(lambdas >= -2.1) & (lambdas <= 2.1)]
+ plot_log_likelihood = [boxcox_llf(lmbda, y) for lmbda in plot_lambdas]
+
max_log_likelihood = boxcox_llf(fitted_lambda, y)
ci_cutoff = max_log_likelihood - 1.92 # Chi-squared distribution cutoff for 95% CI (1 degree of freedom)
ci_lambdas = lambdas[np.array(log_likelihood) >= ci_cutoff]
-
- lambda_lower = ci_lambdas[0]
- lambda_upper = ci_lambdas[-1]
-
- # Plot the log-likelihood as a function of lambda
+
plt.figure(figsize=(10, 6))
- plt.plot(lambdas, log_likelihood, label='Log-Likelihood')
- plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Fitted Lambda: {fitted_lambda:.4f}')
- plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}')
- plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}')
+
+ # Plot the restricted range of log-likelihood from -2.1 to 2.1
+ plt.plot(plot_lambdas, plot_log_likelihood, label='Log-Likelihood Function')
+
+ # Set xlim to focus on the typical range of -2 to 2
+ plt.xlim([-2, 2])
+
+ # Set ylim from the minimum log-likelihood up to the maximum plus 5% of the range as headroom
+ plt.ylim([min(plot_log_likelihood), max(plot_log_likelihood)+.05* (max(plot_log_likelihood) - min(plot_log_likelihood))])
+
+ if -2 <= fitted_lambda <= 2:
+ lambda_lower = ci_lambdas[0]
+ lambda_upper = ci_lambdas[-1]
+ plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}')
+ plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Best Lambda: {fitted_lambda:.4f}')
+ plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}')
+ else:
+ print(f"The fitted_lambda is {fitted_lambda:.4f}, which is outside the typical range of -2 to 2. CI lines not plotted.")
+
plt.xlabel('Lambda')
plt.ylabel('Log-Likelihood')
- plt.title('Box-Cox Transformation Log-Likelihood with 95% CI')
- plt.legend(loc='upper left')
+ plt.title('Log-Likelihood for Box-Cox Transformation')
+ plt.legend(loc='lower right')
plt.grid(True)
plt.show()
diff --git a/build/lib/pregress/modeling/fit.py b/build/lib/pregress/modeling/fit.py
index a6bc58d..03dde46 100644
--- a/build/lib/pregress/modeling/fit.py
+++ b/build/lib/pregress/modeling/fit.py
@@ -2,7 +2,7 @@
import statsmodels.api as sm
import pandas as pd
-def fit(formula, data=None, method = "ols", dummies = True):
+def fit(formula: str, data: pd.DataFrame = None, method: str = "ols", dummies: bool = True):
"""
Fits a statistical model based on a specified formula and data.
@@ -10,44 +10,69 @@ def fit(formula, data=None, method = "ols", dummies = True):
- formula (str): A string representing the statistical formula (e.g., 'Y ~ X1 + X2 - X3').
- data (DataFrame, optional): The dataset containing the variables specified in the formula.
- method (str, optional): The method used for fitting the model. Defaults to 'ols' (Ordinary Least Squares).
- Other methods can be implemented, such as logistic regression, random forest, etc.
+ Supported methods: 'ols' for linear regression, 'logistic' for logistic regression.
- dummies (bool, optional): A boolean indicating whether to automatically create dummy variables for categorical
predictors. Defaults to True.
Returns:
- - model (statsmodels object): The fitted model object, which can be used for further analysis, such as
+ - model (statsmodels object): The fitted model object, which can be used for further analysis, such as
making predictions or evaluating model performance.
Raises:
- ValueError: If the input data is empty or the specified variables are not found in the data.
+ - NotImplementedError: If an unsupported method is specified.
Notes:
- - The function currently supports OLS (Ordinary Least Squares) regression. Additional methods like logistic
- regression, random forest, and k-nearest neighbors can be added as needed.
- - The 'parse_formula' function is used to parse the formula and extract the response and predictor variables
- from the dataset.
- - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator
- variables, with the first category dropped to avoid multicollinearity. Additionally, binary variables
- (True/False) are converted to numeric (0/1) values.
+ - The function currently supports OLS (Ordinary Least Squares) and logistic regression.
+ Additional methods like random forest or k-nearest neighbors could be added as needed.
+ - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator
+ variables, with the first category dropped to avoid multicollinearity. Binary variables (True/False) are
+ converted to numeric (0/1) values.
"""
-
+
+ def process_dummies(X_out):
+ """Helper function to handle dummy variables and binary conversions."""
+ X_out = pd.get_dummies(X_out, drop_first=True)
+
+ # Convert binary variables (True/False) to numeric (0/1)
+ binary_columns = X_out.select_dtypes(include=['bool']).columns
+ X_out[binary_columns] = X_out[binary_columns].astype(int)
+ return X_out
+
+ def check_response_and_convert(Y_out):
+ """Convert categorical response variable to dummies if necessary."""
+ if not pd.api.types.is_numeric_dtype(Y_out):
+ Y_out = pd.get_dummies(Y_out, drop_first=True)
+ if Y_out.shape[1] > 1:
+ raise ValueError("Response variable was converted to multiple columns, indicating it is multi-class. "
+ "This function currently supports binary response variables only.")
+ return Y_out
+
Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
-
- if method.lower() == "ols":
- if dummies:
-
- X_out = pd.get_dummies(X_out, drop_first=True)
-
- # Convert binary variables (True/False) to numeric (0/1)
- binary_columns = X_out.select_dtypes(include=['bool']).columns
- X_out[binary_columns] = X_out[binary_columns].astype(int)
- if X_out.empty:
- raise ValueError("The input data is empty or the specified variables are not found in the data.")
+ # Ensure Y_out is a Series and retains its name
+ if isinstance(Y_out, (pd.Series, pd.DataFrame)):
+ Y_out.name = Y_name # Retain the response variable's name
+ else:
+ # Convert numpy array to pandas Series and set name
+ Y_out = pd.Series(Y_out, name=Y_name)
+
+ if X_out.empty:
+ raise ValueError("The input data is empty or the specified variables are not found in the data.")
+ if dummies:
+ X_out = process_dummies(X_out)
+
+ if method.lower() == "ols":
model = sm.OLS(Y_out, X_out).fit()
-# if method.lower() == "logistic":
-# if method.lower() == "rf":
-# if method.lower() == "knn":
+ elif method.lower() == "logistic":
+ # Process the response variable to ensure it is numeric or binary
+ Y_out = check_response_and_convert(Y_out)
+ model = sm.GLM(Y_out, X_out, family=sm.families.Binomial()).fit()
+
+ else:
+ raise NotImplementedError(f"Method '{method}' is not implemented. Supported methods: 'ols', 'logistic'.")
+
return model
+
diff --git a/build/lib/pregress/modeling/summary.py b/build/lib/pregress/modeling/summary.py
index bd8453f..1965a0b 100644
--- a/build/lib/pregress/modeling/summary.py
+++ b/build/lib/pregress/modeling/summary.py
@@ -1,14 +1,15 @@
-from .format_summary import format_summary
-from .print_r_summary import print_r_summary
-from .print_anova_table import print_anova_table
-from .print_stata_summary import print_stata_summary
-from .significance_code import significance_code
+from pregress.modeling.format_summary import format_summary
+from pregress.modeling.print_r_summary import print_r_summary
+from pregress.modeling.print_anova_table import print_anova_table
+from pregress.modeling.print_stata_summary import print_stata_summary
+from pregress.modeling.significance_code import significance_code
import numpy as np
import pandas as pd
import statsmodels.api as sm
import warnings
from io import StringIO
+
def summary(model, out='simple', level=0.95):
"""
Generates and prints a summary of the regression model fit. The default summary is 'simple',
@@ -33,52 +34,140 @@ def summary(model, out='simple', level=0.95):
if out in ['statsmodels', 'stats']:
print(model.summary(alpha=alpha))
return
+
+ def print_model_type(model):
+ if isinstance(model, sm.regression.linear_model.RegressionResultsWrapper):
+ if isinstance(model.model, sm.OLS):
+ model_type = "ols"
+ elif isinstance(model.model, sm.Logit):
+ model_type = "logit"
+ elif isinstance(model.model, sm.GLM):
+ if isinstance(model.model.family, sm.families.Binomial):
+ model_type = "glm"
+ else:
+ model_type = "glm_nonlogit"
+ else:
+ model_type = "Other Regression Model"
+ else:
+ model_type = "Unsupported model type."
+
+ return model_type
+
+ def summary_ols(model, out='simple', level=0.95):
+
+ alpha = round(1 - level, 5) # Ensure alpha is correctly formatted
+
+ warnings.filterwarnings("ignore", message="kurtosistest only valid for n>=20")
+ warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ValueWarning)
+ results_summary = model.summary(alpha=alpha)
+ results_as_html = results_summary.tables[1].as_html()
+ summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]
+ summary_df = format_summary(summary_df, alpha)
+
+ p_values = model.pvalues
+ conf_intervals = model.conf_int(alpha=alpha)
+ r_squared = model.rsquared
+ adj_r_squared = model.rsquared_adj
+ f_statistic = model.fvalue
+ f_p_value = model.f_pvalue
+ log_likelihood = model.llf
+ aic = model.aic
+ bic = model.bic
+ RSS = np.sum(model.resid**2)
+ df = model.df_resid
+ RSE = np.sqrt(RSS / df)
+ n_obs = int(model.nobs)
+ df_model = model.df_model
+ df_resid = model.df_resid
+ mse_model = model.mse_model
+ mse_resid = model.mse_resid
+
+ if out == 'r':
+ print_r_summary(model, summary_df, RSE, r_squared, adj_r_squared, f_statistic, f_p_value)
+ elif out == 'simple':
+ print("Summary of Regression Analysis:")
+ print("======================================================")
+ print("\nCoefficients:")
+ print("------------------------------------------------------")
+ print(summary_df)
+ print("\nModel Statistics:")
+ print("------------------------------------------------------")
+ print(f"R-squared: {r_squared:.4f} AIC: {aic:.4f}")
+ print(f"Adj. R-squared: {adj_r_squared:.4f} BIC: {bic:.4f}")
+ print(f"F-statistic: {f_statistic:.2f} on {int(df_model)} and {int(df_resid)} DF, p-value: {f_p_value:.6f}")
+ print("======================================================")
+ elif out in ['coefficients', 'coef']:
+ print(model.summary(alpha=alpha).tables[1])
+ elif out == 'anova':
+ anova_table = print_anova_table(model)
+ print(anova_table)
+ elif out == 'stata':
+ print_stata_summary(model, summary_df, conf_intervals, level)
+ else:
+ raise ValueError("Unsupported summary type specified.")
- warnings.filterwarnings("ignore", message="kurtosistest only valid for n>=20")
- warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ValueWarning)
- results_summary = model.summary(alpha=alpha)
- results_as_html = results_summary.tables[1].as_html()
- summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]
- summary_df = format_summary(summary_df, alpha)
-
- p_values = model.pvalues
- conf_intervals = model.conf_int(alpha=alpha)
- r_squared = model.rsquared
- adj_r_squared = model.rsquared_adj
- f_statistic = model.fvalue
- f_p_value = model.f_pvalue
- log_likelihood = model.llf
- aic = model.aic
- bic = model.bic
- RSS = np.sum(model.resid**2)
- df = model.df_resid
- RSE = np.sqrt(RSS / df)
- n_obs = int(model.nobs)
- df_model = model.df_model
- df_resid = model.df_resid
- mse_model = model.mse_model
- mse_resid = model.mse_resid
-
- if out == 'r':
- print_r_summary(model, summary_df, RSE, r_squared, adj_r_squared, f_statistic, f_p_value)
- elif out == 'simple':
- print("Summary of Regression Analysis:")
- print("======================================================")
- print("\nCoefficients:")
- print("------------------------------------------------------")
- print(summary_df)
- print("\nModel Statistics:")
- print("------------------------------------------------------")
- print(f"R-squared: {r_squared:.4f} AIC: {aic:.4f}")
- print(f"Adj. R-squared: {adj_r_squared:.4f} BIC: {bic:.4f}")
- print(f"F-statistic: {f_statistic:.2f} on {int(df_model)} and {int(df_resid)} DF, p-value: {f_p_value:.6f}")
- print("======================================================")
- elif out in ['coefficients', 'coef']:
- print(model.summary(alpha=alpha).tables[1])
- elif out == 'anova':
- anova_table = print_anova_table(model)
- print(anova_table)
- elif out == 'stata':
- print_stata_summary(model, summary_df, conf_intervals, level)
+ def summary_logistic(model, out='simple', level=0.95):
+ """
+ Generates a summary of a logistic regression model using statsmodels.
+
+ Args:
+ model: The logistic regression model object from statsmodels.
+ out (str): Type of summary output. Options include 'simple', 'statsmodels', 'R', 'STATA',
+ 'coefficients', and 'ANOVA' (not typically applicable).
+ level (float): Confidence level for the confidence intervals. Default is 0.95.
+
+ Returns:
+ The statsmodels summary object for the model; the view selected by `out` is printed as a side effect.
+ """
+
+ # Suppress Warnings
+ warnings.filterwarnings("ignore", category=FutureWarning,
+ message="The bic value is computed using the deviance formula.*")
+ warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ConvergenceWarning)
+
+ alpha = round(1 - level, 5) # Ensure alpha is correctly formatted
+
+ results_summary = model.summary(alpha=alpha)
+ results_as_html = results_summary.tables[1].as_html()
+ summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]
+ summary_df = format_summary(summary_df, alpha)
+
+ conf_intervals = model.conf_int(alpha=alpha)
+ log_likelihood = model.llf
+ aic = model.aic
+ bic = model.bic
+ pseudo_r_squared = model.pseudo_rsquared(kind='cs')
+ n_obs = int(model.nobs)
+
+ if out == 'r':
+ print("Not currently available for logistic regression.")
+ elif out == 'simple':
+ print("Summary of Logistic Regression Analysis:")
+ print("======================================================")
+ print("\nCoefficients (Odds Ratios):")
+ print("------------------------------------------------------")
+ print(summary_df)
+ print("\nModel Statistics:")
+ print("------------------------------------------------------")
+ print(f"Log-Likelihood: {log_likelihood:.4f} AIC: {aic:.4f}")
+ print(f"Pseudo R-squared: {pseudo_r_squared:.4f} BIC: {bic:.4f}")
+ print("======================================================")
+ elif out in ['coefficients', 'coef']:
+ print(results_summary.tables[1])
+ elif out == 'anova':
+ print("ANOVA table not applicable for logistic regression.")
+ elif out == 'stata':
+ print("Not currently available for logistic regression.")
+ else:
+ raise ValueError("Unsupported summary type specified.")
+
+ return results_summary
+
+ model_type = print_model_type(model)
+
+ if model_type == "ols":
+ summary_ols(model, out, level)
+ elif model_type == "glm":
+ summary_logistic(model, out, level)
else:
- raise ValueError("Unsupported summary type specified.")
+ raise ValueError("Unsupported model type.")
diff --git a/build/lib/pregress/plots/barplot.py b/build/lib/pregress/plots/barplot.py
index bd05dd4..e87f39d 100644
--- a/build/lib/pregress/plots/barplot.py
+++ b/build/lib/pregress/plots/barplot.py
@@ -22,6 +22,9 @@ def barplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Barplots
Returns:
None. The function creates and shows bar plots.
"""
+ if isinstance(formula, pd.DataFrame):
+ data = formula
+ formula = None
if formula is not None:
formula = formula + "+0"
Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
diff --git a/build/lib/pregress/plots/boxplot.py b/build/lib/pregress/plots/boxplot.py
index f60f798..8e7f5ed 100644
--- a/build/lib/pregress/plots/boxplot.py
+++ b/build/lib/pregress/plots/boxplot.py
@@ -9,10 +9,10 @@ def boxplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Boxplots
Generates and prints boxplots for all numeric variables specified in the formula or all numeric variables in the data if no formula is provided.
Args:
- formula (str, optional): Formula to define the model (dependent ~ independent).
+ formula (str, optional): Formula to define the model (Y ~ X).
data (DataFrame, optional): Data frame containing the data.
- xcolor (str, optional): Color of the boxplots for the independent variables.
- ycolor (str, optional): Color of the boxplots for the dependent variable.
+ xcolor (str, optional): Color of the boxplots for the predictor variables.
+ ycolor (str, optional): Color of the boxplots for the response variable.
main (str, optional): Title of the plot.
xlab (str, optional): Label for the x-axis.
ylab (str, optional): Label for the y-axis.
@@ -67,4 +67,4 @@ def boxplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Boxplots
if subplot is None:
plt.show()
plt.clf()
- plt.close()
\ No newline at end of file
+ plt.close()
diff --git a/build/lib/pregress/plots/hist.py b/build/lib/pregress/plots/hist.py
index ac8444b..1121639 100644
--- a/build/lib/pregress/plots/hist.py
+++ b/build/lib/pregress/plots/hist.py
@@ -3,8 +3,9 @@
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm as normal_dist
+import inspect
-def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, ylab="Frequency", subplot = None):
+def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, ylab="Frequency", subplot=None):
"""
Generates and prints a histogram for a given vector.
@@ -16,6 +17,7 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None,
main (str, optional): Title for the histogram.
xlab (str, optional): Label for the x-axis.
ylab (str, optional): Label for the y-axis.
+ subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index).
Returns:
None. The function creates and shows the histogram.
@@ -31,9 +33,11 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None,
xlab = [var_name for var_name, var_val in callers_local_vars if var_val is vector]
xlab = xlab[0] if xlab else 'Variable'
- # Clear any existing plots
- plt.clf()
- plt.close()
+ # If a subplot is specified, create a subplot within the grid
+ if subplot:
+ plt.subplot(*subplot)
+ else:
+ plt.figure()
# Create the histogram
sns.histplot(vector, bins=bins, kde=False, color=color, edgecolor='black')
@@ -51,13 +55,6 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None,
plt.xlabel(xlab)
plt.ylabel(ylab)
- # Show the plot if subplot is not specified or if it is the last subplot
+ # Only show the plot if it's not part of a subplot
if subplot is None:
plt.show()
- plt.clf()
- plt.close()
-
-
-
-
-
diff --git a/build/lib/pregress/plots/hist_res.py b/build/lib/pregress/plots/hist_res.py
index ad96b34..0393dee 100644
--- a/build/lib/pregress/plots/hist_res.py
+++ b/build/lib/pregress/plots/hist_res.py
@@ -1,23 +1,33 @@
-from .hist import hist
from pregress.modeling.fit import fit
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats # Import for statistical functions
-def hist_res(model, subplot=None):
+
+def hist_res(model, main="Histogram of Residuals", xlab="Residuals", ylab="Density", subplot=None):
"""
Plots a histogram of the residuals of a fitted statsmodels regression model and overlays a normal distribution curve.
Args:
model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model.
+ main (str, optional): Title for the histogram plot.
+ xlab (str, optional): Label for the x-axis.
+ ylab (str, optional): Label for the y-axis.
+ subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). If None, a new figure is created.
Returns:
None. Displays a histogram of residuals with a normal distribution curve.
"""
-
+
# Calculate residuals
residuals = model.resid
+ # If a subplot is specified, create the subplot; otherwise, create a new figure
+ if subplot:
+ plt.subplot(*subplot)
+ else:
+ plt.figure()
+
# Plot histogram of the residuals
plt.hist(residuals, bins=30, color='blue', alpha=0.7, density=True, label='Residuals Histogram')
@@ -32,15 +42,13 @@ def hist_res(model, subplot=None):
# Plot the normal distribution curve
plt.plot(x, p, 'k', linewidth=2, label='Normal Distribution')
- # Update the title
- plt.title('Histogram of Residuals')
-
- # Add labels and move the legend to the upper left corner
- plt.xlabel('Residuals')
- plt.ylabel('Density')
+ # Set the title and axis labels from the main, xlab and ylab arguments
+ plt.title(main)
+ plt.xlabel(xlab)
+ plt.ylabel(ylab)
plt.legend(loc='upper left')
- # Show the plot if subplot is not specified
+ # Show the plot only if no subplot is provided
if subplot is None:
plt.show()
plt.clf()
diff --git a/build/lib/pregress/plots/plot_cor.py b/build/lib/pregress/plots/plot_cor.py
index 30fb531..7cb7a39 100644
--- a/build/lib/pregress/plots/plot_cor.py
+++ b/build/lib/pregress/plots/plot_cor.py
@@ -2,32 +2,54 @@
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
+from pregress.modeling.parse_formula import parse_formula
-def plot_cor(df, main='Correlation Matrix', subplot=None):
+def plot_cor(formula, data=None, main='Correlation Matrix', subplot=None, **kwargs):
"""
- Generates a heatmap for the correlation matrix of a DataFrame.
+ Generates a heatmap for the correlation matrix of a dataframe.
Args:
- df (pandas.DataFrame): The DataFrame for which to compute the correlation matrix.
+ formula (str or pandas.DataFrame): The formula or dataframe for which to compute the correlation matrix.
+ data (pandas.DataFrame, optional): The dataframe for formula evaluation if a formula is provided.
main (str, optional): Main title of the plot.
- xlab (str, optional): Label for the x-axis.
- ylab (str, optional): Label for the y-axis.
+ subplot (optional): Subplot for embedding the heatmap.
+ kwargs: Additional keyword arguments for sns.heatmap() (e.g., annot, cmap, square, vmax, vmin, linewidths, etc.)
Returns:
None. Displays the heatmap.
"""
+
+ if isinstance(formula, pd.DataFrame):
+ data = formula
+ formula = None
+
+ if formula is not None:
+ formula = formula + "+0"
+ Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
+ # Combine Y and X data for the correlation matrix
+ data = pd.concat([pd.Series(Y_out, name=Y_name), X_out], axis=1)
+
# Calculate the correlation matrix
- corr_matrix = df.corr()
+ corr_matrix = data.corr()
# Set the diagonal elements to NaN to make them white
np.fill_diagonal(corr_matrix.values, np.nan)
- # Create a custom colormap with black for NaN values
- cmap = sns.color_palette("coolwarm", as_cmap=True)
- cmap.set_bad(color='black')
+ # Set default values if not already provided in kwargs
+ kwargs.setdefault('annot', True)
+ kwargs.setdefault('square', True)
+ kwargs.setdefault('vmax', 1)
+ kwargs.setdefault('vmin', -1)
+ kwargs.setdefault('linewidths', 0.5)
+
+ # If cmap is not provided in kwargs, set a default cmap with NaN handling
+ if 'cmap' not in kwargs:
+ cmap = sns.color_palette("coolwarm", as_cmap=True)
+ cmap.set_bad(color='black') # Make NaN values appear in black
+ kwargs['cmap'] = cmap
- # Draw the heatmap
- sns.heatmap(corr_matrix, annot=True, cmap=cmap, vmax=1, vmin=-1, square=True, linewidths=.5)
+ # Draw the heatmap, passing in all kwargs dynamically
+ sns.heatmap(corr_matrix, **kwargs)
# Add main title
plt.title(main, fontsize=18)
diff --git a/build/lib/pregress/plots/plot_res.py b/build/lib/pregress/plots/plot_res.py
index dfd5e16..cc14ed3 100644
--- a/build/lib/pregress/plots/plot_res.py
+++ b/build/lib/pregress/plots/plot_res.py
@@ -2,37 +2,40 @@
import numpy as np
import scipy.stats as stats
+
def plot_res(model, subplot=None):
"""
Plots the residuals of a fitted statsmodels regression model.
Args:
model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model.
+ subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). If None, a new figure is created.
Returns:
None. Displays a residual plot.
"""
-
+
# Calculate residuals
residuals = model.resid
# Calculate fitted values
fitted = model.predict()
+ # If a subplot is specified, create the subplot; otherwise, create a new figure
+ if subplot:
+ plt.subplot(*subplot)
+ else:
+ plt.figure()
+
# Create the residual plot
plt.scatter(fitted, residuals, color='blue')
plt.axhline(0, color='red', linestyle='--') # Adds a horizontal line at zero
plt.xlabel('Fitted values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
-
- # Show the plot if subplot is not specified
+
+ # Show the plot only if no subplot is provided
if subplot is None:
plt.show()
plt.clf()
plt.close()
-
-
-
-
-
diff --git a/dist/.DS_Store b/dist/.DS_Store
deleted file mode 100644
index 5008ddf..0000000
Binary files a/dist/.DS_Store and /dev/null differ
diff --git a/dist/pregress-1.0.2.tar.gz b/dist/pregress-1.0.2.tar.gz
deleted file mode 100644
index 98c3c68..0000000
Binary files a/dist/pregress-1.0.2.tar.gz and /dev/null differ
diff --git a/dist/pregress-1.0.2-py3-none-any.whl b/dist/pregress-1.0.4-py3-none-any.whl
similarity index 90%
rename from dist/pregress-1.0.2-py3-none-any.whl
rename to dist/pregress-1.0.4-py3-none-any.whl
index 15ce070..4cd2e17 100644
Binary files a/dist/pregress-1.0.2-py3-none-any.whl and b/dist/pregress-1.0.4-py3-none-any.whl differ
diff --git a/dist/pregress-1.0.4.tar.gz b/dist/pregress-1.0.4.tar.gz
new file mode 100644
index 0000000..486d551
Binary files /dev/null and b/dist/pregress-1.0.4.tar.gz differ
diff --git a/pregress.egg-info/PKG-INFO b/pregress.egg-info/PKG-INFO
index 7884716..60324fc 100644
--- a/pregress.egg-info/PKG-INFO
+++ b/pregress.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: pregress
-Version: 1.0.2
+Version: 1.0.4
Summary: Python Regression Analysis.
Home-page: https://github.com/danmcgib/pregress
Author: Daniel McGibney
diff --git a/pregress/.DS_Store b/pregress/.DS_Store
index 0738cae..309b31b 100644
Binary files a/pregress/.DS_Store and b/pregress/.DS_Store differ
diff --git a/pregress/__pycache__/__init__.cpython-311.pyc b/pregress/__pycache__/__init__.cpython-311.pyc
index 06e322c..f06eaa4 100644
Binary files a/pregress/__pycache__/__init__.cpython-311.pyc and b/pregress/__pycache__/__init__.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/__init__.cpython-311.pyc b/pregress/modeling/__pycache__/__init__.cpython-311.pyc
index 1b4a4c4..758ed59 100644
Binary files a/pregress/modeling/__pycache__/__init__.cpython-311.pyc and b/pregress/modeling/__pycache__/__init__.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/add_explicit_variable.cpython-311.pyc b/pregress/modeling/__pycache__/add_explicit_variable.cpython-311.pyc
index b2cee21..c3eefb8 100644
Binary files a/pregress/modeling/__pycache__/add_explicit_variable.cpython-311.pyc and b/pregress/modeling/__pycache__/add_explicit_variable.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/apply_transformation.cpython-311.pyc b/pregress/modeling/__pycache__/apply_transformation.cpython-311.pyc
index 13e81d1..241aef0 100644
Binary files a/pregress/modeling/__pycache__/apply_transformation.cpython-311.pyc and b/pregress/modeling/__pycache__/apply_transformation.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/box_cox.cpython-310.pyc b/pregress/modeling/__pycache__/box_cox.cpython-310.pyc
index 6c59550..f7e1601 100644
Binary files a/pregress/modeling/__pycache__/box_cox.cpython-310.pyc and b/pregress/modeling/__pycache__/box_cox.cpython-310.pyc differ
diff --git a/pregress/modeling/__pycache__/box_cox.cpython-311.pyc b/pregress/modeling/__pycache__/box_cox.cpython-311.pyc
new file mode 100644
index 0000000..dd14eec
Binary files /dev/null and b/pregress/modeling/__pycache__/box_cox.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/bp_test.cpython-311.pyc b/pregress/modeling/__pycache__/bp_test.cpython-311.pyc
new file mode 100644
index 0000000..0bdc902
Binary files /dev/null and b/pregress/modeling/__pycache__/bp_test.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/bsr.cpython-311.pyc b/pregress/modeling/__pycache__/bsr.cpython-311.pyc
new file mode 100644
index 0000000..8bf6242
Binary files /dev/null and b/pregress/modeling/__pycache__/bsr.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/extract_variable.cpython-311.pyc b/pregress/modeling/__pycache__/extract_variable.cpython-311.pyc
index ec0e1a7..7ff68f0 100644
Binary files a/pregress/modeling/__pycache__/extract_variable.cpython-311.pyc and b/pregress/modeling/__pycache__/extract_variable.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/fit.cpython-310.pyc b/pregress/modeling/__pycache__/fit.cpython-310.pyc
index a579c79..39ef806 100644
Binary files a/pregress/modeling/__pycache__/fit.cpython-310.pyc and b/pregress/modeling/__pycache__/fit.cpython-310.pyc differ
diff --git a/pregress/modeling/__pycache__/fit.cpython-311.pyc b/pregress/modeling/__pycache__/fit.cpython-311.pyc
index 40f8a6d..b4a4470 100644
Binary files a/pregress/modeling/__pycache__/fit.cpython-311.pyc and b/pregress/modeling/__pycache__/fit.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/intervals.cpython-311.pyc b/pregress/modeling/__pycache__/intervals.cpython-311.pyc
new file mode 100644
index 0000000..da1f725
Binary files /dev/null and b/pregress/modeling/__pycache__/intervals.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/parse_formula.cpython-311.pyc b/pregress/modeling/__pycache__/parse_formula.cpython-311.pyc
index 3349b56..5189347 100644
Binary files a/pregress/modeling/__pycache__/parse_formula.cpython-311.pyc and b/pregress/modeling/__pycache__/parse_formula.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/predict.cpython-311.pyc b/pregress/modeling/__pycache__/predict.cpython-311.pyc
index 70f00a5..f9f11d1 100644
Binary files a/pregress/modeling/__pycache__/predict.cpython-311.pyc and b/pregress/modeling/__pycache__/predict.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/shapiro_test.cpython-311.pyc b/pregress/modeling/__pycache__/shapiro_test.cpython-311.pyc
new file mode 100644
index 0000000..3848bce
Binary files /dev/null and b/pregress/modeling/__pycache__/shapiro_test.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/step.cpython-311.pyc b/pregress/modeling/__pycache__/step.cpython-311.pyc
new file mode 100644
index 0000000..0fe6838
Binary files /dev/null and b/pregress/modeling/__pycache__/step.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/summary.cpython-310.pyc b/pregress/modeling/__pycache__/summary.cpython-310.pyc
index ed4d0b7..44822f5 100644
Binary files a/pregress/modeling/__pycache__/summary.cpython-310.pyc and b/pregress/modeling/__pycache__/summary.cpython-310.pyc differ
diff --git a/pregress/modeling/__pycache__/summary.cpython-311.pyc b/pregress/modeling/__pycache__/summary.cpython-311.pyc
index 1daf53b..f8f9edd 100644
Binary files a/pregress/modeling/__pycache__/summary.cpython-311.pyc and b/pregress/modeling/__pycache__/summary.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/vif.cpython-311.pyc b/pregress/modeling/__pycache__/vif.cpython-311.pyc
new file mode 100644
index 0000000..922cc40
Binary files /dev/null and b/pregress/modeling/__pycache__/vif.cpython-311.pyc differ
diff --git a/pregress/modeling/__pycache__/xy_split.cpython-311.pyc b/pregress/modeling/__pycache__/xy_split.cpython-311.pyc
new file mode 100644
index 0000000..6163e37
Binary files /dev/null and b/pregress/modeling/__pycache__/xy_split.cpython-311.pyc differ
diff --git a/pregress/modeling/box_cox.py b/pregress/modeling/box_cox.py
index 5ce1bd5..c41788f 100644
--- a/pregress/modeling/box_cox.py
+++ b/pregress/modeling/box_cox.py
@@ -1,44 +1,49 @@
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import boxcox, boxcox_llf
-import statsmodels.api as sm
-import statsmodels.formula.api as smf
def box_cox(model):
- """
- Perform a Box-Cox transformation on the response variable of a given statsmodels regression results object,
- output a plot of the log-likelihood as a function of lambda, the fitted lambda, and the 95% confidence interval.
-
- Args:
- model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model.
- """
- # Extract the response variable
y = model.model.endog
-
- # Perform the Box-Cox transformation
+ if np.any(y <= 0):
+ raise ValueError("All values in the response variable must be positive for Box-Cox transformation.")
+
y_transformed, fitted_lambda = boxcox(y)
-
- # Calculate the log-likelihood for different lambda values using boxcox_llf
- lambdas = np.linspace(-2, 2, 100)
+
+ # Evaluate the log-likelihood on a 100-point lambda grid over [-3, 3]; the CI search below uses this wider grid
+ lambdas = np.linspace(-3, 3, 100)
log_likelihood = [boxcox_llf(lmbda, y) for lmbda in lambdas]
-
- # Calculate the 95% confidence interval
+
+ # Plot lambdas from -2.1 to 2.1
+ plot_lambdas = lambdas[(lambdas >= -2.1) & (lambdas <= 2.1)]
+ plot_log_likelihood = [boxcox_llf(lmbda, y) for lmbda in plot_lambdas]
+
max_log_likelihood = boxcox_llf(fitted_lambda, y)
ci_cutoff = max_log_likelihood - 1.92 # Chi-squared distribution cutoff for 95% CI (1 degree of freedom)
ci_lambdas = lambdas[np.array(log_likelihood) >= ci_cutoff]
-
- lambda_lower = ci_lambdas[0]
- lambda_upper = ci_lambdas[-1]
-
- # Plot the log-likelihood as a function of lambda
+
plt.figure(figsize=(10, 6))
- plt.plot(lambdas, log_likelihood, label='Log-Likelihood')
- plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Fitted Lambda: {fitted_lambda:.4f}')
- plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}')
- plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}')
+
+ # Plot the restricted range of log-likelihood from -2.1 to 2.1
+ plt.plot(plot_lambdas, plot_log_likelihood, label='Log-Likelihood Function')
+
+ # Set xlim to focus on the typical range of -2 to 2
+ plt.xlim([-2, 2])
+
+ # Set ylim from the minimum log-likelihood up to the maximum plus 5% of the range (headroom at the top only)
+ plt.ylim([min(plot_log_likelihood), max(plot_log_likelihood)+.05* (max(plot_log_likelihood) - min(plot_log_likelihood))])
+
+ if -2 <= fitted_lambda <= 2:
+ lambda_lower = ci_lambdas[0]
+ lambda_upper = ci_lambdas[-1]
+ plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}')
+ plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Best Lambda: {fitted_lambda:.4f}')
+ plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}')
+ else:
+ print(f"The fitted_lambda is {fitted_lambda:.4f}, which is outside the typical range of -2 to 2. CI lines not plotted.")
+
plt.xlabel('Lambda')
plt.ylabel('Log-Likelihood')
- plt.title('Box-Cox Transformation Log-Likelihood with 95% CI')
- plt.legend(loc='upper left')
+ plt.title('Log-Likelihood for Box-Cox Transformation')
+ plt.legend(loc='lower right')
plt.grid(True)
plt.show()
diff --git a/pregress/modeling/fit.py b/pregress/modeling/fit.py
index a6bc58d..03dde46 100644
--- a/pregress/modeling/fit.py
+++ b/pregress/modeling/fit.py
@@ -2,7 +2,7 @@
import statsmodels.api as sm
import pandas as pd
-def fit(formula, data=None, method = "ols", dummies = True):
+def fit(formula: str, data: pd.DataFrame = None, method: str = "ols", dummies: bool = True):
"""
Fits a statistical model based on a specified formula and data.
@@ -10,44 +10,69 @@ def fit(formula, data=None, method = "ols", dummies = True):
- formula (str): A string representing the statistical formula (e.g., 'Y ~ X1 + X2 - X3').
- data (DataFrame, optional): The dataset containing the variables specified in the formula.
- method (str, optional): The method used for fitting the model. Defaults to 'ols' (Ordinary Least Squares).
- Other methods can be implemented, such as logistic regression, random forest, etc.
+ Supported methods: 'ols' for linear regression, 'logistic' for logistic regression.
- dummies (bool, optional): A boolean indicating whether to automatically create dummy variables for categorical
predictors. Defaults to True.
Returns:
- - model (statsmodels object): The fitted model object, which can be used for further analysis, such as
+ - model (statsmodels object): The fitted model object, which can be used for further analysis, such as
making predictions or evaluating model performance.
Raises:
- ValueError: If the input data is empty or the specified variables are not found in the data.
+ - NotImplementedError: If an unsupported method is specified.
Notes:
- - The function currently supports OLS (Ordinary Least Squares) regression. Additional methods like logistic
- regression, random forest, and k-nearest neighbors can be added as needed.
- - The 'parse_formula' function is used to parse the formula and extract the response and predictor variables
- from the dataset.
- - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator
- variables, with the first category dropped to avoid multicollinearity. Additionally, binary variables
- (True/False) are converted to numeric (0/1) values.
+ - The function currently supports OLS (Ordinary Least Squares) and logistic regression.
+ Additional methods like random forest or k-nearest neighbors could be added as needed.
+ - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator
+ variables, with the first category dropped to avoid multicollinearity. Binary variables (True/False) are
+ converted to numeric (0/1) values.
"""
-
+
+ def process_dummies(X_out):
+ """Helper function to handle dummy variables and binary conversions."""
+ X_out = pd.get_dummies(X_out, drop_first=True)
+
+ # Convert binary variables (True/False) to numeric (0/1)
+ binary_columns = X_out.select_dtypes(include=['bool']).columns
+ X_out[binary_columns] = X_out[binary_columns].astype(int)
+ return X_out
+
+ def check_response_and_convert(Y_out):
+ """Convert categorical response variable to dummies if necessary."""
+ if not pd.api.types.is_numeric_dtype(Y_out):
+ Y_out = pd.get_dummies(Y_out, drop_first=True)
+ if Y_out.shape[1] > 1:
+ raise ValueError("Response variable was converted to multiple columns, indicating it is multi-class. "
+ "This function currently supports binary response variables only.")
+ return Y_out
+
Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
-
- if method.lower() == "ols":
- if dummies:
-
- X_out = pd.get_dummies(X_out, drop_first=True)
-
- # Convert binary variables (True/False) to numeric (0/1)
- binary_columns = X_out.select_dtypes(include=['bool']).columns
- X_out[binary_columns] = X_out[binary_columns].astype(int)
- if X_out.empty:
- raise ValueError("The input data is empty or the specified variables are not found in the data.")
+ # Ensure Y_out is a Series and retains its name
+ if isinstance(Y_out, (pd.Series, pd.DataFrame)):
+ Y_out.name = Y_name # Retain the response variable's name
+ else:
+ # Convert numpy array to pandas Series and set name
+ Y_out = pd.Series(Y_out, name=Y_name)
+
+ if X_out.empty:
+ raise ValueError("The input data is empty or the specified variables are not found in the data.")
+ if dummies:
+ X_out = process_dummies(X_out)
+
+ if method.lower() == "ols":
model = sm.OLS(Y_out, X_out).fit()
-# if method.lower() == "logistic":
-# if method.lower() == "rf":
-# if method.lower() == "knn":
+ elif method.lower() == "logistic":
+ # Process the response variable to ensure it is numeric or binary
+ Y_out = check_response_and_convert(Y_out)
+ model = sm.GLM(Y_out, X_out, family=sm.families.Binomial()).fit()
+
+ else:
+ raise NotImplementedError(f"Method '{method}' is not implemented. Supported methods: 'ols', 'logistic'.")
+
return model
+
diff --git a/pregress/modeling/summary.py b/pregress/modeling/summary.py
index bd8453f..1965a0b 100644
--- a/pregress/modeling/summary.py
+++ b/pregress/modeling/summary.py
@@ -1,14 +1,15 @@
-from .format_summary import format_summary
-from .print_r_summary import print_r_summary
-from .print_anova_table import print_anova_table
-from .print_stata_summary import print_stata_summary
-from .significance_code import significance_code
+from pregress.modeling.format_summary import format_summary
+from pregress.modeling.print_r_summary import print_r_summary
+from pregress.modeling.print_anova_table import print_anova_table
+from pregress.modeling.print_stata_summary import print_stata_summary
+from pregress.modeling.significance_code import significance_code
import numpy as np
import pandas as pd
import statsmodels.api as sm
import warnings
from io import StringIO
+
def summary(model, out='simple', level=0.95):
"""
Generates and prints a summary of the regression model fit. The default summary is 'simple',
@@ -33,52 +34,140 @@ def summary(model, out='simple', level=0.95):
if out in ['statsmodels', 'stats']:
print(model.summary(alpha=alpha))
return
+
+ def print_model_type(model):
+ if isinstance(model, sm.regression.linear_model.RegressionResultsWrapper):
+ if isinstance(model.model, sm.OLS):
+ model_type = "ols"
+ elif isinstance(model.model, sm.Logit):
+ model_type = "logit"
+ elif isinstance(model.model, sm.GLM):
+ if isinstance(model.model.family, sm.families.Binomial):
+ model_type = "glm"
+ else:
+ model_type = "glm_nonlogit"
+ else:
+ model_type = "Other Regression Model"
+ else:
+ model_type = "Unsupported model type."
+
+ return model_type
+
+ def summary_ols(model, out='simple', level=0.95):
+
+ alpha = round(1 - level, 5)  # Round away floating-point error from the subtraction (1 - 0.95 gives 0.050000000000000044)
+
+ warnings.filterwarnings("ignore", message="kurtosistest only valid for n>=20")
+ warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ValueWarning)
+ results_summary = model.summary(alpha=alpha)
+ results_as_html = results_summary.tables[1].as_html()
+ summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]
+ summary_df = format_summary(summary_df, alpha)
+
+ p_values = model.pvalues
+ conf_intervals = model.conf_int(alpha=alpha)
+ r_squared = model.rsquared
+ adj_r_squared = model.rsquared_adj
+ f_statistic = model.fvalue
+ f_p_value = model.f_pvalue
+ log_likelihood = model.llf
+ aic = model.aic
+ bic = model.bic
+ RSS = np.sum(model.resid**2)
+ df = model.df_resid
+ RSE = np.sqrt(RSS / df)
+ n_obs = int(model.nobs)
+ df_model = model.df_model
+ df_resid = model.df_resid
+ mse_model = model.mse_model
+ mse_resid = model.mse_resid
+
+ if out == 'r':
+ print_r_summary(model, summary_df, RSE, r_squared, adj_r_squared, f_statistic, f_p_value)
+ elif out == 'simple':
+ print("Summary of Regression Analysis:")
+ print("======================================================")
+ print("\nCoefficients:")
+ print("------------------------------------------------------")
+ print(summary_df)
+ print("\nModel Statistics:")
+ print("------------------------------------------------------")
+ print(f"R-squared: {r_squared:.4f} AIC: {aic:.4f}")
+ print(f"Adj. R-squared: {adj_r_squared:.4f} BIC: {bic:.4f}")
+ print(f"F-statistic: {f_statistic:.2f} on {int(df_model)} and {int(df_resid)} DF, p-value: {f_p_value:.6f}")
+ print("======================================================")
+ elif out in ['coefficients', 'coef']:
+ print(model.summary(alpha=alpha).tables[1])
+ elif out == 'anova':
+ anova_table = print_anova_table(model)
+ print(anova_table)
+ elif out == 'stata':
+ print_stata_summary(model, summary_df, conf_intervals, level)
+ else:
+ raise ValueError("Unsupported summary type specified.")
- warnings.filterwarnings("ignore", message="kurtosistest only valid for n>=20")
- warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ValueWarning)
- results_summary = model.summary(alpha=alpha)
- results_as_html = results_summary.tables[1].as_html()
- summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]
- summary_df = format_summary(summary_df, alpha)
-
- p_values = model.pvalues
- conf_intervals = model.conf_int(alpha=alpha)
- r_squared = model.rsquared
- adj_r_squared = model.rsquared_adj
- f_statistic = model.fvalue
- f_p_value = model.f_pvalue
- log_likelihood = model.llf
- aic = model.aic
- bic = model.bic
- RSS = np.sum(model.resid**2)
- df = model.df_resid
- RSE = np.sqrt(RSS / df)
- n_obs = int(model.nobs)
- df_model = model.df_model
- df_resid = model.df_resid
- mse_model = model.mse_model
- mse_resid = model.mse_resid
-
- if out == 'r':
- print_r_summary(model, summary_df, RSE, r_squared, adj_r_squared, f_statistic, f_p_value)
- elif out == 'simple':
- print("Summary of Regression Analysis:")
- print("======================================================")
- print("\nCoefficients:")
- print("------------------------------------------------------")
- print(summary_df)
- print("\nModel Statistics:")
- print("------------------------------------------------------")
- print(f"R-squared: {r_squared:.4f} AIC: {aic:.4f}")
- print(f"Adj. R-squared: {adj_r_squared:.4f} BIC: {bic:.4f}")
- print(f"F-statistic: {f_statistic:.2f} on {int(df_model)} and {int(df_resid)} DF, p-value: {f_p_value:.6f}")
- print("======================================================")
- elif out in ['coefficients', 'coef']:
- print(model.summary(alpha=alpha).tables[1])
- elif out == 'anova':
- anova_table = print_anova_table(model)
- print(anova_table)
- elif out == 'stata':
- print_stata_summary(model, summary_df, conf_intervals, level)
+ def summary_logistic(model, out='simple', level=0.95):
+ """
+ Generates a summary of a logistic regression model using statsmodels.
+
+ Args:
+ model: The logistic regression model object from statsmodels.
+ out (str): Type of summary output. Options include 'simple', 'statsmodels', 'R', 'STATA',
+ 'coefficients', and 'ANOVA' (not typically applicable).
+ level (float): Confidence level for the confidence intervals. Default is 0.95.
+
+ Returns:
+ The statsmodels summary object from model.summary(alpha=alpha); the output selected by `out` is printed, not returned.
+ """
+
+ # Suppress Warnings
+ warnings.filterwarnings("ignore", category=FutureWarning,
+ message="The bic value is computed using the deviance formula.*")
+ warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ConvergenceWarning)
+
+ alpha = round(1 - level, 5)  # Round away floating-point error from the subtraction (1 - 0.95 gives 0.050000000000000044)
+
+ results_summary = model.summary(alpha=alpha)
+ results_as_html = results_summary.tables[1].as_html()
+ summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0]
+ summary_df = format_summary(summary_df, alpha)
+
+ conf_intervals = model.conf_int(alpha=alpha)
+ log_likelihood = model.llf
+ aic = model.aic
+ bic = model.bic
+ pseudo_r_squared = model.pseudo_rsquared(kind='cs')
+ n_obs = int(model.nobs)
+
+ if out == 'r':
+ print("Not currently available for logistic regression.")
+ elif out == 'simple':
+ print("Summary of Logistic Regression Analysis:")
+ print("======================================================")
+ print("\nCoefficients (Odds Ratios):")
+ print("------------------------------------------------------")
+ print(summary_df)
+ print("\nModel Statistics:")
+ print("------------------------------------------------------")
+ print(f"Log-Likelihood: {log_likelihood:.4f} AIC: {aic:.4f}")
+ print(f"Pseudo R-squared: {pseudo_r_squared:.4f} BIC: {bic:.4f}")
+ print("======================================================")
+ elif out in ['coefficients', 'coef']:
+ print(results_summary.tables[1])
+ elif out == 'anova':
+ print("ANOVA table not applicable for logistic regression.")
+ elif out == 'stata':
+ print("Not currently available for logistic regression.")
+ else:
+ raise ValueError("Unsupported summary type specified.")
+
+ return results_summary
+
+ model_type = print_model_type(model)
+
+ if model_type == "ols":
+ summary_ols(model, out, level)
+ elif model_type == "glm":
+ summary_logistic(model, out, level)
else:
- raise ValueError("Unsupported summary type specified.")
+ raise ValueError("Unsupported model type.")
diff --git a/pregress/plots/__pycache__/__init__.cpython-311.pyc b/pregress/plots/__pycache__/__init__.cpython-311.pyc
index 84218b6..52226ce 100644
Binary files a/pregress/plots/__pycache__/__init__.cpython-311.pyc and b/pregress/plots/__pycache__/__init__.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/abline.cpython-311.pyc b/pregress/plots/__pycache__/abline.cpython-311.pyc
new file mode 100644
index 0000000..9d3b53c
Binary files /dev/null and b/pregress/plots/__pycache__/abline.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/barplot.cpython-310.pyc b/pregress/plots/__pycache__/barplot.cpython-310.pyc
index 91341bb..20eda64 100644
Binary files a/pregress/plots/__pycache__/barplot.cpython-310.pyc and b/pregress/plots/__pycache__/barplot.cpython-310.pyc differ
diff --git a/pregress/plots/__pycache__/barplot.cpython-311.pyc b/pregress/plots/__pycache__/barplot.cpython-311.pyc
index f40b6f8..e6ef137 100644
Binary files a/pregress/plots/__pycache__/barplot.cpython-311.pyc and b/pregress/plots/__pycache__/barplot.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/boxplot.cpython-310.pyc b/pregress/plots/__pycache__/boxplot.cpython-310.pyc
index 47f809a..a968b7d 100644
Binary files a/pregress/plots/__pycache__/boxplot.cpython-310.pyc and b/pregress/plots/__pycache__/boxplot.cpython-310.pyc differ
diff --git a/pregress/plots/__pycache__/boxplot.cpython-311.pyc b/pregress/plots/__pycache__/boxplot.cpython-311.pyc
index 84e26e0..c789cee 100644
Binary files a/pregress/plots/__pycache__/boxplot.cpython-311.pyc and b/pregress/plots/__pycache__/boxplot.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/hist.cpython-310.pyc b/pregress/plots/__pycache__/hist.cpython-310.pyc
index 2d65345..6c5e6af 100644
Binary files a/pregress/plots/__pycache__/hist.cpython-310.pyc and b/pregress/plots/__pycache__/hist.cpython-310.pyc differ
diff --git a/pregress/plots/__pycache__/hist.cpython-311.pyc b/pregress/plots/__pycache__/hist.cpython-311.pyc
index ef71685..d45eae3 100644
Binary files a/pregress/plots/__pycache__/hist.cpython-311.pyc and b/pregress/plots/__pycache__/hist.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/hist_res.cpython-310.pyc b/pregress/plots/__pycache__/hist_res.cpython-310.pyc
index 7c020fb..26014c9 100644
Binary files a/pregress/plots/__pycache__/hist_res.cpython-310.pyc and b/pregress/plots/__pycache__/hist_res.cpython-310.pyc differ
diff --git a/pregress/plots/__pycache__/hist_res.cpython-311.pyc b/pregress/plots/__pycache__/hist_res.cpython-311.pyc
new file mode 100644
index 0000000..6902b12
Binary files /dev/null and b/pregress/plots/__pycache__/hist_res.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/hists.cpython-311.pyc b/pregress/plots/__pycache__/hists.cpython-311.pyc
index 8efc5ba..00d69c0 100644
Binary files a/pregress/plots/__pycache__/hists.cpython-311.pyc and b/pregress/plots/__pycache__/hists.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/plot_cook.cpython-311.pyc b/pregress/plots/__pycache__/plot_cook.cpython-311.pyc
new file mode 100644
index 0000000..831805b
Binary files /dev/null and b/pregress/plots/__pycache__/plot_cook.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/plot_cor.cpython-310.pyc b/pregress/plots/__pycache__/plot_cor.cpython-310.pyc
index 3e9bf0d..105936b 100644
Binary files a/pregress/plots/__pycache__/plot_cor.cpython-310.pyc and b/pregress/plots/__pycache__/plot_cor.cpython-310.pyc differ
diff --git a/pregress/plots/__pycache__/plot_cor.cpython-311.pyc b/pregress/plots/__pycache__/plot_cor.cpython-311.pyc
index b9828a8..935b96f 100644
Binary files a/pregress/plots/__pycache__/plot_cor.cpython-311.pyc and b/pregress/plots/__pycache__/plot_cor.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/plot_intervals.cpython-311.pyc b/pregress/plots/__pycache__/plot_intervals.cpython-311.pyc
new file mode 100644
index 0000000..b9b2020
Binary files /dev/null and b/pregress/plots/__pycache__/plot_intervals.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/plot_qq.cpython-311.pyc b/pregress/plots/__pycache__/plot_qq.cpython-311.pyc
new file mode 100644
index 0000000..7d9e765
Binary files /dev/null and b/pregress/plots/__pycache__/plot_qq.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/plot_res.cpython-310.pyc b/pregress/plots/__pycache__/plot_res.cpython-310.pyc
index 58c9355..3a26610 100644
Binary files a/pregress/plots/__pycache__/plot_res.cpython-310.pyc and b/pregress/plots/__pycache__/plot_res.cpython-310.pyc differ
diff --git a/pregress/plots/__pycache__/plot_res.cpython-311.pyc b/pregress/plots/__pycache__/plot_res.cpython-311.pyc
new file mode 100644
index 0000000..7d72229
Binary files /dev/null and b/pregress/plots/__pycache__/plot_res.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/plot_xy.cpython-311.pyc b/pregress/plots/__pycache__/plot_xy.cpython-311.pyc
new file mode 100644
index 0000000..5af922e
Binary files /dev/null and b/pregress/plots/__pycache__/plot_xy.cpython-311.pyc differ
diff --git a/pregress/plots/__pycache__/plots.cpython-311.pyc b/pregress/plots/__pycache__/plots.cpython-311.pyc
index 0d79ef1..a28c279 100644
Binary files a/pregress/plots/__pycache__/plots.cpython-311.pyc and b/pregress/plots/__pycache__/plots.cpython-311.pyc differ
diff --git a/pregress/plots/barplot.py b/pregress/plots/barplot.py
index bd05dd4..e87f39d 100644
--- a/pregress/plots/barplot.py
+++ b/pregress/plots/barplot.py
@@ -22,6 +22,9 @@ def barplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Barplots
Returns:
None. The function creates and shows bar plots.
"""
+ if isinstance(formula, pd.DataFrame):
+ data = formula
+ formula = None
if formula is not None:
formula = formula + "+0"
Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
diff --git a/pregress/plots/boxplot.py b/pregress/plots/boxplot.py
index f60f798..8e7f5ed 100644
--- a/pregress/plots/boxplot.py
+++ b/pregress/plots/boxplot.py
@@ -9,10 +9,10 @@ def boxplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Boxplots
Generates and prints boxplots for all numeric variables specified in the formula or all numeric variables in the data if no formula is provided.
Args:
- formula (str, optional): Formula to define the model (dependent ~ independent).
+ formula (str, optional): Formula to define the model (Y ~ X).
data (DataFrame, optional): Data frame containing the data.
- xcolor (str, optional): Color of the boxplots for the independent variables.
- ycolor (str, optional): Color of the boxplots for the dependent variable.
+ xcolor (str, optional): Color of the boxplots for the predictor variables.
+ ycolor (str, optional): Color of the boxplots for the response variable.
main (str, optional): Title of the plot.
xlab (str, optional): Label for the x-axis.
ylab (str, optional): Label for the y-axis.
@@ -67,4 +67,4 @@ def boxplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Boxplots
if subplot is None:
plt.show()
plt.clf()
- plt.close()
\ No newline at end of file
+ plt.close()
diff --git a/pregress/plots/hist.py b/pregress/plots/hist.py
index ac8444b..1121639 100644
--- a/pregress/plots/hist.py
+++ b/pregress/plots/hist.py
@@ -3,8 +3,9 @@
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm as normal_dist
+import inspect
-def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, ylab="Frequency", subplot = None):
+def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, ylab="Frequency", subplot=None):
"""
Generates and prints a histogram for a given vector.
@@ -16,6 +17,7 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None,
main (str, optional): Title for the histogram.
xlab (str, optional): Label for the x-axis.
ylab (str, optional): Label for the y-axis.
+ subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index).
Returns:
None. The function creates and shows the histogram.
@@ -31,9 +33,11 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None,
xlab = [var_name for var_name, var_val in callers_local_vars if var_val is vector]
xlab = xlab[0] if xlab else 'Variable'
- # Clear any existing plots
- plt.clf()
- plt.close()
+ # If a subplot is specified, create a subplot within the grid
+ if subplot:
+ plt.subplot(*subplot)
+ else:
+ plt.figure()
# Create the histogram
sns.histplot(vector, bins=bins, kde=False, color=color, edgecolor='black')
@@ -51,13 +55,6 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None,
plt.xlabel(xlab)
plt.ylabel(ylab)
- # Show the plot if subplot is not specified or if it is the last subplot
+ # Only show the plot if it's not part of a subplot
if subplot is None:
plt.show()
- plt.clf()
- plt.close()
-
-
-
-
-
diff --git a/pregress/plots/hist_res.py b/pregress/plots/hist_res.py
index ad96b34..0393dee 100644
--- a/pregress/plots/hist_res.py
+++ b/pregress/plots/hist_res.py
@@ -1,23 +1,33 @@
-from .hist import hist
from pregress.modeling.fit import fit
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats # Import for statistical functions
-def hist_res(model, subplot=None):
+
+def hist_res(model, main="Histogram of Residuals", xlab="Residuals", ylab="Density", subplot=None):
"""
Plots a histogram of the residuals of a fitted statsmodels regression model and overlays a normal distribution curve.
Args:
model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model.
+ main (str, optional): Title for the histogram plot.
+ xlab (str, optional): Label for the x-axis.
+ ylab (str, optional): Label for the y-axis.
+ subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). If None, a new figure is created.
Returns:
None. Displays a histogram of residuals with a normal distribution curve.
"""
-
+
# Calculate residuals
residuals = model.resid
+ # If a subplot is specified, create the subplot; otherwise, create a new figure
+ if subplot:
+ plt.subplot(*subplot)
+ else:
+ plt.figure()
+
# Plot histogram of the residuals
plt.hist(residuals, bins=30, color='blue', alpha=0.7, density=True, label='Residuals Histogram')
@@ -32,15 +42,13 @@ def hist_res(model, subplot=None):
# Plot the normal distribution curve
plt.plot(x, p, 'k', linewidth=2, label='Normal Distribution')
- # Update the title
- plt.title('Histogram of Residuals')
-
- # Add labels and move the legend to the upper left corner
- plt.xlabel('Residuals')
- plt.ylabel('Density')
+ # Set title and labels using the new arguments
+ plt.title(main)
+ plt.xlabel(xlab)
+ plt.ylabel(ylab)
plt.legend(loc='upper left')
- # Show the plot if subplot is not specified
+ # Show the plot only if no subplot is provided
if subplot is None:
plt.show()
plt.clf()
diff --git a/pregress/plots/hists.py b/pregress/plots/hists.py
index cf08adb..2ab2d88 100644
--- a/pregress/plots/hists.py
+++ b/pregress/plots/hists.py
@@ -6,27 +6,42 @@
from scipy.stats import norm as normal_dist
import warnings
-def hists(formula, data=None, bins=30, xcolor="blue", ycolor="red", norm=False, layout="matrix", subplot=None):
+
+def hists(input_data=None, data=None, bins=30, xcolor="blue", ycolor="red", norm=False, layout="matrix",
+ main="Distribution of Variables", xlab=None, ylab="Frequency", subplot=None):
"""
- Generates and prints histograms for all numeric variables specified in the formula.
+ Generates and prints histograms for all numeric variables specified in the formula or all numeric variables in the DataFrame.
Args:
- formula (str): Formula to define the model (dependent ~ independent).
- data (DataFrame, optional): Data frame containing the data.
+ input_data (str or DataFrame): Formula to define the model (dependent ~ independent), a single column name, or a DataFrame containing the data.
+ data (DataFrame, optional): Data frame containing the data if a formula is provided.
+ bins (int, optional): Number of bins for the histograms.
xcolor (str, optional): Color of the histograms for the independent variables.
ycolor (str, optional): Color of the histograms for the dependent variable.
norm (bool, optional): Whether to include a normal distribution line.
layout (str, optional): Layout of the histograms - "column", "row", or "matrix".
+ main (str, optional): Main title for the plot.
+ xlab (str, optional): Label for the x-axis. Defaults to each variable name if not provided.
+ ylab (str, optional): Label for the y-axis.
+ subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index).
Returns:
None. The function creates and shows histograms.
"""
- formula = formula + "+0"
- Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
-
- # Combine Y and X data for histograms
- plot_data = pd.concat([pd.Series(Y_out, name=Y_name), X_out], axis=1)
+ # Case 1: Handle single variable input without "~"
+ if isinstance(input_data, str) and '~' not in input_data:
+ plot_data = pd.DataFrame({input_data: data[input_data]})
+ Y_name = None
+ # Case 2: Directly given DataFrame
+ elif isinstance(input_data, pd.DataFrame):
+ plot_data = input_data.select_dtypes(include=[np.number])
+ Y_name = None
+ # Case 3: Formula provided
+ else:
+ formula = input_data + "+0"
+ Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
+ plot_data = pd.concat([pd.Series(Y_out, name=Y_name), X_out], axis=1)
# Replace infinite values with NaN
plot_data.replace([np.inf, -np.inf], np.nan, inplace=True)
@@ -47,6 +62,8 @@ def hists(formula, data=None, bins=30, xcolor="blue", ycolor="red", norm=False,
fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 5 * nrows))
axes = np.array(axes).reshape(-1) # Flatten the axes array for easy iteration
+ fig.suptitle(main) # Set the main title for the entire figure
+
with warnings.catch_warnings():
warnings.simplefilter("ignore", category=FutureWarning)
@@ -54,22 +71,26 @@ def hists(formula, data=None, bins=30, xcolor="blue", ycolor="red", norm=False,
ax = axes[i]
color = ycolor if var == Y_name else xcolor
sns.histplot(plot_data[var], bins=bins, kde=False, color=color, ax=ax, edgecolor='black')
+
if norm:
mean = plot_data[var].mean()
std = plot_data[var].std()
x = np.linspace(plot_data[var].min(), plot_data[var].max(), 100)
p = normal_dist.pdf(x, mean, std)
- ax.plot(x, p * (len(plot_data[var]) * np.diff(np.histogram(plot_data[var], bins=30)[1])[0]), 'k', linewidth=2)
+ ax.plot(x, p * (len(plot_data[var]) * np.diff(np.histogram(plot_data[var], bins=30)[1])[0]), 'k',
+ linewidth=2)
+
+ # Set individual titles and labels using provided arguments
ax.set_title(f'Histogram of {var}')
- ax.set_xlabel(var)
- ax.set_ylabel('Frequency')
+ ax.set_xlabel(xlab if xlab else var)
+ ax.set_ylabel(ylab)
# Remove any unused subplots in the matrix layout
for j in range(i + 1, len(axes)):
fig.delaxes(axes[j])
- plt.tight_layout()
-
+ plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout with space for the main title
+
# Show the plot if subplot is not specified
if subplot is None:
plt.show()
diff --git a/pregress/plots/plot_cook.py b/pregress/plots/plot_cook.py
index 79aa7b7..1388b9a 100644
--- a/pregress/plots/plot_cook.py
+++ b/pregress/plots/plot_cook.py
@@ -3,13 +3,17 @@
import numpy as np
from statsmodels.graphics.gofplots import ProbPlot
-def plot_cook(model, threshold=0.5, subplot=None):
+def plot_cook(model, threshold=0.5, main="Cook's Distance Plot", xlab="Observation Index", ylab="Cook's Distance", subplot=None):
"""
Plots Cook's Distance for each observation in a fitted statsmodels regression model to identify influential points.
Args:
model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model.
threshold (float, optional): The threshold for Cook's Distance to highlight influential points. Default is 0.5.
+ main (str, optional): Title for the plot.
+ xlab (str, optional): Label for the x-axis.
+ ylab (str, optional): Label for the y-axis.
+ subplot (tuple or None, optional): A tuple specifying the subplot grid (nrows, ncols, index) or None to create a new figure.
Returns:
None. Displays a plot of Cook's Distance for each observation.
@@ -18,20 +22,25 @@ def plot_cook(model, threshold=0.5, subplot=None):
influence = model.get_influence()
cooks_d = influence.cooks_distance[0]
+ # If a subplot is specified, create the subplot within the given grid; otherwise, create a new figure
+ if subplot:
+ plt.subplot(*subplot)
+ else:
+ plt.figure(figsize=(8, 6))
+
# Create the plot
- fig, ax = plt.subplots(figsize=(8, 6)) if subplot is None else subplot
+ ax = plt.gca() # Get the current axis (either from subplot or new figure)
ax.stem(np.arange(len(cooks_d)), cooks_d, markerfmt=",")
- ax.set_xlabel('Observation Index')
- ax.set_ylabel("Cook's Distance")
- ax.set_title("Cook's Distance Plot")
+ ax.set_xlabel(xlab)
+ ax.set_ylabel(ylab)
+ ax.set_title(main)
# Adding a reference line for the specified threshold
ax.axhline(y=threshold, linestyle='--', color='red', label=f'Influence threshold ({threshold})')
ax.legend()
- # Show the plot if subplot is not specified
+ # Show the plot only if no subplot is provided
if subplot is None:
plt.show()
plt.clf()
plt.close()
-
diff --git a/pregress/plots/plot_cor.py b/pregress/plots/plot_cor.py
index 30fb531..bc7e3e5 100644
--- a/pregress/plots/plot_cor.py
+++ b/pregress/plots/plot_cor.py
@@ -2,37 +2,70 @@
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
+from pregress.modeling.parse_formula import parse_formula
-def plot_cor(df, main='Correlation Matrix', subplot=None):
+
+def plot_cor(formula, data=None, main='Correlation Matrix', xlab='Variables', ylab='Variables', subplot=None, **kwargs):
"""
- Generates a heatmap for the correlation matrix of a DataFrame.
+ Generates a heatmap for the correlation matrix of a dataframe.
Args:
- df (pandas.DataFrame): The DataFrame for which to compute the correlation matrix.
+ formula (str or pandas.DataFrame): The formula or dataframe for which to compute the correlation matrix.
+ data (pandas.DataFrame, optional): The dataframe for formula evaluation if a formula is provided.
main (str, optional): Main title of the plot.
xlab (str, optional): Label for the x-axis.
ylab (str, optional): Label for the y-axis.
+ subplot (tuple, optional): Subplot grid position (nrows, ncols, index) in which to draw the heatmap. If None, a new figure is created.
+ kwargs: Additional keyword arguments passed through to sns.heatmap() (e.g., annot, cmap, square, vmax, vmin, linewidths).
Returns:
None. Displays the heatmap.
"""
+
+ if isinstance(formula, pd.DataFrame):
+ data = formula
+ formula = None
+
+ if formula is not None:
+ formula = formula + "+0"
+ Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
+ # Combine Y and X data for the correlation matrix
+ data = pd.concat([pd.Series(Y_out, name=Y_name), X_out], axis=1)
+
# Calculate the correlation matrix
- corr_matrix = df.corr()
+ corr_matrix = data.corr()
# Set the diagonal elements to NaN to make them white
np.fill_diagonal(corr_matrix.values, np.nan)
- # Create a custom colormap with black for NaN values
- cmap = sns.color_palette("coolwarm", as_cmap=True)
- cmap.set_bad(color='black')
+ # Set default values if not already provided in kwargs
+ kwargs.setdefault('annot', True)
+ kwargs.setdefault('square', True)
+ kwargs.setdefault('vmax', 1)
+ kwargs.setdefault('vmin', -1)
+ kwargs.setdefault('linewidths', 0.5)
+
+ # If cmap is not provided in kwargs, set a default cmap with NaN handling
+ if 'cmap' not in kwargs:
+ cmap = sns.color_palette("coolwarm", as_cmap=True)
+ cmap.set_bad(color='black') # Make NaN values appear in black
+ kwargs['cmap'] = cmap
+
+ # If a subplot is specified, use it; otherwise, create a new figure
+ if subplot:
+ plt.subplot(*subplot)
+ else:
+ plt.figure(figsize=(8, 6))
- # Draw the heatmap
- sns.heatmap(corr_matrix, annot=True, cmap=cmap, vmax=1, vmin=-1, square=True, linewidths=.5)
+ # Draw the heatmap with specified and default kwargs
+ sns.heatmap(corr_matrix, **kwargs)
- # Add main title
+ # Set main title, x-axis label, and y-axis label
plt.title(main, fontsize=18)
+ plt.xlabel(xlab)
+ plt.ylabel(ylab)
- # Rotate the tick labels for better readability
+ # Rotate the tick labels for readability
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
@@ -40,4 +73,4 @@ def plot_cor(df, main='Correlation Matrix', subplot=None):
if subplot is None:
plt.show()
plt.clf()
- plt.close()
+ plt.close()
\ No newline at end of file
diff --git a/pregress/plots/plot_res.py b/pregress/plots/plot_res.py
index dfd5e16..a01e95c 100644
--- a/pregress/plots/plot_res.py
+++ b/pregress/plots/plot_res.py
@@ -2,37 +2,45 @@
import numpy as np
import scipy.stats as stats
-def plot_res(model, subplot=None):
+
+def plot_res(model, main="Residual Plot", xlab="Fitted values", ylab="Residuals", subplot=None):
"""
Plots the residuals of a fitted statsmodels regression model.
Args:
model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model.
+ main (str, optional): Title for the plot.
+ xlab (str, optional): Label for the x-axis.
+ ylab (str, optional): Label for the y-axis.
+ subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). If None, a new figure is created.
Returns:
None. Displays a residual plot.
"""
-
+
# Calculate residuals
residuals = model.resid
# Calculate fitted values
fitted = model.predict()
+ # If a subplot is specified, create the subplot; otherwise, create a new figure
+ if subplot:
+ plt.subplot(*subplot)
+ else:
+ plt.figure()
+
# Create the residual plot
plt.scatter(fitted, residuals, color='blue')
plt.axhline(0, color='red', linestyle='--') # Adds a horizontal line at zero
- plt.xlabel('Fitted values')
- plt.ylabel('Residuals')
- plt.title('Residual Plot')
-
- # Show the plot if subplot is not specified
+
+ # Setting the title and labels using provided arguments
+ plt.xlabel(xlab)
+ plt.ylabel(ylab)
+ plt.title(main)
+
+ # Show the plot only if no subplot is provided
if subplot is None:
plt.show()
plt.clf()
plt.close()
-
-
-
-
-
diff --git a/pregress/plots/plots.py b/pregress/plots/plots.py
index 6dabd01..0648fa0 100644
--- a/pregress/plots/plots.py
+++ b/pregress/plots/plots.py
@@ -4,19 +4,21 @@
import matplotlib.pyplot as plt
import pandas as pd
-def plots(formula, data=None, xcolor="blue", ycolor="red", lines=False, linescolor = "black", subplot=None):
+def plots(formula, data=None, xcolor="blue", ycolor="red", lines=False, linescolor="black", main="Scatter Plot Matrix"):
"""
- Generates and prints a plot of each scatter plot corresponding to all X and Y values.
+ Generates and displays a scatter plot matrix corresponding to all X and Y values.
Args:
formula (str): Formula to define the model (dependent ~ independent).
data (DataFrame, optional): Data frame containing the data.
xcolor (str, optional): Color of the points in the scatter plot among both x variables.
ycolor (str, optional): Color of the points in the scatter plot including the y variable.
- lines (bool, optional): Whether or not to include the regression line in each plot.
+ lines (bool, optional): Whether to include the regression line in each plot.
+ linescolor (str, optional): Color of the regression lines.
+ main (str, optional): Main title of the scatter plot matrix.
Returns:
- None. The function creates and shows a plot.
+ None. The function creates and shows the plot.
"""
# Clear any existing plots
plt.clf()
@@ -39,6 +41,9 @@ def plots(formula, data=None, xcolor="blue", ycolor="red", lines=False, linescol
# Create the pairplot
pair_plot = sns.pairplot(plot_data, diag_kind="kde")
+ # Set main title
+ plt.suptitle(main, fontsize=18)
+
# Customizing scatter plot colors
for i in range(len(plot_data.columns)):
for j in range(len(plot_data.columns)):
@@ -72,14 +77,7 @@ def plots(formula, data=None, xcolor="blue", ycolor="red", lines=False, linescol
ax=pair_plot.axes[i, j], scatter_kws={'color': xcolor},
line_kws={'color': linescolor}, ci=None, truncate=False)
- # Show the plot if subplot is not specified
- if subplot is None:
- plt.show()
- plt.clf()
- plt.close()
-
-
-
-
-
-
+ # Display the plot
+ plt.show()
+ plt.clf()
+ plt.close()
diff --git a/setup.cfg b/setup.cfg
index fc04ed7..7f3e404 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
[metadata]
name = pregress
-version = 1.0.2
+version = 1.0.4
author = Daniel McGibney
author_email = dmcgibney@bus.miami.edu
description = Python Regression Analysis.
diff --git a/setup.py b/setup.py
index 3007461..352a7cc 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
setup(
name='pregress',
- version='1.0.2',
+ version='1.0.4',
packages=find_packages(include=['pregress', 'pregress.*']),
install_requires=[
'matplotlib', 'pandas', 'numpy', 'statsmodels', 'seaborn', 'scikit-learn',