diff --git a/.DS_Store b/.DS_Store index 87ddc21..848990e 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..26d3352 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ +# Default ignored files +/shelf/ +/workspace.xml diff --git a/.idea/PRegress_Repo.iml b/.idea/PRegress_Repo.iml new file mode 100644 index 0000000..0070e87 --- /dev/null +++ b/.idea/PRegress_Repo.iml @@ -0,0 +1,7 @@ + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..41c2a23 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/build/lib/pregress/modeling/box_cox.py b/build/lib/pregress/modeling/box_cox.py index 5ce1bd5..c41788f 100644 --- a/build/lib/pregress/modeling/box_cox.py +++ b/build/lib/pregress/modeling/box_cox.py @@ -1,44 +1,49 @@ import numpy as np import matplotlib.pyplot as plt from scipy.stats import boxcox, boxcox_llf -import statsmodels.api as sm -import statsmodels.formula.api as smf def box_cox(model): - """ - Perform a Box-Cox transformation on the response variable of a given statsmodels regression results object, - output a plot of the log-likelihood as a function of lambda, the fitted lambda, and the 95% confidence interval. - - Args: - model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model. - """ - # Extract the response variable y = model.model.endog - - # Perform the Box-Cox transformation + if np.any(y <= 0): + raise ValueError("All values in the response variable must be positive for Box-Cox transformation.") + y_transformed, fitted_lambda = boxcox(y) - - # Calculate the log-likelihood for different lambda values using boxcox_llf - lambdas = np.linspace(-2, 2, 100) + + # Calculate lambdas from -3 to 3 for better CI accuracy + lambdas = np.linspace(-3, 3, 100) log_likelihood = [boxcox_llf(lmbda, y) for lmbda in lambdas] - - # Calculate the 95% confidence interval + + # Plot lambdas from -2.1 to 2.1 + plot_lambdas = lambdas[(lambdas >= -2.1) & (lambdas <= 2.1)] + plot_log_likelihood = [boxcox_llf(lmbda, y) for lmbda in plot_lambdas] + max_log_likelihood = boxcox_llf(fitted_lambda, y) ci_cutoff = max_log_likelihood - 1.92 # Chi-squared distribution cutoff for 95% CI (1 degree of freedom) ci_lambdas = lambdas[np.array(log_likelihood) >= ci_cutoff] - - lambda_lower = ci_lambdas[0] - lambda_upper = ci_lambdas[-1] - - # Plot the log-likelihood as a function of lambda + plt.figure(figsize=(10, 6)) - plt.plot(lambdas, log_likelihood, label='Log-Likelihood') - plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Fitted Lambda: {fitted_lambda:.4f}') - plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}') - plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}') + + # Plot the restricted range of log-likelihood from -2.1 to 2.1 + plt.plot(plot_lambdas, plot_log_likelihood, label='Log-Likelihood Function') + + # Set xlim to focus on the typical range of -2 to 2 + plt.xlim([-2, 2]) + + # Set ylim based exactly on the min and max log-likelihood without additional padding + plt.ylim([min(plot_log_likelihood), max(plot_log_likelihood)+.05* (max(plot_log_likelihood) - min(plot_log_likelihood))]) + + if -2 <= fitted_lambda <= 2: + lambda_lower = ci_lambdas[0] + lambda_upper = ci_lambdas[-1] + plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}') + plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Best Lambda: {fitted_lambda:.4f}') + plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}') + else: + print(f"The fitted_lambda is {fitted_lambda:.4f}, which is outside the typical range of -2 to 2. CI lines not plotted.") + plt.xlabel('Lambda') plt.ylabel('Log-Likelihood') - plt.title('Box-Cox Transformation Log-Likelihood with 95% CI') - plt.legend(loc='upper left') + plt.title('Log-Likelihood for Box-Cox Transformation') + plt.legend(loc='lower right') plt.grid(True) plt.show() diff --git a/build/lib/pregress/modeling/fit.py b/build/lib/pregress/modeling/fit.py index a6bc58d..03dde46 100644 --- a/build/lib/pregress/modeling/fit.py +++ b/build/lib/pregress/modeling/fit.py @@ -2,7 +2,7 @@ import statsmodels.api as sm import pandas as pd -def fit(formula, data=None, method = "ols", dummies = True): +def fit(formula: str, data: pd.DataFrame = None, method: str = "ols", dummies: bool = True): """ Fits a statistical model based on a specified formula and data. @@ -10,44 +10,69 @@ def fit(formula, data=None, method = "ols", dummies = True): - formula (str): A string representing the statistical formula (e.g., 'Y ~ X1 + X2 - X3'). - data (DataFrame, optional): The dataset containing the variables specified in the formula. - method (str, optional): The method used for fitting the model. Defaults to 'ols' (Ordinary Least Squares). - Other methods can be implemented, such as logistic regression, random forest, etc. + Supported methods: 'ols' for linear regression, 'logistic' for logistic regression. - dummies (bool, optional): A boolean indicating whether to automatically create dummy variables for categorical predictors. Defaults to True. Returns: - - model (statsmodels object): The fitted model object, which can be used for further analysis, such as + - model (statsmodels object): The fitted model object, which can be used for further analysis, such as making predictions or evaluating model performance. Raises: - ValueError: If the input data is empty or the specified variables are not found in the data. + - NotImplementedError: If an unsupported method is specified. Notes: - - The function currently supports OLS (Ordinary Least Squares) regression. Additional methods like logistic - regression, random forest, and k-nearest neighbors can be added as needed. - - The 'parse_formula' function is used to parse the formula and extract the response and predictor variables - from the dataset. - - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator - variables, with the first category dropped to avoid multicollinearity. Additionally, binary variables - (True/False) are converted to numeric (0/1) values. + - The function currently supports OLS (Ordinary Least Squares) and logistic regression. + Additional methods like random forest or k-nearest neighbors could be added as needed. + - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator + variables, with the first category dropped to avoid multicollinearity. Binary variables (True/False) are + converted to numeric (0/1) values. """ - + + def process_dummies(X_out): + """Helper function to handle dummy variables and binary conversions.""" + X_out = pd.get_dummies(X_out, drop_first=True) + + # Convert binary variables (True/False) to numeric (0/1) + binary_columns = X_out.select_dtypes(include=['bool']).columns + X_out[binary_columns] = X_out[binary_columns].astype(int) + return X_out + + def check_response_and_convert(Y_out): + """Convert categorical response variable to dummies if necessary.""" + if not pd.api.types.is_numeric_dtype(Y_out): + Y_out = pd.get_dummies(Y_out, drop_first=True) + if Y_out.shape[1] > 1: + raise ValueError("Response variable was converted to multiple columns, indicating it is multi-class. " + "This function currently supports binary response variables only.") + return Y_out + Y_name, X_names, Y_out, X_out = parse_formula(formula, data) - - if method.lower() == "ols": - if dummies: - - X_out = pd.get_dummies(X_out, drop_first=True) - - # Convert binary variables (True/False) to numeric (0/1) - binary_columns = X_out.select_dtypes(include=['bool']).columns - X_out[binary_columns] = X_out[binary_columns].astype(int) - if X_out.empty: - raise ValueError("The input data is empty or the specified variables are not found in the data.") + # Ensure Y_out is a Series and retains its name + if isinstance(Y_out, (pd.Series, pd.DataFrame)): + Y_out.name = Y_name # Retain the response variable's name + else: + # Convert numpy array to pandas Series and set name + Y_out = pd.Series(Y_out, name=Y_name) + + if X_out.empty: + raise ValueError("The input data is empty or the specified variables are not found in the data.") + if dummies: + X_out = process_dummies(X_out) + + if method.lower() == "ols": model = sm.OLS(Y_out, X_out).fit() -# if method.lower() == "logistic": -# if method.lower() == "rf": -# if method.lower() == "knn": + elif method.lower() == "logistic": + # Process the response variable to ensure it is numeric or binary + Y_out = check_response_and_convert(Y_out) + model = sm.GLM(Y_out, X_out, family=sm.families.Binomial()).fit() + + else: + raise NotImplementedError(f"Method '{method}' is not implemented. Supported methods: 'ols', 'logistic'.") + return model + diff --git a/build/lib/pregress/modeling/summary.py b/build/lib/pregress/modeling/summary.py index bd8453f..1965a0b 100644 --- a/build/lib/pregress/modeling/summary.py +++ b/build/lib/pregress/modeling/summary.py @@ -1,14 +1,15 @@ -from .format_summary import format_summary -from .print_r_summary import print_r_summary -from .print_anova_table import print_anova_table -from .print_stata_summary import print_stata_summary -from .significance_code import significance_code +from pregress.modeling.format_summary import format_summary +from pregress.modeling.print_r_summary import print_r_summary +from pregress.modeling.print_anova_table import print_anova_table +from pregress.modeling.print_stata_summary import print_stata_summary +from pregress.modeling.significance_code import significance_code import numpy as np import pandas as pd import statsmodels.api as sm import warnings from io import StringIO + def summary(model, out='simple', level=0.95): """ Generates and prints a summary of the regression model fit. The default summary is 'simple', @@ -33,52 +34,140 @@ def summary(model, out='simple', level=0.95): if out in ['statsmodels', 'stats']: print(model.summary(alpha=alpha)) return + + def print_model_type(model): + if isinstance(model, sm.regression.linear_model.RegressionResultsWrapper): + if isinstance(model.model, sm.OLS): + model_type = "ols" + elif isinstance(model.model, sm.Logit): + model_type = "logit" + elif isinstance(model.model, sm.GLM): + if isinstance(model.model.family, sm.families.Binomial): + model_type = "glm" + else: + model_type = "glm_nonlogit" + else: + model_type = "Other Regression Model" + else: + model_type = "Unsupported model type." + + return model_type + + def summary_ols(model, out='simple', level=0.95): + + alpha = round(1 - level, 5) # Ensure alpha is correctly formatted + + warnings.filterwarnings("ignore", message="kurtosistest only valid for n>=20") + warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ValueWarning) + results_summary = model.summary(alpha=alpha) + results_as_html = results_summary.tables[1].as_html() + summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0] + summary_df = format_summary(summary_df, alpha) + + p_values = model.pvalues + conf_intervals = model.conf_int(alpha=alpha) + r_squared = model.rsquared + adj_r_squared = model.rsquared_adj + f_statistic = model.fvalue + f_p_value = model.f_pvalue + log_likelihood = model.llf + aic = model.aic + bic = model.bic + RSS = np.sum(model.resid**2) + df = model.df_resid + RSE = np.sqrt(RSS / df) + n_obs = int(model.nobs) + df_model = model.df_model + df_resid = model.df_resid + mse_model = model.mse_model + mse_resid = model.mse_resid + + if out == 'r': + print_r_summary(model, summary_df, RSE, r_squared, adj_r_squared, f_statistic, f_p_value) + elif out == 'simple': + print("Summary of Regression Analysis:") + print("======================================================") + print("\nCoefficients:") + print("------------------------------------------------------") + print(summary_df) + print("\nModel Statistics:") + print("------------------------------------------------------") + print(f"R-squared: {r_squared:.4f} AIC: {aic:.4f}") + print(f"Adj. R-squared: {adj_r_squared:.4f} BIC: {bic:.4f}") + print(f"F-statistic: {f_statistic:.2f} on {int(df_model)} and {int(df_resid)} DF, p-value: {f_p_value:.6f}") + print("======================================================") + elif out in ['coefficients', 'coef']: + print(model.summary(alpha=alpha).tables[1]) + elif out == 'anova': + anova_table = print_anova_table(model) + print(anova_table) + elif out == 'stata': + print_stata_summary(model, summary_df, conf_intervals, level) + else: + raise ValueError("Unsupported summary type specified.") - warnings.filterwarnings("ignore", message="kurtosistest only valid for n>=20") - warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ValueWarning) - results_summary = model.summary(alpha=alpha) - results_as_html = results_summary.tables[1].as_html() - summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0] - summary_df = format_summary(summary_df, alpha) - - p_values = model.pvalues - conf_intervals = model.conf_int(alpha=alpha) - r_squared = model.rsquared - adj_r_squared = model.rsquared_adj - f_statistic = model.fvalue - f_p_value = model.f_pvalue - log_likelihood = model.llf - aic = model.aic - bic = model.bic - RSS = np.sum(model.resid**2) - df = model.df_resid - RSE = np.sqrt(RSS / df) - n_obs = int(model.nobs) - df_model = model.df_model - df_resid = model.df_resid - mse_model = model.mse_model - mse_resid = model.mse_resid - - if out == 'r': - print_r_summary(model, summary_df, RSE, r_squared, adj_r_squared, f_statistic, f_p_value) - elif out == 'simple': - print("Summary of Regression Analysis:") - print("======================================================") - print("\nCoefficients:") - print("------------------------------------------------------") - print(summary_df) - print("\nModel Statistics:") - print("------------------------------------------------------") - print(f"R-squared: {r_squared:.4f} AIC: {aic:.4f}") - print(f"Adj. R-squared: {adj_r_squared:.4f} BIC: {bic:.4f}") - print(f"F-statistic: {f_statistic:.2f} on {int(df_model)} and {int(df_resid)} DF, p-value: {f_p_value:.6f}") - print("======================================================") - elif out in ['coefficients', 'coef']: - print(model.summary(alpha=alpha).tables[1]) - elif out == 'anova': - anova_table = print_anova_table(model) - print(anova_table) - elif out == 'stata': - print_stata_summary(model, summary_df, conf_intervals, level) + def summary_logistic(model, out='simple', level=0.95): + """ + Generates a summary of a logistic regression model using statsmodels. + + Args: + model: The logistic regression model object from statsmodels. + out (str): Type of summary output. Options include 'simple', 'statsmodels', 'R', 'STATA', + 'coefficients', and 'ANOVA' (not typically applicable). + level (float): Confidence level for the confidence intervals. Default is 0.95. + + Returns: + Various types of summaries depending on the input type, both printed and returned. + """ + + # Suppress Warnings + warnings.filterwarnings("ignore", category=FutureWarning, + message="The bic value is computed using the deviance formula.*") + warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ConvergenceWarning) + + alpha = round(1 - level, 5) # Ensure alpha is correctly formatted + + results_summary = model.summary(alpha=alpha) + results_as_html = results_summary.tables[1].as_html() + summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0] + summary_df = format_summary(summary_df, alpha) + + conf_intervals = model.conf_int(alpha=alpha) + log_likelihood = model.llf + aic = model.aic + bic = model.bic + pseudo_r_squared = model.pseudo_rsquared(kind='cs') + n_obs = int(model.nobs) + + if out == 'r': + print("Not currently available for logistic regression.") + elif out == 'simple': + print("Summary of Logistic Regression Analysis:") + print("======================================================") + print("\nCoefficients (Odds Ratios):") + print("------------------------------------------------------") + print(summary_df) + print("\nModel Statistics:") + print("------------------------------------------------------") + print(f"Log-Likelihood: {log_likelihood:.4f} AIC: {aic:.4f}") + print(f"Pseudo R-squared: {pseudo_r_squared:.4f} BIC: {bic:.4f}") + print("======================================================") + elif out in ['coefficients', 'coef']: + print(results_summary.tables[1]) + elif out == 'anova': + print("ANOVA table not applicable for logistic regression.") + elif out == 'stata': + print("Not currently available for logistic regression.") + else: + raise ValueError("Unsupported summary type specified.") + + return results_summary + + model_type = print_model_type(model) + + if model_type == "ols": + summary_ols(model, out, level) + elif model_type == "glm": + summary_logistic(model, out, level) else: - raise ValueError("Unsupported summary type specified.") + raise ValueError("Unsupported model type.") diff --git a/build/lib/pregress/plots/barplot.py b/build/lib/pregress/plots/barplot.py index bd05dd4..e87f39d 100644 --- a/build/lib/pregress/plots/barplot.py +++ b/build/lib/pregress/plots/barplot.py @@ -22,6 +22,9 @@ def barplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Barplots Returns: None. The function creates and shows bar plots. """ + if isinstance(formula, pd.DataFrame): + data = formula + formula = None if formula is not None: formula = formula + "+0" Y_name, X_names, Y_out, X_out = parse_formula(formula, data) diff --git a/build/lib/pregress/plots/boxplot.py b/build/lib/pregress/plots/boxplot.py index f60f798..8e7f5ed 100644 --- a/build/lib/pregress/plots/boxplot.py +++ b/build/lib/pregress/plots/boxplot.py @@ -9,10 +9,10 @@ def boxplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Boxplots Generates and prints boxplots for all numeric variables specified in the formula or all numeric variables in the data if no formula is provided. Args: - formula (str, optional): Formula to define the model (dependent ~ independent). + formula (str, optional): Formula to define the model (Y ~ X). data (DataFrame, optional): Data frame containing the data. - xcolor (str, optional): Color of the boxplots for the independent variables. - ycolor (str, optional): Color of the boxplots for the dependent variable. + xcolor (str, optional): Color of the boxplots for the predictor variables. + ycolor (str, optional): Color of the boxplots for the response variable. main (str, optional): Title of the plot. xlab (str, optional): Label for the x-axis. ylab (str, optional): Label for the y-axis. @@ -67,4 +67,4 @@ def boxplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Boxplots if subplot is None: plt.show() plt.clf() - plt.close() \ No newline at end of file + plt.close() diff --git a/build/lib/pregress/plots/hist.py b/build/lib/pregress/plots/hist.py index ac8444b..1121639 100644 --- a/build/lib/pregress/plots/hist.py +++ b/build/lib/pregress/plots/hist.py @@ -3,8 +3,9 @@ import matplotlib.pyplot as plt import seaborn as sns from scipy.stats import norm as normal_dist +import inspect -def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, ylab="Frequency", subplot = None): +def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, ylab="Frequency", subplot=None): """ Generates and prints a histogram for a given vector. @@ -16,6 +17,7 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, main (str, optional): Title for the histogram. xlab (str, optional): Label for the x-axis. ylab (str, optional): Label for the y-axis. + subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). Returns: None. The function creates and shows the histogram. @@ -31,9 +33,11 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, xlab = [var_name for var_name, var_val in callers_local_vars if var_val is vector] xlab = xlab[0] if xlab else 'Variable' - # Clear any existing plots - plt.clf() - plt.close() + # If a subplot is specified, create a subplot within the grid + if subplot: + plt.subplot(*subplot) + else: + plt.figure() # Create the histogram sns.histplot(vector, bins=bins, kde=False, color=color, edgecolor='black') @@ -51,13 +55,6 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, plt.xlabel(xlab) plt.ylabel(ylab) - # Show the plot if subplot is not specified or if it is the last subplot + # Only show the plot if it's not part of a subplot if subplot is None: plt.show() - plt.clf() - plt.close() - - - - - diff --git a/build/lib/pregress/plots/hist_res.py b/build/lib/pregress/plots/hist_res.py index ad96b34..0393dee 100644 --- a/build/lib/pregress/plots/hist_res.py +++ b/build/lib/pregress/plots/hist_res.py @@ -1,23 +1,33 @@ -from .hist import hist from pregress.modeling.fit import fit import numpy as np import matplotlib.pyplot as plt from scipy import stats # Import for statistical functions -def hist_res(model, subplot=None): + +def hist_res(model, main="Histogram of Residuals", xlab="Residuals", ylab="Density", subplot=None): """ Plots a histogram of the residuals of a fitted statsmodels regression model and overlays a normal distribution curve. Args: model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model. + main (str, optional): Title for the histogram plot. + xlab (str, optional): Label for the x-axis. + ylab (str, optional): Label for the y-axis. + subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). If None, a new figure is created. Returns: None. Displays a histogram of residuals with a normal distribution curve. """ - + # Calculate residuals residuals = model.resid + # If a subplot is specified, create the subplot; otherwise, create a new figure + if subplot: + plt.subplot(*subplot) + else: + plt.figure() + # Plot histogram of the residuals plt.hist(residuals, bins=30, color='blue', alpha=0.7, density=True, label='Residuals Histogram') @@ -32,15 +42,13 @@ def hist_res(model, subplot=None): # Plot the normal distribution curve plt.plot(x, p, 'k', linewidth=2, label='Normal Distribution') - # Update the title - plt.title('Histogram of Residuals') - - # Add labels and move the legend to the upper left corner - plt.xlabel('Residuals') - plt.ylabel('Density') + # Set title and labels using the new arguments + plt.title(main) + plt.xlabel(xlab) + plt.ylabel(ylab) plt.legend(loc='upper left') - # Show the plot if subplot is not specified + # Show the plot only if no subplot is provided if subplot is None: plt.show() plt.clf() diff --git a/build/lib/pregress/plots/plot_cor.py b/build/lib/pregress/plots/plot_cor.py index 30fb531..7cb7a39 100644 --- a/build/lib/pregress/plots/plot_cor.py +++ b/build/lib/pregress/plots/plot_cor.py @@ -2,32 +2,54 @@ import numpy as np import matplotlib.pyplot as plt import seaborn as sns +from pregress.modeling.parse_formula import parse_formula -def plot_cor(df, main='Correlation Matrix', subplot=None): +def plot_cor(formula, data=None, main='Correlation Matrix', subplot=None, **kwargs): """ - Generates a heatmap for the correlation matrix of a DataFrame. + Generates a heatmap for the correlation matrix of a dataframe. Args: - df (pandas.DataFrame): The DataFrame for which to compute the correlation matrix. + formula (str or pandas.DataFrame): The formula or dataframe for which to compute the correlation matrix. + data (pandas.DataFrame, optional): The dataframe for formula evaluation if a formula is provided. main (str, optional): Main title of the plot. - xlab (str, optional): Label for the x-axis. - ylab (str, optional): Label for the y-axis. + subplot (optional): Subplot for embedding the heatmap. + kwargs: Additional keyword arguments for sns.heatmap() (e.g., annot, cmap, square, vmax, vmin, linewidths, etc.) Returns: None. Displays the heatmap. """ + + if isinstance(formula, pd.DataFrame): + data = formula + formula = None + + if formula is not None: + formula = formula + "+0" + Y_name, X_names, Y_out, X_out = parse_formula(formula, data) + # Combine Y and X data for the correlation matrix + data = pd.concat([pd.Series(Y_out, name=Y_name), X_out], axis=1) + # Calculate the correlation matrix - corr_matrix = df.corr() + corr_matrix = data.corr() # Set the diagonal elements to NaN to make them white np.fill_diagonal(corr_matrix.values, np.nan) - # Create a custom colormap with black for NaN values - cmap = sns.color_palette("coolwarm", as_cmap=True) - cmap.set_bad(color='black') + # Set default values if not already provided in kwargs + kwargs.setdefault('annot', True) + kwargs.setdefault('square', True) + kwargs.setdefault('vmax', 1) + kwargs.setdefault('vmin', -1) + kwargs.setdefault('linewidths', 0.5) + + # If cmap is not provided in kwargs, set a default cmap with NaN handling + if 'cmap' not in kwargs: + cmap = sns.color_palette("coolwarm", as_cmap=True) + cmap.set_bad(color='black') # Make NaN values appear in black + kwargs['cmap'] = cmap - # Draw the heatmap - sns.heatmap(corr_matrix, annot=True, cmap=cmap, vmax=1, vmin=-1, square=True, linewidths=.5) + # Draw the heatmap, passing in all kwargs dynamically + sns.heatmap(corr_matrix, **kwargs) # Add main title plt.title(main, fontsize=18) diff --git a/build/lib/pregress/plots/plot_res.py b/build/lib/pregress/plots/plot_res.py index dfd5e16..cc14ed3 100644 --- a/build/lib/pregress/plots/plot_res.py +++ b/build/lib/pregress/plots/plot_res.py @@ -2,37 +2,40 @@ import numpy as np import scipy.stats as stats + def plot_res(model, subplot=None): """ Plots the residuals of a fitted statsmodels regression model. Args: model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model. + subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). If None, a new figure is created. Returns: None. Displays a residual plot. """ - + # Calculate residuals residuals = model.resid # Calculate fitted values fitted = model.predict() + # If a subplot is specified, create the subplot; otherwise, create a new figure + if subplot: + plt.subplot(*subplot) + else: + plt.figure() + # Create the residual plot plt.scatter(fitted, residuals, color='blue') plt.axhline(0, color='red', linestyle='--') # Adds a horizontal line at zero plt.xlabel('Fitted values') plt.ylabel('Residuals') plt.title('Residual Plot') - - # Show the plot if subplot is not specified + + # Show the plot only if no subplot is provided if subplot is None: plt.show() plt.clf() plt.close() - - - - - diff --git a/dist/.DS_Store b/dist/.DS_Store deleted file mode 100644 index 5008ddf..0000000 Binary files a/dist/.DS_Store and /dev/null differ diff --git a/dist/pregress-1.0.2.tar.gz b/dist/pregress-1.0.2.tar.gz deleted file mode 100644 index 98c3c68..0000000 Binary files a/dist/pregress-1.0.2.tar.gz and /dev/null differ diff --git a/dist/pregress-1.0.2-py3-none-any.whl b/dist/pregress-1.0.4-py3-none-any.whl similarity index 90% rename from dist/pregress-1.0.2-py3-none-any.whl rename to dist/pregress-1.0.4-py3-none-any.whl index 15ce070..4cd2e17 100644 Binary files a/dist/pregress-1.0.2-py3-none-any.whl and b/dist/pregress-1.0.4-py3-none-any.whl differ diff --git a/dist/pregress-1.0.4.tar.gz b/dist/pregress-1.0.4.tar.gz new file mode 100644 index 0000000..486d551 Binary files /dev/null and b/dist/pregress-1.0.4.tar.gz differ diff --git a/pregress.egg-info/PKG-INFO b/pregress.egg-info/PKG-INFO index 7884716..60324fc 100644 --- a/pregress.egg-info/PKG-INFO +++ b/pregress.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: pregress -Version: 1.0.2 +Version: 1.0.4 Summary: Python Regression Analysis. Home-page: https://github.com/danmcgib/pregress Author: Daniel McGibney diff --git a/pregress/.DS_Store b/pregress/.DS_Store index 0738cae..309b31b 100644 Binary files a/pregress/.DS_Store and b/pregress/.DS_Store differ diff --git a/pregress/__pycache__/__init__.cpython-311.pyc b/pregress/__pycache__/__init__.cpython-311.pyc index 06e322c..f06eaa4 100644 Binary files a/pregress/__pycache__/__init__.cpython-311.pyc and b/pregress/__pycache__/__init__.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/__init__.cpython-311.pyc b/pregress/modeling/__pycache__/__init__.cpython-311.pyc index 1b4a4c4..758ed59 100644 Binary files a/pregress/modeling/__pycache__/__init__.cpython-311.pyc and b/pregress/modeling/__pycache__/__init__.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/add_explicit_variable.cpython-311.pyc b/pregress/modeling/__pycache__/add_explicit_variable.cpython-311.pyc index b2cee21..c3eefb8 100644 Binary files a/pregress/modeling/__pycache__/add_explicit_variable.cpython-311.pyc and b/pregress/modeling/__pycache__/add_explicit_variable.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/apply_transformation.cpython-311.pyc b/pregress/modeling/__pycache__/apply_transformation.cpython-311.pyc index 13e81d1..241aef0 100644 Binary files a/pregress/modeling/__pycache__/apply_transformation.cpython-311.pyc and b/pregress/modeling/__pycache__/apply_transformation.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/box_cox.cpython-310.pyc b/pregress/modeling/__pycache__/box_cox.cpython-310.pyc index 6c59550..f7e1601 100644 Binary files a/pregress/modeling/__pycache__/box_cox.cpython-310.pyc and b/pregress/modeling/__pycache__/box_cox.cpython-310.pyc differ diff --git a/pregress/modeling/__pycache__/box_cox.cpython-311.pyc b/pregress/modeling/__pycache__/box_cox.cpython-311.pyc new file mode 100644 index 0000000..dd14eec Binary files /dev/null and b/pregress/modeling/__pycache__/box_cox.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/bp_test.cpython-311.pyc b/pregress/modeling/__pycache__/bp_test.cpython-311.pyc new file mode 100644 index 0000000..0bdc902 Binary files /dev/null and b/pregress/modeling/__pycache__/bp_test.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/bsr.cpython-311.pyc b/pregress/modeling/__pycache__/bsr.cpython-311.pyc new file mode 100644 index 0000000..8bf6242 Binary files /dev/null and b/pregress/modeling/__pycache__/bsr.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/extract_variable.cpython-311.pyc b/pregress/modeling/__pycache__/extract_variable.cpython-311.pyc index ec0e1a7..7ff68f0 100644 Binary files a/pregress/modeling/__pycache__/extract_variable.cpython-311.pyc and b/pregress/modeling/__pycache__/extract_variable.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/fit.cpython-310.pyc b/pregress/modeling/__pycache__/fit.cpython-310.pyc index a579c79..39ef806 100644 Binary files a/pregress/modeling/__pycache__/fit.cpython-310.pyc and b/pregress/modeling/__pycache__/fit.cpython-310.pyc differ diff --git a/pregress/modeling/__pycache__/fit.cpython-311.pyc b/pregress/modeling/__pycache__/fit.cpython-311.pyc index 40f8a6d..b4a4470 100644 Binary files a/pregress/modeling/__pycache__/fit.cpython-311.pyc and b/pregress/modeling/__pycache__/fit.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/intervals.cpython-311.pyc b/pregress/modeling/__pycache__/intervals.cpython-311.pyc new file mode 100644 index 0000000..da1f725 Binary files /dev/null and b/pregress/modeling/__pycache__/intervals.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/parse_formula.cpython-311.pyc b/pregress/modeling/__pycache__/parse_formula.cpython-311.pyc index 3349b56..5189347 100644 Binary files a/pregress/modeling/__pycache__/parse_formula.cpython-311.pyc and b/pregress/modeling/__pycache__/parse_formula.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/predict.cpython-311.pyc b/pregress/modeling/__pycache__/predict.cpython-311.pyc index 70f00a5..f9f11d1 100644 Binary files a/pregress/modeling/__pycache__/predict.cpython-311.pyc and b/pregress/modeling/__pycache__/predict.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/shapiro_test.cpython-311.pyc b/pregress/modeling/__pycache__/shapiro_test.cpython-311.pyc new file mode 100644 index 0000000..3848bce Binary files /dev/null and b/pregress/modeling/__pycache__/shapiro_test.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/step.cpython-311.pyc b/pregress/modeling/__pycache__/step.cpython-311.pyc new file mode 100644 index 0000000..0fe6838 Binary files /dev/null and b/pregress/modeling/__pycache__/step.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/summary.cpython-310.pyc b/pregress/modeling/__pycache__/summary.cpython-310.pyc index ed4d0b7..44822f5 100644 Binary files a/pregress/modeling/__pycache__/summary.cpython-310.pyc and b/pregress/modeling/__pycache__/summary.cpython-310.pyc differ diff --git a/pregress/modeling/__pycache__/summary.cpython-311.pyc b/pregress/modeling/__pycache__/summary.cpython-311.pyc index 1daf53b..f8f9edd 100644 Binary files a/pregress/modeling/__pycache__/summary.cpython-311.pyc and b/pregress/modeling/__pycache__/summary.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/vif.cpython-311.pyc b/pregress/modeling/__pycache__/vif.cpython-311.pyc new file mode 100644 index 0000000..922cc40 Binary files /dev/null and b/pregress/modeling/__pycache__/vif.cpython-311.pyc differ diff --git a/pregress/modeling/__pycache__/xy_split.cpython-311.pyc b/pregress/modeling/__pycache__/xy_split.cpython-311.pyc new file mode 100644 index 0000000..6163e37 Binary files /dev/null and b/pregress/modeling/__pycache__/xy_split.cpython-311.pyc differ diff --git a/pregress/modeling/box_cox.py b/pregress/modeling/box_cox.py index 5ce1bd5..c41788f 100644 --- a/pregress/modeling/box_cox.py +++ b/pregress/modeling/box_cox.py @@ -1,44 +1,49 @@ import numpy as np import matplotlib.pyplot as plt from scipy.stats import boxcox, boxcox_llf -import statsmodels.api as sm -import statsmodels.formula.api as smf def box_cox(model): - """ - Perform a Box-Cox transformation on the response variable of a given statsmodels regression results object, - output a plot of the log-likelihood as a function of lambda, the fitted lambda, and the 95% confidence interval. - - Args: - model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model. - """ - # Extract the response variable y = model.model.endog - - # Perform the Box-Cox transformation + if np.any(y <= 0): + raise ValueError("All values in the response variable must be positive for Box-Cox transformation.") + y_transformed, fitted_lambda = boxcox(y) - - # Calculate the log-likelihood for different lambda values using boxcox_llf - lambdas = np.linspace(-2, 2, 100) + + # Calculate lambdas from -3 to 3 for better CI accuracy + lambdas = np.linspace(-3, 3, 100) log_likelihood = [boxcox_llf(lmbda, y) for lmbda in lambdas] - - # Calculate the 95% confidence interval + + # Plot lambdas from -2.1 to 2.1 + plot_lambdas = lambdas[(lambdas >= -2.1) & (lambdas <= 2.1)] + plot_log_likelihood = [boxcox_llf(lmbda, y) for lmbda in plot_lambdas] + max_log_likelihood = boxcox_llf(fitted_lambda, y) ci_cutoff = max_log_likelihood - 1.92 # Chi-squared distribution cutoff for 95% CI (1 degree of freedom) ci_lambdas = lambdas[np.array(log_likelihood) >= ci_cutoff] - - lambda_lower = ci_lambdas[0] - lambda_upper = ci_lambdas[-1] - - # Plot the log-likelihood as a function of lambda + plt.figure(figsize=(10, 6)) - plt.plot(lambdas, log_likelihood, label='Log-Likelihood') - plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Fitted Lambda: {fitted_lambda:.4f}') - plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}') - plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}') + + # Plot the restricted range of log-likelihood from -2.1 to 2.1 + plt.plot(plot_lambdas, plot_log_likelihood, label='Log-Likelihood Function') + + # Set xlim to focus on the typical range of -2 to 2 + plt.xlim([-2, 2]) + + # Set ylim based exactly on the min and max log-likelihood without additional padding + plt.ylim([min(plot_log_likelihood), max(plot_log_likelihood)+.05* (max(plot_log_likelihood) - min(plot_log_likelihood))]) + + if -2 <= fitted_lambda <= 2: + lambda_lower = ci_lambdas[0] + lambda_upper = ci_lambdas[-1] + plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}') + plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Best Lambda: {fitted_lambda:.4f}') + plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}') + else: + print(f"The fitted_lambda is {fitted_lambda:.4f}, which is outside the typical range of -2 to 2. CI lines not plotted.") + plt.xlabel('Lambda') plt.ylabel('Log-Likelihood') - plt.title('Box-Cox Transformation Log-Likelihood with 95% CI') - plt.legend(loc='upper left') + plt.title('Log-Likelihood for Box-Cox Transformation') + plt.legend(loc='lower right') plt.grid(True) plt.show() diff --git a/pregress/modeling/fit.py b/pregress/modeling/fit.py index a6bc58d..03dde46 100644 --- a/pregress/modeling/fit.py +++ b/pregress/modeling/fit.py @@ -2,7 +2,7 @@ import statsmodels.api as sm import pandas as pd -def fit(formula, data=None, method = "ols", dummies = True): +def fit(formula: str, data: pd.DataFrame = None, method: str = "ols", dummies: bool = True): """ Fits a statistical model based on a specified formula and data. @@ -10,44 +10,69 @@ def fit(formula, data=None, method = "ols", dummies = True): - formula (str): A string representing the statistical formula (e.g., 'Y ~ X1 + X2 - X3'). - data (DataFrame, optional): The dataset containing the variables specified in the formula. - method (str, optional): The method used for fitting the model. Defaults to 'ols' (Ordinary Least Squares). - Other methods can be implemented, such as logistic regression, random forest, etc. + Supported methods: 'ols' for linear regression, 'logistic' for logistic regression. - dummies (bool, optional): A boolean indicating whether to automatically create dummy variables for categorical predictors. Defaults to True. Returns: - - model (statsmodels object): The fitted model object, which can be used for further analysis, such as + - model (statsmodels object): The fitted model object, which can be used for further analysis, such as making predictions or evaluating model performance. Raises: - ValueError: If the input data is empty or the specified variables are not found in the data. + - NotImplementedError: If an unsupported method is specified. Notes: - - The function currently supports OLS (Ordinary Least Squares) regression. Additional methods like logistic - regression, random forest, and k-nearest neighbors can be added as needed. - - The 'parse_formula' function is used to parse the formula and extract the response and predictor variables - from the dataset. - - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator - variables, with the first category dropped to avoid multicollinearity. Additionally, binary variables - (True/False) are converted to numeric (0/1) values. + - The function currently supports OLS (Ordinary Least Squares) and logistic regression. + Additional methods like random forest or k-nearest neighbors could be added as needed. + - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator + variables, with the first category dropped to avoid multicollinearity. Binary variables (True/False) are + converted to numeric (0/1) values. """ - + + def process_dummies(X_out): + """Helper function to handle dummy variables and binary conversions.""" + X_out = pd.get_dummies(X_out, drop_first=True) + + # Convert binary variables (True/False) to numeric (0/1) + binary_columns = X_out.select_dtypes(include=['bool']).columns + X_out[binary_columns] = X_out[binary_columns].astype(int) + return X_out + + def check_response_and_convert(Y_out): + """Convert categorical response variable to dummies if necessary.""" + if not pd.api.types.is_numeric_dtype(Y_out): + Y_out = pd.get_dummies(Y_out, drop_first=True) + if Y_out.shape[1] > 1: + raise ValueError("Response variable was converted to multiple columns, indicating it is multi-class. " + "This function currently supports binary response variables only.") + return Y_out + Y_name, X_names, Y_out, X_out = parse_formula(formula, data) - - if method.lower() == "ols": - if dummies: - - X_out = pd.get_dummies(X_out, drop_first=True) - - # Convert binary variables (True/False) to numeric (0/1) - binary_columns = X_out.select_dtypes(include=['bool']).columns - X_out[binary_columns] = X_out[binary_columns].astype(int) - if X_out.empty: - raise ValueError("The input data is empty or the specified variables are not found in the data.") + # Ensure Y_out is a Series and retains its name + if isinstance(Y_out, (pd.Series, pd.DataFrame)): + Y_out.name = Y_name # Retain the response variable's name + else: + # Convert numpy array to pandas Series and set name + Y_out = pd.Series(Y_out, name=Y_name) + + if X_out.empty: + raise ValueError("The input data is empty or the specified variables are not found in the data.") + if dummies: + X_out = process_dummies(X_out) + + if method.lower() == "ols": model = sm.OLS(Y_out, X_out).fit() -# if method.lower() == "logistic": -# if method.lower() == "rf": -# if method.lower() == "knn": + elif method.lower() == "logistic": + # Process the response variable to ensure it is numeric or binary + Y_out = check_response_and_convert(Y_out) + model = sm.GLM(Y_out, X_out, family=sm.families.Binomial()).fit() + + else: + raise NotImplementedError(f"Method '{method}' is not implemented. Supported methods: 'ols', 'logistic'.") + return model + diff --git a/pregress/modeling/summary.py b/pregress/modeling/summary.py index bd8453f..1965a0b 100644 --- a/pregress/modeling/summary.py +++ b/pregress/modeling/summary.py @@ -1,14 +1,15 @@ -from .format_summary import format_summary -from .print_r_summary import print_r_summary -from .print_anova_table import print_anova_table -from .print_stata_summary import print_stata_summary -from .significance_code import significance_code +from pregress.modeling.format_summary import format_summary +from pregress.modeling.print_r_summary import print_r_summary +from pregress.modeling.print_anova_table import print_anova_table +from pregress.modeling.print_stata_summary import print_stata_summary +from pregress.modeling.significance_code import significance_code import numpy as np import pandas as pd import statsmodels.api as sm import warnings from io import StringIO + def summary(model, out='simple', level=0.95): """ Generates and prints a summary of the regression model fit. The default summary is 'simple', @@ -33,52 +34,140 @@ def summary(model, out='simple', level=0.95): if out in ['statsmodels', 'stats']: print(model.summary(alpha=alpha)) return + + def print_model_type(model): + if isinstance(model, sm.regression.linear_model.RegressionResultsWrapper): + if isinstance(model.model, sm.OLS): + model_type = "ols" + elif isinstance(model.model, sm.Logit): + model_type = "logit" + elif isinstance(model.model, sm.GLM): + if isinstance(model.model.family, sm.families.Binomial): + model_type = "glm" + else: + model_type = "glm_nonlogit" + else: + model_type = "Other Regression Model" + else: + model_type = "Unsupported model type." + + return model_type + + def summary_ols(model, out='simple', level=0.95): + + alpha = round(1 - level, 5) # Ensure alpha is correctly formatted + + warnings.filterwarnings("ignore", message="kurtosistest only valid for n>=20") + warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ValueWarning) + results_summary = model.summary(alpha=alpha) + results_as_html = results_summary.tables[1].as_html() + summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0] + summary_df = format_summary(summary_df, alpha) + + p_values = model.pvalues + conf_intervals = model.conf_int(alpha=alpha) + r_squared = model.rsquared + adj_r_squared = model.rsquared_adj + f_statistic = model.fvalue + f_p_value = model.f_pvalue + log_likelihood = model.llf + aic = model.aic + bic = model.bic + RSS = np.sum(model.resid**2) + df = model.df_resid + RSE = np.sqrt(RSS / df) + n_obs = int(model.nobs) + df_model = model.df_model + df_resid = model.df_resid + mse_model = model.mse_model + mse_resid = model.mse_resid + + if out == 'r': + print_r_summary(model, summary_df, RSE, r_squared, adj_r_squared, f_statistic, f_p_value) + elif out == 'simple': + print("Summary of Regression Analysis:") + print("======================================================") + print("\nCoefficients:") + print("------------------------------------------------------") + print(summary_df) + print("\nModel Statistics:") + print("------------------------------------------------------") + print(f"R-squared: {r_squared:.4f} AIC: {aic:.4f}") + print(f"Adj. R-squared: {adj_r_squared:.4f} BIC: {bic:.4f}") + print(f"F-statistic: {f_statistic:.2f} on {int(df_model)} and {int(df_resid)} DF, p-value: {f_p_value:.6f}") + print("======================================================") + elif out in ['coefficients', 'coef']: + print(model.summary(alpha=alpha).tables[1]) + elif out == 'anova': + anova_table = print_anova_table(model) + print(anova_table) + elif out == 'stata': + print_stata_summary(model, summary_df, conf_intervals, level) + else: + raise ValueError("Unsupported summary type specified.") - warnings.filterwarnings("ignore", message="kurtosistest only valid for n>=20") - warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ValueWarning) - results_summary = model.summary(alpha=alpha) - results_as_html = results_summary.tables[1].as_html() - summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0] - summary_df = format_summary(summary_df, alpha) - - p_values = model.pvalues - conf_intervals = model.conf_int(alpha=alpha) - r_squared = model.rsquared - adj_r_squared = model.rsquared_adj - f_statistic = model.fvalue - f_p_value = model.f_pvalue - log_likelihood = model.llf - aic = model.aic - bic = model.bic - RSS = np.sum(model.resid**2) - df = model.df_resid - RSE = np.sqrt(RSS / df) - n_obs = int(model.nobs) - df_model = model.df_model - df_resid = model.df_resid - mse_model = model.mse_model - mse_resid = model.mse_resid - - if out == 'r': - print_r_summary(model, summary_df, RSE, r_squared, adj_r_squared, f_statistic, f_p_value) - elif out == 'simple': - print("Summary of Regression Analysis:") - print("======================================================") - print("\nCoefficients:") - print("------------------------------------------------------") - print(summary_df) - print("\nModel Statistics:") - print("------------------------------------------------------") - print(f"R-squared: {r_squared:.4f} AIC: {aic:.4f}") - print(f"Adj. R-squared: {adj_r_squared:.4f} BIC: {bic:.4f}") - print(f"F-statistic: {f_statistic:.2f} on {int(df_model)} and {int(df_resid)} DF, p-value: {f_p_value:.6f}") - print("======================================================") - elif out in ['coefficients', 'coef']: - print(model.summary(alpha=alpha).tables[1]) - elif out == 'anova': - anova_table = print_anova_table(model) - print(anova_table) - elif out == 'stata': - print_stata_summary(model, summary_df, conf_intervals, level) + def summary_logistic(model, out='simple', level=0.95): + """ + Generates a summary of a logistic regression model using statsmodels. + + Args: + model: The logistic regression model object from statsmodels. + out (str): Type of summary output. Options include 'simple', 'statsmodels', 'R', 'STATA', + 'coefficients', and 'ANOVA' (not typically applicable). + level (float): Confidence level for the confidence intervals. Default is 0.95. + + Returns: + Various types of summaries depending on the input type, both printed and returned. + """ + + # Suppress Warnings + warnings.filterwarnings("ignore", category=FutureWarning, + message="The bic value is computed using the deviance formula.*") + warnings.filterwarnings("ignore", category=sm.tools.sm_exceptions.ConvergenceWarning) + + alpha = round(1 - level, 5) # Ensure alpha is correctly formatted + + results_summary = model.summary(alpha=alpha) + results_as_html = results_summary.tables[1].as_html() + summary_df = pd.read_html(StringIO(results_as_html), header=0, index_col=0)[0] + summary_df = format_summary(summary_df, alpha) + + conf_intervals = model.conf_int(alpha=alpha) + log_likelihood = model.llf + aic = model.aic + bic = model.bic + pseudo_r_squared = model.pseudo_rsquared(kind='cs') + n_obs = int(model.nobs) + + if out == 'r': + print("Not currently available for logistic regression.") + elif out == 'simple': + print("Summary of Logistic Regression Analysis:") + print("======================================================") + print("\nCoefficients (Odds Ratios):") + print("------------------------------------------------------") + print(summary_df) + print("\nModel Statistics:") + print("------------------------------------------------------") + print(f"Log-Likelihood: {log_likelihood:.4f} AIC: {aic:.4f}") + print(f"Pseudo R-squared: {pseudo_r_squared:.4f} BIC: {bic:.4f}") + print("======================================================") + elif out in ['coefficients', 'coef']: + print(results_summary.tables[1]) + elif out == 'anova': + print("ANOVA table not applicable for logistic regression.") + elif out == 'stata': + print("Not currently available for logistic regression.") + else: + raise ValueError("Unsupported summary type specified.") + + return results_summary + + model_type = print_model_type(model) + + if model_type == "ols": + summary_ols(model, out, level) + elif model_type == "glm": + summary_logistic(model, out, level) else: - raise ValueError("Unsupported summary type specified.") + raise ValueError("Unsupported model type.") diff --git a/pregress/plots/__pycache__/__init__.cpython-311.pyc b/pregress/plots/__pycache__/__init__.cpython-311.pyc index 84218b6..52226ce 100644 Binary files a/pregress/plots/__pycache__/__init__.cpython-311.pyc and b/pregress/plots/__pycache__/__init__.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/abline.cpython-311.pyc b/pregress/plots/__pycache__/abline.cpython-311.pyc new file mode 100644 index 0000000..9d3b53c Binary files /dev/null and b/pregress/plots/__pycache__/abline.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/barplot.cpython-310.pyc b/pregress/plots/__pycache__/barplot.cpython-310.pyc index 91341bb..20eda64 100644 Binary files a/pregress/plots/__pycache__/barplot.cpython-310.pyc and b/pregress/plots/__pycache__/barplot.cpython-310.pyc differ diff --git a/pregress/plots/__pycache__/barplot.cpython-311.pyc b/pregress/plots/__pycache__/barplot.cpython-311.pyc index f40b6f8..e6ef137 100644 Binary files a/pregress/plots/__pycache__/barplot.cpython-311.pyc and b/pregress/plots/__pycache__/barplot.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/boxplot.cpython-310.pyc b/pregress/plots/__pycache__/boxplot.cpython-310.pyc index 47f809a..a968b7d 100644 Binary files a/pregress/plots/__pycache__/boxplot.cpython-310.pyc and b/pregress/plots/__pycache__/boxplot.cpython-310.pyc differ diff --git a/pregress/plots/__pycache__/boxplot.cpython-311.pyc b/pregress/plots/__pycache__/boxplot.cpython-311.pyc index 84e26e0..c789cee 100644 Binary files a/pregress/plots/__pycache__/boxplot.cpython-311.pyc and b/pregress/plots/__pycache__/boxplot.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/hist.cpython-310.pyc b/pregress/plots/__pycache__/hist.cpython-310.pyc index 2d65345..6c5e6af 100644 Binary files a/pregress/plots/__pycache__/hist.cpython-310.pyc and b/pregress/plots/__pycache__/hist.cpython-310.pyc differ diff --git a/pregress/plots/__pycache__/hist.cpython-311.pyc b/pregress/plots/__pycache__/hist.cpython-311.pyc index ef71685..d45eae3 100644 Binary files a/pregress/plots/__pycache__/hist.cpython-311.pyc and b/pregress/plots/__pycache__/hist.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/hist_res.cpython-310.pyc b/pregress/plots/__pycache__/hist_res.cpython-310.pyc index 7c020fb..26014c9 100644 Binary files a/pregress/plots/__pycache__/hist_res.cpython-310.pyc and b/pregress/plots/__pycache__/hist_res.cpython-310.pyc differ diff --git a/pregress/plots/__pycache__/hist_res.cpython-311.pyc b/pregress/plots/__pycache__/hist_res.cpython-311.pyc new file mode 100644 index 0000000..6902b12 Binary files /dev/null and b/pregress/plots/__pycache__/hist_res.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/hists.cpython-311.pyc b/pregress/plots/__pycache__/hists.cpython-311.pyc index 8efc5ba..00d69c0 100644 Binary files a/pregress/plots/__pycache__/hists.cpython-311.pyc and b/pregress/plots/__pycache__/hists.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/plot_cook.cpython-311.pyc b/pregress/plots/__pycache__/plot_cook.cpython-311.pyc new file mode 100644 index 0000000..831805b Binary files /dev/null and b/pregress/plots/__pycache__/plot_cook.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/plot_cor.cpython-310.pyc b/pregress/plots/__pycache__/plot_cor.cpython-310.pyc index 3e9bf0d..105936b 100644 Binary files a/pregress/plots/__pycache__/plot_cor.cpython-310.pyc and b/pregress/plots/__pycache__/plot_cor.cpython-310.pyc differ diff --git a/pregress/plots/__pycache__/plot_cor.cpython-311.pyc b/pregress/plots/__pycache__/plot_cor.cpython-311.pyc index b9828a8..935b96f 100644 Binary files a/pregress/plots/__pycache__/plot_cor.cpython-311.pyc and b/pregress/plots/__pycache__/plot_cor.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/plot_intervals.cpython-311.pyc b/pregress/plots/__pycache__/plot_intervals.cpython-311.pyc new file mode 100644 index 0000000..b9b2020 Binary files /dev/null and b/pregress/plots/__pycache__/plot_intervals.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/plot_qq.cpython-311.pyc b/pregress/plots/__pycache__/plot_qq.cpython-311.pyc new file mode 100644 index 0000000..7d9e765 Binary files /dev/null and b/pregress/plots/__pycache__/plot_qq.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/plot_res.cpython-310.pyc b/pregress/plots/__pycache__/plot_res.cpython-310.pyc index 58c9355..3a26610 100644 Binary files a/pregress/plots/__pycache__/plot_res.cpython-310.pyc and b/pregress/plots/__pycache__/plot_res.cpython-310.pyc differ diff --git a/pregress/plots/__pycache__/plot_res.cpython-311.pyc b/pregress/plots/__pycache__/plot_res.cpython-311.pyc new file mode 100644 index 0000000..7d72229 Binary files /dev/null and b/pregress/plots/__pycache__/plot_res.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/plot_xy.cpython-311.pyc b/pregress/plots/__pycache__/plot_xy.cpython-311.pyc new file mode 100644 index 0000000..5af922e Binary files /dev/null and b/pregress/plots/__pycache__/plot_xy.cpython-311.pyc differ diff --git a/pregress/plots/__pycache__/plots.cpython-311.pyc b/pregress/plots/__pycache__/plots.cpython-311.pyc index 0d79ef1..a28c279 100644 Binary files a/pregress/plots/__pycache__/plots.cpython-311.pyc and b/pregress/plots/__pycache__/plots.cpython-311.pyc differ diff --git a/pregress/plots/barplot.py b/pregress/plots/barplot.py index bd05dd4..e87f39d 100644 --- a/pregress/plots/barplot.py +++ b/pregress/plots/barplot.py @@ -22,6 +22,9 @@ def barplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Barplots Returns: None. The function creates and shows bar plots. """ + if isinstance(formula, pd.DataFrame): + data = formula + formula = None if formula is not None: formula = formula + "+0" Y_name, X_names, Y_out, X_out = parse_formula(formula, data) diff --git a/pregress/plots/boxplot.py b/pregress/plots/boxplot.py index f60f798..8e7f5ed 100644 --- a/pregress/plots/boxplot.py +++ b/pregress/plots/boxplot.py @@ -9,10 +9,10 @@ def boxplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Boxplots Generates and prints boxplots for all numeric variables specified in the formula or all numeric variables in the data if no formula is provided. Args: - formula (str, optional): Formula to define the model (dependent ~ independent). + formula (str, optional): Formula to define the model (Y ~ X). data (DataFrame, optional): Data frame containing the data. - xcolor (str, optional): Color of the boxplots for the independent variables. - ycolor (str, optional): Color of the boxplots for the dependent variable. + xcolor (str, optional): Color of the boxplots for the predictor variables. + ycolor (str, optional): Color of the boxplots for the response variable. main (str, optional): Title of the plot. xlab (str, optional): Label for the x-axis. ylab (str, optional): Label for the y-axis. @@ -67,4 +67,4 @@ def boxplot(formula=None, data=None, xcolor="blue", ycolor="red", main="Boxplots if subplot is None: plt.show() plt.clf() - plt.close() \ No newline at end of file + plt.close() diff --git a/pregress/plots/hist.py b/pregress/plots/hist.py index ac8444b..1121639 100644 --- a/pregress/plots/hist.py +++ b/pregress/plots/hist.py @@ -3,8 +3,9 @@ import matplotlib.pyplot as plt import seaborn as sns from scipy.stats import norm as normal_dist +import inspect -def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, ylab="Frequency", subplot = None): +def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, ylab="Frequency", subplot=None): """ Generates and prints a histogram for a given vector. @@ -16,6 +17,7 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, main (str, optional): Title for the histogram. xlab (str, optional): Label for the x-axis. ylab (str, optional): Label for the y-axis. + subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). Returns: None. The function creates and shows the histogram. @@ -31,9 +33,11 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, xlab = [var_name for var_name, var_val in callers_local_vars if var_val is vector] xlab = xlab[0] if xlab else 'Variable' - # Clear any existing plots - plt.clf() - plt.close() + # If a subplot is specified, create a subplot within the grid + if subplot: + plt.subplot(*subplot) + else: + plt.figure() # Create the histogram sns.histplot(vector, bins=bins, kde=False, color=color, edgecolor='black') @@ -51,13 +55,6 @@ def hist(vector, bins=30, color="blue", norm=False, main="Histogram", xlab=None, plt.xlabel(xlab) plt.ylabel(ylab) - # Show the plot if subplot is not specified or if it is the last subplot + # Only show the plot if it's not part of a subplot if subplot is None: plt.show() - plt.clf() - plt.close() - - - - - diff --git a/pregress/plots/hist_res.py b/pregress/plots/hist_res.py index ad96b34..0393dee 100644 --- a/pregress/plots/hist_res.py +++ b/pregress/plots/hist_res.py @@ -1,23 +1,33 @@ -from .hist import hist from pregress.modeling.fit import fit import numpy as np import matplotlib.pyplot as plt from scipy import stats # Import for statistical functions -def hist_res(model, subplot=None): + +def hist_res(model, main="Histogram of Residuals", xlab="Residuals", ylab="Density", subplot=None): """ Plots a histogram of the residuals of a fitted statsmodels regression model and overlays a normal distribution curve. Args: model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model. + main (str, optional): Title for the histogram plot. + xlab (str, optional): Label for the x-axis. + ylab (str, optional): Label for the y-axis. + subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). If None, a new figure is created. Returns: None. Displays a histogram of residuals with a normal distribution curve. """ - + # Calculate residuals residuals = model.resid + # If a subplot is specified, create the subplot; otherwise, create a new figure + if subplot: + plt.subplot(*subplot) + else: + plt.figure() + # Plot histogram of the residuals plt.hist(residuals, bins=30, color='blue', alpha=0.7, density=True, label='Residuals Histogram') @@ -32,15 +42,13 @@ def hist_res(model, subplot=None): # Plot the normal distribution curve plt.plot(x, p, 'k', linewidth=2, label='Normal Distribution') - # Update the title - plt.title('Histogram of Residuals') - - # Add labels and move the legend to the upper left corner - plt.xlabel('Residuals') - plt.ylabel('Density') + # Set title and labels using the new arguments + plt.title(main) + plt.xlabel(xlab) + plt.ylabel(ylab) plt.legend(loc='upper left') - # Show the plot if subplot is not specified + # Show the plot only if no subplot is provided if subplot is None: plt.show() plt.clf() diff --git a/pregress/plots/hists.py b/pregress/plots/hists.py index cf08adb..2ab2d88 100644 --- a/pregress/plots/hists.py +++ b/pregress/plots/hists.py @@ -6,27 +6,42 @@ from scipy.stats import norm as normal_dist import warnings -def hists(formula, data=None, bins=30, xcolor="blue", ycolor="red", norm=False, layout="matrix", subplot=None): + +def hists(input_data=None, data=None, bins=30, xcolor="blue", ycolor="red", norm=False, layout="matrix", + main="Distribution of Variables", xlab=None, ylab="Frequency", subplot=None): """ - Generates and prints histograms for all numeric variables specified in the formula. + Generates and prints histograms for all numeric variables specified in the formula or all numeric variables in the DataFrame. Args: - formula (str): Formula to define the model (dependent ~ independent). - data (DataFrame, optional): Data frame containing the data. + input_data (str or DataFrame): Formula to define the model (dependent ~ independent), a single column name, or a DataFrame containing the data. + data (DataFrame, optional): Data frame containing the data if a formula is provided. + bins (int, optional): Number of bins for the histograms. xcolor (str, optional): Color of the histograms for the independent variables. ycolor (str, optional): Color of the histograms for the dependent variable. norm (bool, optional): Whether to include a normal distribution line. layout (str, optional): Layout of the histograms - "column", "row", or "matrix". + main (str, optional): Main title for the plot. + xlab (str, optional): Label for the x-axis. Defaults to each variable name if not provided. + ylab (str, optional): Label for the y-axis. + subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). Returns: None. The function creates and shows histograms. """ - formula = formula + "+0" - Y_name, X_names, Y_out, X_out = parse_formula(formula, data) - - # Combine Y and X data for histograms - plot_data = pd.concat([pd.Series(Y_out, name=Y_name), X_out], axis=1) + # Case 1: Handle single variable input without "~" + if isinstance(input_data, str) and '~' not in input_data: + plot_data = pd.DataFrame({input_data: data[input_data]}) + Y_name = None + # Case 2: Directly given DataFrame + elif isinstance(input_data, pd.DataFrame): + plot_data = input_data.select_dtypes(include=[np.number]) + Y_name = None + # Case 3: Formula provided + else: + formula = input_data + "+0" + Y_name, X_names, Y_out, X_out = parse_formula(formula, data) + plot_data = pd.concat([pd.Series(Y_out, name=Y_name), X_out], axis=1) # Replace infinite values with NaN plot_data.replace([np.inf, -np.inf], np.nan, inplace=True) @@ -47,6 +62,8 @@ def hists(formula, data=None, bins=30, xcolor="blue", ycolor="red", norm=False, fig, axes = plt.subplots(nrows, ncols, figsize=(5 * ncols, 5 * nrows)) axes = np.array(axes).reshape(-1) # Flatten the axes array for easy iteration + fig.suptitle(main) # Set the main title for the entire figure + with warnings.catch_warnings(): warnings.simplefilter("ignore", category=FutureWarning) @@ -54,22 +71,26 @@ def hists(formula, data=None, bins=30, xcolor="blue", ycolor="red", norm=False, ax = axes[i] color = ycolor if var == Y_name else xcolor sns.histplot(plot_data[var], bins=bins, kde=False, color=color, ax=ax, edgecolor='black') + if norm: mean = plot_data[var].mean() std = plot_data[var].std() x = np.linspace(plot_data[var].min(), plot_data[var].max(), 100) p = normal_dist.pdf(x, mean, std) - ax.plot(x, p * (len(plot_data[var]) * np.diff(np.histogram(plot_data[var], bins=30)[1])[0]), 'k', linewidth=2) + ax.plot(x, p * (len(plot_data[var]) * np.diff(np.histogram(plot_data[var], bins=30)[1])[0]), 'k', + linewidth=2) + + # Set individual titles and labels using provided arguments ax.set_title(f'Histogram of {var}') - ax.set_xlabel(var) - ax.set_ylabel('Frequency') + ax.set_xlabel(xlab if xlab else var) + ax.set_ylabel(ylab) # Remove any unused subplots in the matrix layout for j in range(i + 1, len(axes)): fig.delaxes(axes[j]) - plt.tight_layout() - + plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust layout with space for the main title + # Show the plot if subplot is not specified if subplot is None: plt.show() diff --git a/pregress/plots/plot_cook.py b/pregress/plots/plot_cook.py index 79aa7b7..1388b9a 100644 --- a/pregress/plots/plot_cook.py +++ b/pregress/plots/plot_cook.py @@ -3,13 +3,17 @@ import numpy as np from statsmodels.graphics.gofplots import ProbPlot -def plot_cook(model, threshold=0.5, subplot=None): +def plot_cook(model, threshold=0.5, main="Cook's Distance Plot", xlab="Observation Index", ylab="Cook's Distance", subplot=None): """ Plots Cook's Distance for each observation in a fitted statsmodels regression model to identify influential points. Args: model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model. threshold (float, optional): The threshold for Cook's Distance to highlight influential points. Default is 0.5. + main (str, optional): Title for the plot. + xlab (str, optional): Label for the x-axis. + ylab (str, optional): Label for the y-axis. + subplot (tuple or None, optional): A tuple specifying the subplot grid (nrows, ncols, index) or None to create a new figure. Returns: None. Displays a plot of Cook's Distance for each observation. @@ -18,20 +22,25 @@ def plot_cook(model, threshold=0.5, subplot=None): influence = model.get_influence() cooks_d = influence.cooks_distance[0] + # If a subplot is specified, create the subplot within the given grid; otherwise, create a new figure + if subplot: + plt.subplot(*subplot) + else: + plt.figure(figsize=(8, 6)) + # Create the plot - fig, ax = plt.subplots(figsize=(8, 6)) if subplot is None else subplot + ax = plt.gca() # Get the current axis (either from subplot or new figure) ax.stem(np.arange(len(cooks_d)), cooks_d, markerfmt=",") - ax.set_xlabel('Observation Index') - ax.set_ylabel("Cook's Distance") - ax.set_title("Cook's Distance Plot") + ax.set_xlabel(xlab) + ax.set_ylabel(ylab) + ax.set_title(main) # Adding a reference line for the specified threshold ax.axhline(y=threshold, linestyle='--', color='red', label=f'Influence threshold ({threshold})') ax.legend() - # Show the plot if subplot is not specified + # Show the plot only if no subplot is provided if subplot is None: plt.show() plt.clf() plt.close() - diff --git a/pregress/plots/plot_cor.py b/pregress/plots/plot_cor.py index 30fb531..bc7e3e5 100644 --- a/pregress/plots/plot_cor.py +++ b/pregress/plots/plot_cor.py @@ -2,37 +2,70 @@ import numpy as np import matplotlib.pyplot as plt import seaborn as sns +from pregress.modeling.parse_formula import parse_formula -def plot_cor(df, main='Correlation Matrix', subplot=None): + +def plot_cor(formula, data=None, main='Correlation Matrix', xlab='Variables', ylab='Variables', subplot=None, **kwargs): """ - Generates a heatmap for the correlation matrix of a DataFrame. + Generates a heatmap for the correlation matrix of a dataframe. Args: - df (pandas.DataFrame): The DataFrame for which to compute the correlation matrix. + formula (str or pandas.DataFrame): The formula or dataframe for which to compute the correlation matrix. + data (pandas.DataFrame, optional): The dataframe for formula evaluation if a formula is provided. main (str, optional): Main title of the plot. xlab (str, optional): Label for the x-axis. ylab (str, optional): Label for the y-axis. + subplot (tuple, optional): Subplot for embedding the heatmap (nrows, ncols, index). + kwargs: Additional keyword arguments for sns.heatmap() (e.g., annot, cmap, square, vmax, vmin, linewidths, etc.) Returns: None. Displays the heatmap. """ + + if isinstance(formula, pd.DataFrame): + data = formula + formula = None + + if formula is not None: + formula = formula + "+0" + Y_name, X_names, Y_out, X_out = parse_formula(formula, data) + # Combine Y and X data for the correlation matrix + data = pd.concat([pd.Series(Y_out, name=Y_name), X_out], axis=1) + # Calculate the correlation matrix - corr_matrix = df.corr() + corr_matrix = data.corr() # Set the diagonal elements to NaN to make them white np.fill_diagonal(corr_matrix.values, np.nan) - # Create a custom colormap with black for NaN values - cmap = sns.color_palette("coolwarm", as_cmap=True) - cmap.set_bad(color='black') + # Set default values if not already provided in kwargs + kwargs.setdefault('annot', True) + kwargs.setdefault('square', True) + kwargs.setdefault('vmax', 1) + kwargs.setdefault('vmin', -1) + kwargs.setdefault('linewidths', 0.5) + + # If cmap is not provided in kwargs, set a default cmap with NaN handling + if 'cmap' not in kwargs: + cmap = sns.color_palette("coolwarm", as_cmap=True) + cmap.set_bad(color='black') # Make NaN values appear in black + kwargs['cmap'] = cmap + + # If a subplot is specified, use it; otherwise, create a new figure + if subplot: + plt.subplot(*subplot) + else: + plt.figure(figsize=(8, 6)) - # Draw the heatmap - sns.heatmap(corr_matrix, annot=True, cmap=cmap, vmax=1, vmin=-1, square=True, linewidths=.5) + # Draw the heatmap with specified and default kwargs + sns.heatmap(corr_matrix, **kwargs) - # Add main title + # Set main title, x-axis label, and y-axis label plt.title(main, fontsize=18) + plt.xlabel(xlab) + plt.ylabel(ylab) - # Rotate the tick labels for better readability + # Rotate the tick labels for readability plt.xticks(rotation=45, ha='right') plt.yticks(rotation=0) @@ -40,4 +73,4 @@ def plot_cor(df, main='Correlation Matrix', subplot=None): if subplot is None: plt.show() plt.clf() - plt.close() + plt.close() \ No newline at end of file diff --git a/pregress/plots/plot_res.py b/pregress/plots/plot_res.py index dfd5e16..a01e95c 100644 --- a/pregress/plots/plot_res.py +++ b/pregress/plots/plot_res.py @@ -2,37 +2,45 @@ import numpy as np import scipy.stats as stats -def plot_res(model, subplot=None): + +def plot_res(model, main="Residual Plot", xlab="Fitted values", ylab="Residuals", subplot=None): """ Plots the residuals of a fitted statsmodels regression model. Args: model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model. + main (str, optional): Title for the plot. + xlab (str, optional): Label for the x-axis. + ylab (str, optional): Label for the y-axis. + subplot (tuple, optional): A tuple specifying the subplot grid (nrows, ncols, index). If None, a new figure is created. Returns: None. Displays a residual plot. """ - + # Calculate residuals residuals = model.resid # Calculate fitted values fitted = model.predict() + # If a subplot is specified, create the subplot; otherwise, create a new figure + if subplot: + plt.subplot(*subplot) + else: + plt.figure() + # Create the residual plot plt.scatter(fitted, residuals, color='blue') plt.axhline(0, color='red', linestyle='--') # Adds a horizontal line at zero - plt.xlabel('Fitted values') - plt.ylabel('Residuals') - plt.title('Residual Plot') - - # Show the plot if subplot is not specified + + # Setting the title and labels using provided arguments + plt.xlabel(xlab) + plt.ylabel(ylab) + plt.title(main) + + # Show the plot only if no subplot is provided if subplot is None: plt.show() plt.clf() plt.close() - - - - - diff --git a/pregress/plots/plots.py b/pregress/plots/plots.py index 6dabd01..0648fa0 100644 --- a/pregress/plots/plots.py +++ b/pregress/plots/plots.py @@ -4,19 +4,21 @@ import matplotlib.pyplot as plt import pandas as pd -def plots(formula, data=None, xcolor="blue", ycolor="red", lines=False, linescolor = "black", subplot=None): +def plots(formula, data=None, xcolor="blue", ycolor="red", lines=False, linescolor="black", main="Scatter Plot Matrix"): """ - Generates and prints a plot of each scatter plot corresponding to all X and Y values. + Generates and displays a scatter plot matrix corresponding to all X and Y values. Args: formula (str): Formula to define the model (dependent ~ independent). data (DataFrame, optional): Data frame containing the data. xcolor (str, optional): Color of the points in the scatter plot among both x variables. ycolor (str, optional): Color of the points in the scatter plot including the y variable. - lines (bool, optional): Whether or not to include the regression line in each plot. + lines (bool, optional): Whether to include the regression line in each plot. + linescolor (str, optional): Color of the regression lines. + main (str, optional): Main title of the scatter plot matrix. Returns: - None. The function creates and shows a plot. + None. The function creates and shows the plot. """ # Clear any existing plots plt.clf() @@ -39,6 +41,9 @@ def plots(formula, data=None, xcolor="blue", ycolor="red", lines=False, linescol # Create the pairplot pair_plot = sns.pairplot(plot_data, diag_kind="kde") + # Set main title + plt.suptitle(main, fontsize=18) + # Customizing scatter plot colors for i in range(len(plot_data.columns)): for j in range(len(plot_data.columns)): @@ -72,14 +77,7 @@ def plots(formula, data=None, xcolor="blue", ycolor="red", lines=False, linescol ax=pair_plot.axes[i, j], scatter_kws={'color': xcolor}, line_kws={'color': linescolor}, ci=None, truncate=False) - # Show the plot if subplot is not specified - if subplot is None: - plt.show() - plt.clf() - plt.close() - - - - - - + # Display the plot + plt.show() + plt.clf() + plt.close() diff --git a/setup.cfg b/setup.cfg index fc04ed7..7f3e404 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = pregress -version = 1.0.2 +version = 1.0.4 author = Daniel McGibney author_email = dmcgibney@bus.miami.edu description = Python Regression Analysis. diff --git a/setup.py b/setup.py index 3007461..352a7cc 100644 --- a/setup.py +++ b/setup.py @@ -11,7 +11,7 @@ setup( name='pregress', - version='1.0.2', + version='1.0.4', packages=find_packages(include=['pregress', 'pregress.*']), install_requires=[ 'matplotlib', 'pandas', 'numpy', 'statsmodels', 'seaborn', 'scikit-learn',