Develop #12

Open · wants to merge 18 commits into main

Changes from all commits
Binary file modified .DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions .idea/.gitignore
7 changes: 7 additions & 0 deletions .idea/PRegress_Repo.iml
6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml
7 changes: 7 additions & 0 deletions .idea/misc.xml
6 changes: 6 additions & 0 deletions .idea/vcs.xml

61 changes: 33 additions & 28 deletions build/lib/pregress/modeling/box_cox.py
@@ -1,44 +1,49 @@
 import numpy as np
 import matplotlib.pyplot as plt
 from scipy.stats import boxcox, boxcox_llf
 import statsmodels.api as sm
 import statsmodels.formula.api as smf
 
 def box_cox(model):
     """
     Perform a Box-Cox transformation on the response variable of a given statsmodels regression results object,
     and output a plot of the log-likelihood as a function of lambda, the fitted lambda, and the 95% confidence interval.
 
     Args:
         model (statsmodels.regression.linear_model.RegressionResultsWrapper): A fitted statsmodels regression model.
     """
     # Extract the response variable
     y = model.model.endog
 
     # Perform the Box-Cox transformation
     if np.any(y <= 0):
         raise ValueError("All values in the response variable must be positive for Box-Cox transformation.")
 
     y_transformed, fitted_lambda = boxcox(y)
-    # Calculate the log-likelihood for different lambda values using boxcox_llf
-    lambdas = np.linspace(-2, 2, 100)
+
+    # Calculate lambdas from -3 to 3 for better CI accuracy
+    lambdas = np.linspace(-3, 3, 100)
     log_likelihood = [boxcox_llf(lmbda, y) for lmbda in lambdas]
 
-    # Calculate the 95% confidence interval
+    # Restrict the plotted range to lambdas from -2.1 to 2.1
+    plot_lambdas = lambdas[(lambdas >= -2.1) & (lambdas <= 2.1)]
+    plot_log_likelihood = [boxcox_llf(lmbda, y) for lmbda in plot_lambdas]
+
     max_log_likelihood = boxcox_llf(fitted_lambda, y)
     ci_cutoff = max_log_likelihood - 1.92  # Chi-squared distribution cutoff for 95% CI (1 degree of freedom)
     ci_lambdas = lambdas[np.array(log_likelihood) >= ci_cutoff]
 
-    lambda_lower = ci_lambdas[0]
-    lambda_upper = ci_lambdas[-1]
-
-    # Plot the log-likelihood as a function of lambda
     plt.figure(figsize=(10, 6))
-    plt.plot(lambdas, log_likelihood, label='Log-Likelihood')
-    plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Fitted Lambda: {fitted_lambda:.4f}')
-    plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}')
-    plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}')
+
+    # Plot the restricted range of the log-likelihood from -2.1 to 2.1
+    plt.plot(plot_lambdas, plot_log_likelihood, label='Log-Likelihood Function')
+
+    # Set xlim to focus on the typical range of -2 to 2
+    plt.xlim([-2, 2])
+
+    # Set ylim from the minimum log-likelihood to the maximum plus 5% headroom
+    plt.ylim([min(plot_log_likelihood), max(plot_log_likelihood) + 0.05 * (max(plot_log_likelihood) - min(plot_log_likelihood))])
+
+    if -2 <= fitted_lambda <= 2:
+        lambda_lower = ci_lambdas[0]
+        lambda_upper = ci_lambdas[-1]
+        plt.axvline(lambda_lower, color='b', linestyle='--', label=f'95% CI Lower: {lambda_lower:.4f}')
+        plt.axvline(fitted_lambda, color='r', linestyle='--', label=f'Best Lambda: {fitted_lambda:.4f}')
+        plt.axvline(lambda_upper, color='b', linestyle='--', label=f'95% CI Upper: {lambda_upper:.4f}')
+    else:
+        print(f"The fitted lambda is {fitted_lambda:.4f}, which is outside the typical range of -2 to 2. CI lines not plotted.")
 
     plt.xlabel('Lambda')
     plt.ylabel('Log-Likelihood')
-    plt.title('Box-Cox Transformation Log-Likelihood with 95% CI')
-    plt.legend(loc='upper left')
+    plt.title('Log-Likelihood for Box-Cox Transformation')
+    plt.legend(loc='lower right')
     plt.grid(True)
     plt.show()
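For context on the 1.92 cutoff above: inside a 95% likelihood-ratio interval the log-likelihood drops from its maximum by at most half the chi-squared critical value with 1 degree of freedom, i.e. 3.841 / 2 ≈ 1.92. Below is a minimal usage sketch of the revised function; the import path and the simulated data are assumptions for illustration, not taken from this PR.

    import numpy as np
    import statsmodels.api as sm
    from pregress.modeling.box_cox import box_cox  # assumed import path

    # Simulated strictly positive response (Box-Cox requires y > 0).
    rng = np.random.default_rng(42)
    x = rng.uniform(1, 10, size=200)
    # Log-linear ground truth, so the fitted lambda should land near 0.
    y = np.exp(0.5 + 0.3 * x + rng.normal(0, 0.2, size=200))

    X = sm.add_constant(x)
    ols_results = sm.OLS(y, X).fit()

    # Plots the Box-Cox log-likelihood curve with the fitted lambda and its 95% CI.
    box_cox(ols_results)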
75 changes: 50 additions & 25 deletions build/lib/pregress/modeling/fit.py
@@ -2,52 +2,77 @@
 import statsmodels.api as sm
 import pandas as pd
 
-def fit(formula, data=None, method = "ols", dummies = True):
+def fit(formula: str, data: pd.DataFrame = None, method: str = "ols", dummies: bool = True):
     """
     Fits a statistical model based on a specified formula and data.
 
     Parameters:
     - formula (str): A string representing the statistical formula (e.g., 'Y ~ X1 + X2 - X3').
     - data (DataFrame, optional): The dataset containing the variables specified in the formula.
     - method (str, optional): The method used for fitting the model. Defaults to 'ols' (Ordinary Least Squares).
-        Other methods can be implemented, such as logistic regression, random forest, etc.
+        Supported methods: 'ols' for linear regression, 'logistic' for logistic regression.
     - dummies (bool, optional): A boolean indicating whether to automatically create dummy variables for categorical
         predictors. Defaults to True.
 
     Returns:
     - model (statsmodels object): The fitted model object, which can be used for further analysis, such as
         making predictions or evaluating model performance.
 
     Raises:
     - ValueError: If the input data is empty or the specified variables are not found in the data.
+    - NotImplementedError: If an unsupported method is specified.
 
     Notes:
-    - The function currently supports OLS (Ordinary Least Squares) regression. Additional methods like logistic
-        regression, random forest, and k-nearest neighbors can be added as needed.
+    - The function currently supports OLS (Ordinary Least Squares) and logistic regression.
+        Additional methods like random forest or k-nearest neighbors could be added as needed.
     - The 'parse_formula' function is used to parse the formula and extract the response and predictor variables
         from the dataset.
     - If 'dummies' is set to True, categorical variables in the predictors are converted into dummy/indicator
-        variables, with the first category dropped to avoid multicollinearity. Additionally, binary variables
-        (True/False) are converted to numeric (0/1) values.
+        variables, with the first category dropped to avoid multicollinearity. Binary variables (True/False) are
+        converted to numeric (0/1) values.
     """
 
+    def process_dummies(X_out):
+        """Helper function to handle dummy variables and binary conversions."""
+        X_out = pd.get_dummies(X_out, drop_first=True)
+        # Convert binary variables (True/False) to numeric (0/1)
+        binary_columns = X_out.select_dtypes(include=['bool']).columns
+        X_out[binary_columns] = X_out[binary_columns].astype(int)
+        return X_out
+
+    def check_response_and_convert(Y_out):
+        """Convert a categorical response variable to dummies if necessary."""
+        if not pd.api.types.is_numeric_dtype(Y_out):
+            Y_out = pd.get_dummies(Y_out, drop_first=True)
+            if Y_out.shape[1] > 1:
+                raise ValueError("Response variable was converted to multiple columns, indicating it is multi-class. "
+                                 "This function currently supports binary response variables only.")
+        return Y_out
+
     Y_name, X_names, Y_out, X_out = parse_formula(formula, data)
 
-    if method.lower() == "ols":
-        if dummies:
-            X_out = pd.get_dummies(X_out, drop_first=True)
-
-            # Convert binary variables (True/False) to numeric (0/1)
-            binary_columns = X_out.select_dtypes(include=['bool']).columns
-            X_out[binary_columns] = X_out[binary_columns].astype(int)
-
-        if X_out.empty:
-            raise ValueError("The input data is empty or the specified variables are not found in the data.")
+    # Ensure Y_out is a Series and retains its name
+    if isinstance(Y_out, (pd.Series, pd.DataFrame)):
+        Y_out.name = Y_name  # Retain the response variable's name
+    else:
+        # Convert numpy array to pandas Series and set name
+        Y_out = pd.Series(Y_out, name=Y_name)
+
+    if X_out.empty:
+        raise ValueError("The input data is empty or the specified variables are not found in the data.")
+
+    if dummies:
+        X_out = process_dummies(X_out)
+
+    if method.lower() == "ols":
         model = sm.OLS(Y_out, X_out).fit()
-
-    # if method.lower() == "logistic":
-    # if method.lower() == "rf":
-    # if method.lower() == "knn":
+    elif method.lower() == "logistic":
+        # Process the response variable to ensure it is numeric or binary
+        Y_out = check_response_and_convert(Y_out)
+        model = sm.GLM(Y_out, X_out, family=sm.families.Binomial()).fit()
+    else:
+        raise NotImplementedError(f"Method '{method}' is not implemented. Supported methods: 'ols', 'logistic'.")
 
     return model
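To see the new dispatch end to end, here is a hedged usage sketch: the DataFrame and column names are invented for illustration, the import path is an assumption based on this file's location, and intercept handling is whatever parse_formula produces.

    import pandas as pd
    from pregress.modeling.fit import fit  # assumed import path

    df = pd.DataFrame({
        "Y":  [3.1, 4.0, 5.2, 6.1, 7.3, 8.0],
        "X1": [1, 2, 3, 4, 5, 6],
        "X2": ["a", "b", "a", "b", "a", "b"],  # categorical; dummies=True expands it to a 0/1 column
    })

    # OLS path: routed through sm.OLS(Y_out, X_out).fit()
    ols_model = fit("Y ~ X1 + X2", data=df, method="ols")
    print(ols_model.summary())

    # Logistic path: a binary response routed through sm.GLM with a Binomial family
    df["Z"] = (df["Y"] > 5).astype(int)
    logistic_model = fit("Z ~ X1 + X2", data=df, method="logistic")
    print(logistic_model.summary())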
