From 5e6a97ae752d0441dbcbc6caf7cc9122860aa11d Mon Sep 17 00:00:00 2001
From: armaganngul
Date: Sun, 1 Dec 2024 14:43:43 -0500
Subject: [PATCH] Refactored some ml_model stuff

---
 backend/ml_model/repository/demo2.py          | 90 -------------------
 .../ml_model/repository/safe_grid_search.py   |  0
 backend/ml_model/repository/safe_split.py     | 46 ++++++++++
 .../ml_model/repository/safe_train_grid.py    | 32 -------
 4 files changed, 46 insertions(+), 122 deletions(-)
 delete mode 100644 backend/ml_model/repository/demo2.py
 create mode 100644 backend/ml_model/repository/safe_grid_search.py
 create mode 100644 backend/ml_model/repository/safe_split.py
 delete mode 100644 backend/ml_model/repository/safe_train_grid.py

diff --git a/backend/ml_model/repository/demo2.py b/backend/ml_model/repository/demo2.py
deleted file mode 100644
index a649b807..00000000
--- a/backend/ml_model/repository/demo2.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import pickle
-
-from data_preprocessing import DataProcessor
-from file_reader import FileReader
-from sklearn.model_selection import GridSearchCV, train_test_split
-from sklearn.tree import DecisionTreeClassifier
-
-# Define the dataset path
-current_dir = os.path.dirname(os.path.abspath(__file__))
-csv_file_path = os.path.join(current_dir, "single_transaction.csv")
-
-
-def train_and_save_decision_tree_models():
-    """
-    Train 5 decision tree models with different configurations and save them as .pkl files.
-    """
-    # Load and preprocess data
-    file_reader = FileReader(csv_file_path)
-    df_dropped, inputs, target = file_reader.read_file()
-
-    data_processor = DataProcessor(inputs)
-    inputs_encoded = data_processor.encode_categorical_columns()
-    inputs_n = data_processor.drop_categorical_columns()
-
-    # Define configurations for five models
-    model_configs = [
-        {
-            "criterion": "gini",
-            "max_depth": [3, 5],
-            "min_samples_split": [2, 3],
-            "test_size": 0.3,
-        },
-        {
-            "criterion": "entropy",
-            "max_depth": [None, 10],
-            "min_samples_split": [2],
-            "test_size": 0.25,
-        },
-        {
-            "criterion": "gini",
-            "max_depth": [8, 12],
-            "min_samples_split": [5, 10],
-            "test_size": 0.2,
-        },
-        {
-            "criterion": "entropy",
-            "max_depth": [5],
-            "min_samples_split": [3, 4],
-            "test_size": 0.35,
-        },
-        {
-            "criterion": "gini",
-            "max_depth": [7, 9],
-            "min_samples_split": [6, 8],
-            "test_size": 0.3,
-        },
-    ]
-
-    for i, config in enumerate(model_configs, start=1):
-        # Split the data
-        x_train, x_test, y_train, y_test = train_test_split(
-            inputs_n, target, test_size=config["test_size"], random_state=42
-        )
-
-        # Define the grid search parameters
-        param_grid = {
-            "criterion": [config["criterion"]],  # Use specific criterion for each model
-            "max_depth": config["max_depth"],
-            "min_samples_split": config["min_samples_split"],
-        }
-
-        # Perform grid search
-        grid_search = GridSearchCV(
-            DecisionTreeClassifier(), param_grid, cv=5, scoring="accuracy"
-        )
-        grid_search.fit(x_train, y_train)
-        best_clf = grid_search.best_estimator_
-        score = best_clf.score(x_test, y_test)
-
-        # Save the trained model
-        model_path = os.path.join(current_dir, f"model_for_demo{i}.pkl")
-        with open(model_path, "wb") as f:
-            pickle.dump({"model": best_clf, "score": score}, f)
-
-        print(f"Model {i} saved at: {model_path}")
-
-
-if __name__ == "__main__":
-    train_and_save_decision_tree_models()
diff --git a/backend/ml_model/repository/safe_grid_search.py b/backend/ml_model/repository/safe_grid_search.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/ml_model/repository/safe_split.py b/backend/ml_model/repository/safe_split.py
new file mode 100644
index 00000000..4849877a
--- /dev/null
+++ b/backend/ml_model/repository/safe_split.py
@@ -0,0 +1,46 @@
+from sklearn.model_selection import train_test_split
+import pandas as pd
+
+
+class SafeSplitter:
+    """
+    A class for safely splitting datasets into training and testing subsets.
+
+    This class ensures that a dataset is properly split while handling cases
+    where the sample size is too small to perform the split.
+    """
+
+    @staticmethod
+    def train_test_split(inputs: pd.DataFrame, target: pd.Series, test_size=0.2, random_state=48):
+        """
+        Splits the dataset into training and testing subsets safely.
+
+        Parameters:
+        -----------
+        inputs : pd.DataFrame
+            Feature set of the dataset.
+
+        target : pd.Series
+            Target labels of the dataset.
+
+        test_size : float, optional (default=0.2)
+            Proportion of the dataset to include in the test split.
+
+        random_state : int, optional (default=48)
+            Controls the shuffling applied to the data before splitting.
+
+        Returns:
+        --------
+        tuple or None
+            Returns a tuple (x_train, x_test, y_train, y_test) if the split is successful.
+            Returns None if there are not enough samples to split.
+        """
+        try:
+            x_train, x_test, y_train, y_test = train_test_split(
+                inputs, target, test_size=test_size, random_state=random_state
+            )
+            return x_train, x_test, y_train, y_test
+        except ValueError as e:
+            if "With n_samples=" in str(e):
+                print("Not enough samples to split. Returning None.")
+            return None
diff --git a/backend/ml_model/repository/safe_train_grid.py b/backend/ml_model/repository/safe_train_grid.py
deleted file mode 100644
index 99e11772..00000000
--- a/backend/ml_model/repository/safe_train_grid.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from sklearn.model_selection import GridSearchCV, train_test_split
-from sklearn.tree import DecisionTreeClassifier
-
-
-def safe_train_test_split(inputs, target, test_size=0.2, random_state=48):
-    try:
-        x_train, x_test, y_train, y_test = train_test_split(
-            inputs, target, test_size=test_size, random_state=random_state
-        )
-        return x_train, x_test, y_train, y_test
-    except ValueError as e:
-        if "With n_samples=" in str(e):
-            print("Not enough samples to split. Returning None.")
-        return None  # Returning None when there aren't enough samples
-
-
-def safe_grid_search(x_train, y_train):
-    try:
-        # Perform grid search
-        clf = DecisionTreeClassifier()
-        param_grid = {
-            "criterion": ["gini", "entropy"],
-            "max_depth": [None] + list(range(1, 11)),
-            "min_samples_split": [2, 5, 10],
-        }
-        grid_search = GridSearchCV(clf, param_grid, cv=5, scoring="accuracy")
-        grid_search.fit(x_train, y_train)
-        return grid_search.best_estimator_
-    except ValueError as e:
-        if "Cannot have number of splits n_splits" in str(e):
-            print("Not enough samples for cross-validation. Returning None.")
-        return None  # Returning None when there aren't enough samples