From 5e6a97ae752d0441dbcbc6caf7cc9122860aa11d Mon Sep 17 00:00:00 2001
From: armaganngul
Date: Sun, 1 Dec 2024 14:43:43 -0500
Subject: [PATCH] Refactored some ml_model stuff

---
 backend/ml_model/repository/demo2.py          | 90 -------------------
 .../ml_model/repository/safe_grid_search.py   |  0
 backend/ml_model/repository/safe_split.py     | 46 ++++++++++
 .../ml_model/repository/safe_train_grid.py    | 32 -------
 4 files changed, 46 insertions(+), 122 deletions(-)
 delete mode 100644 backend/ml_model/repository/demo2.py
 create mode 100644 backend/ml_model/repository/safe_grid_search.py
 create mode 100644 backend/ml_model/repository/safe_split.py
 delete mode 100644 backend/ml_model/repository/safe_train_grid.py

diff --git a/backend/ml_model/repository/demo2.py b/backend/ml_model/repository/demo2.py
deleted file mode 100644
index a649b807..00000000
--- a/backend/ml_model/repository/demo2.py
+++ /dev/null
@@ -1,90 +0,0 @@
-import os
-import pickle
-
-from data_preprocessing import DataProcessor
-from file_reader import FileReader
-from sklearn.model_selection import GridSearchCV, train_test_split
-from sklearn.tree import DecisionTreeClassifier
-
-# Define the dataset path
-current_dir = os.path.dirname(os.path.abspath(__file__))
-csv_file_path = os.path.join(current_dir, "single_transaction.csv")
-
-
-def train_and_save_decision_tree_models():
-    """
-    Train 5 decision tree models with different configurations and save them as .pkl files.
-    """
-    # Load and preprocess data
-    file_reader = FileReader(csv_file_path)
-    df_dropped, inputs, target = file_reader.read_file()
-
-    data_processor = DataProcessor(inputs)
-    inputs_encoded = data_processor.encode_categorical_columns()
-    inputs_n = data_processor.drop_categorical_columns()
-
-    # Define configurations for five models
-    model_configs = [
-        {
-            "criterion": "gini",
-            "max_depth": [3, 5],
-            "min_samples_split": [2, 3],
-            "test_size": 0.3,
-        },
-        {
-            "criterion": "entropy",
-            "max_depth": [None, 10],
-            "min_samples_split": [2],
-            "test_size": 0.25,
-        },
-        {
-            "criterion": "gini",
-            "max_depth": [8, 12],
-            "min_samples_split": [5, 10],
-            "test_size": 0.2,
-        },
-        {
-            "criterion": "entropy",
-            "max_depth": [5],
-            "min_samples_split": [3, 4],
-            "test_size": 0.35,
-        },
-        {
-            "criterion": "gini",
-            "max_depth": [7, 9],
-            "min_samples_split": [6, 8],
-            "test_size": 0.3,
-        },
-    ]
-
-    for i, config in enumerate(model_configs, start=1):
-        # Split the data
-        x_train, x_test, y_train, y_test = train_test_split(
-            inputs_n, target, test_size=config["test_size"], random_state=42
-        )
-
-        # Define the grid search parameters
-        param_grid = {
-            "criterion": [config["criterion"]],  # Use specific criterion for each model
-            "max_depth": config["max_depth"],
-            "min_samples_split": config["min_samples_split"],
-        }
-
-        # Perform grid search
-        grid_search = GridSearchCV(
-            DecisionTreeClassifier(), param_grid, cv=5, scoring="accuracy"
-        )
-        grid_search.fit(x_train, y_train)
-        best_clf = grid_search.best_estimator_
-        score = best_clf.score(x_test, y_test)
-
-        # Save the trained model
-        model_path = os.path.join(current_dir, f"model_for_demo{i}.pkl")
-        with open(model_path, "wb") as f:
-            pickle.dump({"model": best_clf, "score": score}, f)
-
-        print(f"Model {i} saved at: {model_path}")
-
-
-if __name__ == "__main__":
-    train_and_save_decision_tree_models()
diff --git a/backend/ml_model/repository/safe_grid_search.py b/backend/ml_model/repository/safe_grid_search.py
new file mode 100644
index 00000000..e69de29b
diff --git a/backend/ml_model/repository/safe_split.py b/backend/ml_model/repository/safe_split.py
new file mode 100644
index 00000000..4849877a
--- /dev/null
+++ b/backend/ml_model/repository/safe_split.py
@@ -0,0 +1,46 @@
+from sklearn.model_selection import train_test_split
+import pandas as pd
+
+
+class SafeSplitter:
+    """
+    A class for safely splitting datasets into training and testing subsets.
+
+    This class ensures that a dataset is properly split while handling cases
+    where the sample size is too small to perform the split.
+    """
+
+    @staticmethod
+    def train_test_split(inputs: pd.DataFrame, target: pd.Series, test_size=0.2, random_state=48):
+        """
+        Splits the dataset into training and testing subsets safely.
+
+        Parameters:
+        -----------
+        inputs : pd.DataFrame
+            Feature set of the dataset.
+
+        target : pd.Series
+            Target labels of the dataset.
+
+        test_size : float, optional (default=0.2)
+            Proportion of the dataset to include in the test split.
+
+        random_state : int, optional (default=48)
+            Controls the shuffling applied to the data before splitting.
+
+        Returns:
+        --------
+        tuple or None
+            Returns a tuple (x_train, x_test, y_train, y_test) if the split is successful.
+            Returns None if there are not enough samples to split.
+        """
+        try:
+            x_train, x_test, y_train, y_test = train_test_split(
+                inputs, target, test_size=test_size, random_state=random_state
+            )
+            return x_train, x_test, y_train, y_test
+        except ValueError as e:
+            if "With n_samples=" in str(e):
+                print("Not enough samples to split. Returning None.")
+            return None
diff --git a/backend/ml_model/repository/safe_train_grid.py b/backend/ml_model/repository/safe_train_grid.py
deleted file mode 100644
index 99e11772..00000000
--- a/backend/ml_model/repository/safe_train_grid.py
+++ /dev/null
@@ -1,32 +0,0 @@
-from sklearn.model_selection import GridSearchCV, train_test_split
-from sklearn.tree import DecisionTreeClassifier
-
-
-def safe_train_test_split(inputs, target, test_size=0.2, random_state=48):
-    try:
-        x_train, x_test, y_train, y_test = train_test_split(
-            inputs, target, test_size=test_size, random_state=random_state
-        )
-        return x_train, x_test, y_train, y_test
-    except ValueError as e:
-        if "With n_samples=" in str(e):
-            print("Not enough samples to split. Returning None.")
-        return None  # Returning None when there aren't enough samples
-
-
-def safe_grid_search(x_train, y_train):
-    try:
-        # Perform grid search
-        clf = DecisionTreeClassifier()
-        param_grid = {
-            "criterion": ["gini", "entropy"],
-            "max_depth": [None] + list(range(1, 11)),
-            "min_samples_split": [2, 5, 10],
-        }
-        grid_search = GridSearchCV(clf, param_grid, cv=5, scoring="accuracy")
-        grid_search.fit(x_train, y_train)
-        return grid_search.best_estimator_
-    except ValueError as e:
-        if "Cannot have number of splits n_splits" in str(e):
-            print("Not enough samples for cross-validation. Returning None.")
-        return None  # Returning None when there aren't enough samples