Working versions of RF, MLP, LR, KNN

shalinis602 · Jul 14, 2024 · a5c0447 · a5c0447
1 parent ff3dc74
commit a5c0447
Show file tree

Hide file tree

Showing 11 changed files with 318 additions and 2 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,7 @@
 *.pkl
 *.csv
+*.out
+*.txt
+*.png
+!requirements.txt
 venv/
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,18 @@
-imbalanced_learn==0.12.3
+cycler==0.12.1
+fonttools==4.53.1
+imbalanced-learn==0.12.3
+joblib==1.4.2
+kiwisolver==1.4.5
+lightgbm==4.4.0
 matplotlib==3.5.1
 numpy==1.21.5
+packaging==24.1
 pandas==1.4.2
-scikit_learn==1.0.2
+pillow==10.4.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+scikit-learn==1.0.2
+scipy==1.10.1
+six==1.16.0
+threadpoolctl==3.5.0
diff --git a/results/mlp.txt b/results/mlp.txt
@@ -41,3 +41,24 @@ Validation Confusion Matrix after tuning:
  [ 1  0  0 12  0]
  [ 0  0  0  0 21]]
 
+Test Accuracy: 1.00
+Test Classification Report:
+              precision    recall  f1-score   support
+
+        BRCA       1.00      1.00      1.00        27
+        COAD       1.00      1.00      1.00         8
+        KIRC       1.00      1.00      1.00        15
+        LUAD       1.00      1.00      1.00        19
+        PRAD       1.00      1.00      1.00        11
+
+    accuracy                           1.00        80
+   macro avg       1.00      1.00      1.00        80
+weighted avg       1.00      1.00      1.00        80
+
+Test Confusion Matrix:
+[[27  0  0  0  0]
+ [ 0  8  0  0  0]
+ [ 0  0 15  0  0]
+ [ 0  0  0 19  0]
+ [ 0  0  0  0 11]]
+
diff --git a/results/pre_processing/explained_variance_plot.png b/results/pre_processing/explained_variance_plot.png
diff --git a/src/models/__pycache__/lightgbm.cpython-39.pyc b/src/models/__pycache__/lightgbm.cpython-39.pyc
diff --git a/src/models/knn.py b/src/models/knn.py
@@ -0,0 +1,81 @@
+import os
+import sys, datetime
+import pickle
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.model_selection import GridSearchCV
+
+# Generate a timestamp so that every run writes to its own log file
+timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+log_file = f"logs/knn_{timestamp}.out"
+
+# open() does not create missing parent directories; logs/ may be absent
+# from a fresh checkout because its *.out contents are git-ignored.
+os.makedirs(os.path.dirname(log_file), exist_ok=True)
+
+# Redirect stdout and stderr to the log file. Both streams share one
+# line-buffered handle so their output stays in chronological order; two
+# separately buffered handles on the same file interleave unpredictably.
+sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)
+
+def pickle_deserialize_object(filename):
+    """Return the object stored in the pickle file at *filename*."""
+    with open(filename, 'rb') as pickle_file:
+        loaded = pickle.load(pickle_file)
+    return loaded
+
+def pickle_serialize_object(filename, obj):
+    """Pickle *obj* into the file at *filename*, overwriting any existing file."""
+    with open(filename, 'wb') as pickle_file:
+        pickle.dump(obj, pickle_file)
+
+def main():
+    """Tune a k-nearest-neighbours classifier and record its metrics.
+
+    Loads the pickled training, validation and test objects from
+    ``data/processed``, grid-searches ``KNeighborsClassifier`` with 3-fold
+    cross-validation on the training data, scores the best estimator on the
+    validation and test sets, and writes accuracy, classification report and
+    confusion matrix for both sets to ``results/knn.txt``.
+    """
+    def load(directory, name):
+        # Every input is a pickle file addressed as <directory>/<name>.
+        return pickle_deserialize_object(os.path.join(directory, name))
+
+    def score(features, labels):
+        # Predict with the tuned model and return (accuracy, report, matrix).
+        predictions = best_knn.predict(features)
+        return (
+            accuracy_score(labels, predictions),
+            classification_report(labels, predictions),
+            confusion_matrix(labels, predictions),
+        )
+
+    # Training inputs
+    train_dir = 'data/processed'
+    X_train_pca = load(train_dir, 'X_train_pca.pkl')
+    y_train_resampled = load(train_dir, 'y_train_resampled.pkl')
+
+    # Validation / test features
+    features_dir = 'data/processed/transformed'
+    X_val_pca = load(features_dir, 'X_val_pca.pkl')
+    X_test_pca = load(features_dir, 'X_test_pca.pkl')
+
+    # Validation / test labels
+    labels_dir = 'data/processed/split_data'
+    y_val = load(labels_dir, 'y_val.pkl')
+    y_test = load(labels_dir, 'y_test.pkl')
+
+    # Search space: 4 x 2 x 2 = 16 candidate settings
+    param_grid = {
+        'n_neighbors': [3, 5, 7, 9],
+        'weights': ['uniform', 'distance'],
+        'p': [1, 2],  # 1 for Manhattan distance, 2 for Euclidean distance
+    }
+
+    # Exhaustive search with 3-fold cross-validation on the training data
+    grid_search = GridSearchCV(
+        KNeighborsClassifier(),
+        param_grid,
+        cv=3,
+        n_jobs=-1,
+        verbose=1,
+    )
+    grid_search.fit(X_train_pca, y_train_resampled)
+    best_knn = grid_search.best_estimator_
+
+    # Compute every metric before touching the output file
+    val_accuracy, val_classification_report, val_confusion_matrix = score(X_val_pca, y_val)
+    test_accuracy, test_classification_report, test_confusion_matrix = score(X_test_pca, y_test)
+
+    # Write results to a file
+    with open('results/knn.txt', 'w') as results_file:
+        results_file.writelines([
+            f"Validation Accuracy: {val_accuracy:.2f}\n",
+            "Validation Classification Report:\n",
+            val_classification_report + '\n',
+            "Validation Confusion Matrix:\n",
+            str(val_confusion_matrix) + '\n\n',
+            f"Test Accuracy: {test_accuracy:.2f}\n",
+            "Test Classification Report:\n",
+            test_classification_report + '\n',
+            "Test Confusion Matrix:\n",
+            str(test_confusion_matrix) + '\n',
+        ])
+
+# Run the pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/src/models/light_gbm.py b/src/models/light_gbm.py
@@ -0,0 +1,84 @@
+import os
+import sys, datetime
+import pickle
+import lightgbm as lgb
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.model_selection import GridSearchCV
+
+# Generate a timestamp so that every run writes to its own log file
+timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+log_file = f"logs/light_gbm_{timestamp}.out"
+
+# open() does not create missing parent directories; logs/ may be absent
+# from a fresh checkout because its *.out contents are git-ignored.
+os.makedirs(os.path.dirname(log_file), exist_ok=True)
+
+# Redirect stdout and stderr to the log file. Both streams share one
+# line-buffered handle so their output stays in chronological order; two
+# separately buffered handles on the same file interleave unpredictably.
+sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)
+
+def pickle_deserialize_object(filename):
+    """Read *filename* in binary mode and return the unpickled object."""
+    with open(filename, 'rb') as source:
+        restored = pickle.load(source)
+    return restored
+
+def pickle_serialize_object(filename, obj):
+    """Write the pickled form of *obj* to *filename* (binary, truncating)."""
+    with open(filename, 'wb') as sink:
+        pickle.dump(obj, sink)
+
+def main():
+    """Grid-search a LightGBM classifier and write validation/test metrics.
+
+    Reads the pickled inputs under ``data/processed``, fits a 3-fold
+    ``GridSearchCV`` over an ``LGBMClassifier`` on the training data, then
+    evaluates the best estimator on the validation and test sets. Accuracy,
+    classification report and confusion matrix of each set are written to
+    ``results/lightgbm.txt``.
+    """
+    # Training inputs
+    train_path = 'data/processed'
+    X_train_pca = pickle_deserialize_object(os.path.join(train_path, 'X_train_pca.pkl'))
+    y_train_resampled = pickle_deserialize_object(os.path.join(train_path, 'y_train_resampled.pkl'))
+
+    # Validation / test features
+    feature_path = 'data/processed/transformed'
+    X_val_pca = pickle_deserialize_object(os.path.join(feature_path, 'X_val_pca.pkl'))
+    X_test_pca = pickle_deserialize_object(os.path.join(feature_path, 'X_test_pca.pkl'))
+
+    # Validation / test labels
+    label_path = 'data/processed/split_data'
+    y_val = pickle_deserialize_object(os.path.join(label_path, 'y_val.pkl'))
+    y_test = pickle_deserialize_object(os.path.join(label_path, 'y_test.pkl'))
+
+    # Search space: 2 x 1 x 1 x 2 x 2 x 1 = 8 candidate settings
+    param_grid = {
+        'n_estimators': [50, 100],
+        'learning_rate': [0.01],
+        'num_leaves': [31],
+        'max_depth': [10, 20],
+        'min_child_samples': [100, 200],
+        'force_col_wise': [True],
+    }
+
+    # Exhaustive search with 3-fold cross-validation on the training data
+    tuner = GridSearchCV(
+        lgb.LGBMClassifier(random_state=1),
+        param_grid,
+        cv=3,
+        n_jobs=-1,
+        verbose=1,
+    )
+    tuner.fit(X_train_pca, y_train_resampled)
+    best_lgbm = tuner.best_estimator_
+
+    # Validation metrics
+    y_val_pred = best_lgbm.predict(X_val_pca)
+    val_accuracy = accuracy_score(y_val, y_val_pred)
+    val_classification_report = classification_report(y_val, y_val_pred)
+    val_confusion_matrix = confusion_matrix(y_val, y_val_pred)
+
+    # Test metrics
+    y_test_pred = best_lgbm.predict(X_test_pca)
+    test_accuracy = accuracy_score(y_test, y_test_pred)
+    test_classification_report = classification_report(y_test, y_test_pred)
+    test_confusion_matrix = confusion_matrix(y_test, y_test_pred)
+
+    # Assemble the whole report first, then write it in one call
+    report_parts = [
+        f"Validation Accuracy: {val_accuracy:.2f}\n",
+        "Validation Classification Report:\n",
+        val_classification_report + '\n',
+        "Validation Confusion Matrix:\n",
+        str(val_confusion_matrix) + '\n\n',
+        f"Test Accuracy: {test_accuracy:.2f}\n",
+        "Test Classification Report:\n",
+        test_classification_report + '\n',
+        "Test Confusion Matrix:\n",
+        str(test_confusion_matrix) + '\n',
+    ]
+    with open('results/lightgbm.txt', 'w') as report_file:
+        report_file.write("".join(report_parts))
+
+# Run the pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/src/models/lightgbm.py b/src/models/lightgbm.py
diff --git a/src/models/logistic_regression.py b/src/models/logistic_regression.py
@@ -0,0 +1,82 @@
+import os
+import sys, datetime
+import pickle
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
+from sklearn.model_selection import GridSearchCV
+
+# Generate a timestamp so that every run writes to its own log file
+timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+log_file = f"logs/logistic_regression_{timestamp}.out"
+
+# open() does not create missing parent directories; logs/ may be absent
+# from a fresh checkout because its *.out contents are git-ignored.
+os.makedirs(os.path.dirname(log_file), exist_ok=True)
+
+# Redirect stdout and stderr to the log file. Both streams share one
+# line-buffered handle so their output stays in chronological order; two
+# separately buffered handles on the same file interleave unpredictably.
+sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)
+
+def pickle_deserialize_object(filename):
+    """Unpickle and return the object saved at *filename*."""
+    with open(filename, 'rb') as stream:
+        payload = pickle.load(stream)
+    return payload
+
+def pickle_serialize_object(filename, obj):
+    """Dump *obj* as a pickle to *filename*, replacing previous contents."""
+    with open(filename, 'wb') as stream:
+        pickle.dump(obj, stream)
+
+def main():
+    """Tune a logistic-regression classifier and record its metrics.
+
+    Loads the pickled training, validation and test objects from
+    ``data/processed``, grid-searches ``LogisticRegression`` (``saga``
+    solver) with 3-fold cross-validation on the training data, scores the
+    best estimator on the validation and test sets, and writes accuracy,
+    classification report and confusion matrix for both sets to
+    ``results/logistic_regression.txt``.
+    """
+    # Deserialize the input
+    input_dir = 'data/processed'
+    X_train_pca = pickle_deserialize_object(os.path.join(input_dir, 'X_train_pca.pkl'))
+    y_train_resampled = pickle_deserialize_object(os.path.join(input_dir, 'y_train_resampled.pkl'))
+
+    input_dir2 = 'data/processed/transformed'
+    X_val_pca = pickle_deserialize_object(os.path.join(input_dir2, 'X_val_pca.pkl'))
+    X_test_pca = pickle_deserialize_object(os.path.join(input_dir2, 'X_test_pca.pkl'))
+
+    input_dir3 = 'data/processed/split_data'
+    y_val = pickle_deserialize_object(os.path.join(input_dir3, 'y_val.pkl'))
+    y_test = pickle_deserialize_object(os.path.join(input_dir3, 'y_test.pkl'))
+
+    # Define parameter grid for GridSearchCV.
+    # The grid is split per penalty because the penalties do not share the
+    # same valid parameters:
+    #   * 'elasticnet' requires l1_ratio; with the default l1_ratio=None every
+    #     fit raises ValueError and GridSearchCV silently scores it as NaN,
+    #     so the penalty would never actually be evaluated.
+    #   * 'none' ignores C, so crossing it with several C values only repeats
+    #     identical fits.
+    solver = ['saga']  # 'saga' supports all penalties including 'elasticnet'
+    max_iter = [100, 200, 500]
+    regularization = [0.1, 1.0, 10.0]
+    param_grid = [
+        {
+            'penalty': ['l1', 'l2'],
+            'C': regularization,
+            'solver': solver,
+            'max_iter': max_iter,
+        },
+        {
+            'penalty': ['elasticnet'],
+            'l1_ratio': [0.25, 0.5, 0.75],  # mix between pure L2 (0) and pure L1 (1)
+            'C': regularization,
+            'solver': solver,
+            'max_iter': max_iter,
+        },
+        {
+            'penalty': ['none'],
+            'solver': solver,
+            'max_iter': max_iter,
+        },
+    ]
+
+    # Initialize and fit LogisticRegression with GridSearchCV
+    lr = LogisticRegression(random_state=1)
+    grid_search = GridSearchCV(lr, param_grid, cv=3, n_jobs=-1, verbose=1)
+    grid_search.fit(X_train_pca, y_train_resampled)
+
+    # Get the best estimator
+    best_lr = grid_search.best_estimator_
+
+    # Evaluate on validation set
+    y_val_pred = best_lr.predict(X_val_pca)
+    val_accuracy = accuracy_score(y_val, y_val_pred)
+    val_classification_report = classification_report(y_val, y_val_pred)
+    val_confusion_matrix = confusion_matrix(y_val, y_val_pred)
+
+    # Evaluate on test set
+    y_test_pred = best_lr.predict(X_test_pca)
+    test_accuracy = accuracy_score(y_test, y_test_pred)
+    test_classification_report = classification_report(y_test, y_test_pred)
+    test_confusion_matrix = confusion_matrix(y_test, y_test_pred)
+
+    # Write results to a file
+    output_filename = 'results/logistic_regression.txt'
+    with open(output_filename, 'w') as f:
+        f.write(f"Validation Accuracy: {val_accuracy:.2f}\n")
+        f.write("Validation Classification Report:\n")
+        f.write(val_classification_report + '\n')
+        f.write("Validation Confusion Matrix:\n")
+        f.write(str(val_confusion_matrix) + '\n\n')
+
+        f.write(f"Test Accuracy: {test_accuracy:.2f}\n")
+        f.write("Test Classification Report:\n")
+        f.write(test_classification_report + '\n')
+        f.write("Test Confusion Matrix:\n")
+        f.write(str(test_confusion_matrix) + '\n')
+
+# Run the pipeline only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/src/models/mlp.py b/src/models/mlp.py
@@ -1,9 +1,18 @@
 import os
+import sys, datetime
 import pickle
 from sklearn.neural_network import MLPClassifier
 from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 from sklearn.model_selection import GridSearchCV
 
+# Generate a timestamp so that every run writes to its own log file
+timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+log_file = f"logs/mlp_{timestamp}.out"
+
+# open() does not create missing parent directories; logs/ may be absent
+# from a fresh checkout because its *.out contents are git-ignored.
+os.makedirs(os.path.dirname(log_file), exist_ok=True)
+
+# Redirect stdout and stderr to the log file. Both streams share one
+# line-buffered handle so their output stays in chronological order; two
+# separately buffered handles on the same file interleave unpredictably.
+sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)
+
 def pickle_deserialize_object(filename):
     with open(filename, 'rb') as f:
         return pickle.load(f)
@@ -63,6 +72,12 @@ def main():
     val_classification_report_tuned = classification_report(y_val, y_val_pred_tuned)
     val_confusion_matrix_tuned = confusion_matrix(y_val, y_val_pred_tuned)
 
+    # Evaluate on test data
+    y_test_pred = mlp.predict(X_test_pca)
+    test_accuracy = accuracy_score(y_test, y_test_pred)
+    test_classification_report = classification_report(y_test, y_test_pred)
+    test_confusion_matrix = confusion_matrix(y_test, y_test_pred)
+
     # Write results to a file
     output_filename = 'results/mlp.txt'
     try:
@@ -80,6 +95,13 @@ def main():
             f.write(val_classification_report_tuned + '\n')
             f.write("Validation Confusion Matrix after tuning:\n")
             f.write(str(val_confusion_matrix_tuned) + '\n\n')
+
+            f.write(f"Test Accuracy: {test_accuracy:.2f}\n")
+            f.write("Test Classification Report:\n")
+            f.write(test_classification_report + '\n')
+            f.write("Test Confusion Matrix:\n")
+            f.write(str(test_confusion_matrix) + '\n\n')
+
     except Exception as e:
         print(f"Error writing to file: {e}")
 

diff --git a/src/models/random_forest.py b/src/models/random_forest.py
@@ -1,9 +1,18 @@
 import os
+import sys, datetime
 import pickle
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
 from sklearn.model_selection import GridSearchCV
 
+# Generate a timestamp so that every run writes to its own log file
+timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+log_file = f"logs/random_forest_{timestamp}.out"
+
+# open() does not create missing parent directories; logs/ may be absent
+# from a fresh checkout because its *.out contents are git-ignored.
+os.makedirs(os.path.dirname(log_file), exist_ok=True)
+
+# Redirect stdout and stderr to the log file. Both streams share one
+# line-buffered handle so their output stays in chronological order; two
+# separately buffered handles on the same file interleave unpredictably.
+sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)
+
 def pickle_deserialize_object(filename):
     with open(filename, 'rb') as f:
         return pickle.load(f)