Skip to content

Commit

Permalink
Working versions of RF, MLP, LR, KNN
Browse files Browse the repository at this point in the history
  • Loading branch information
shalinis602 committed Jul 14, 2024
1 parent ff3dc74 commit a5c0447
Show file tree
Hide file tree
Showing 11 changed files with 318 additions and 2 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
*.pkl
*.csv
*.out
*.txt
*.png
!requirements.txt
venv/
17 changes: 15 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,18 @@
imbalanced_learn==0.12.3
cycler==0.12.1
fonttools==4.53.1
imbalanced-learn==0.12.3
joblib==1.4.2
kiwisolver==1.4.5
lightgbm==4.4.0
matplotlib==3.5.1
numpy==1.21.5
packaging==24.1
pandas==1.4.2
scikit_learn==1.0.2
pillow==10.4.0
pyparsing==3.1.2
python-dateutil==2.9.0.post0
pytz==2024.1
scikit-learn==1.0.2
scipy==1.10.1
six==1.16.0
threadpoolctl==3.5.0
21 changes: 21 additions & 0 deletions results/mlp.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,24 @@ Validation Confusion Matrix after tuning:
[ 1 0 0 12 0]
[ 0 0 0 0 21]]

Test Accuracy: 1.00
Test Classification Report:
precision recall f1-score support

BRCA 1.00 1.00 1.00 27
COAD 1.00 1.00 1.00 8
KIRC 1.00 1.00 1.00 15
LUAD 1.00 1.00 1.00 19
PRAD 1.00 1.00 1.00 11

accuracy 1.00 80
macro avg 1.00 1.00 1.00 80
weighted avg 1.00 1.00 1.00 80

Test Confusion Matrix:
[[27 0 0 0 0]
[ 0 8 0 0 0]
[ 0 0 15 0 0]
[ 0 0 0 19 0]
[ 0 0 0 0 11]]

Binary file modified results/pre_processing/explained_variance_plot.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added src/models/__pycache__/lightgbm.cpython-39.pyc
Binary file not shown.
81 changes: 81 additions & 0 deletions src/models/knn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import os
import sys, datetime
import pickle
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Generate a timestamp for this run so every execution gets its own log file
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file = f"logs/knn_{timestamp}.out"

# open() does not create missing parent directories, so make sure the log
# directory exists before redirecting (it may be absent on a fresh checkout).
os.makedirs(os.path.dirname(log_file), exist_ok=True)

# Redirect stdout and stderr to the log file. Both streams share ONE
# line-buffered handle: two independent handles on the same path each keep
# their own buffer, so their output could reach the file out of order.
sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)

def pickle_deserialize_object(filename):
    """Return the object stored in the pickle file at ``filename``.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files produced by a trusted source.
    """
    with open(filename, mode='rb') as source:
        loaded = pickle.load(source)
    return loaded

def pickle_serialize_object(filename, obj):
    """Pickle ``obj`` into ``filename``, replacing any existing file."""
    with open(filename, mode='wb') as sink:
        pickle.dump(obj, file=sink)

def main():
    """Grid-search a k-nearest-neighbours classifier and report its scores.

    Loads the pickled training, validation and test inputs, runs a 3-fold
    grid search over the KNN hyper-parameters on the training data, then
    writes accuracy, classification report and confusion matrix for the
    validation and test sets to ``results/knn.txt``.
    """
    def load(directory, name):
        # Every input is a pickle file addressed as <directory>/<name>.
        return pickle_deserialize_object(os.path.join(directory, name))

    # Inputs are spread over three directories.
    X_train_pca = load('data/processed', 'X_train_pca.pkl')
    y_train_resampled = load('data/processed', 'y_train_resampled.pkl')
    X_val_pca = load('data/processed/transformed', 'X_val_pca.pkl')
    X_test_pca = load('data/processed/transformed', 'X_test_pca.pkl')
    y_val = load('data/processed/split_data', 'y_val.pkl')
    y_test = load('data/processed/split_data', 'y_test.pkl')

    # Exhaustive search over neighbour count, vote weighting and metric.
    search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid={
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'p': [1, 2],  # 1 for Manhattan distance, 2 for Euclidean distance
        },
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train_pca, y_train_resampled)
    best_knn = search.best_estimator_

    def evaluate(features, labels):
        # Score the tuned model on one labelled data set.
        predicted = best_knn.predict(features)
        return (
            accuracy_score(labels, predicted),
            classification_report(labels, predicted),
            confusion_matrix(labels, predicted),
        )

    val_accuracy, val_report, val_matrix = evaluate(X_val_pca, y_val)
    test_accuracy, test_report, test_matrix = evaluate(X_test_pca, y_test)

    # Assemble the report first, then write it out in one go.
    report_parts = [
        f"Validation Accuracy: {val_accuracy:.2f}\n",
        "Validation Classification Report:\n",
        val_report + '\n',
        "Validation Confusion Matrix:\n",
        str(val_matrix) + '\n\n',
        f"Test Accuracy: {test_accuracy:.2f}\n",
        "Test Classification Report:\n",
        test_report + '\n',
        "Test Confusion Matrix:\n",
        str(test_matrix) + '\n',
    ]
    with open('results/knn.txt', 'w') as out:
        out.writelines(report_parts)


if __name__ == "__main__":
    main()
84 changes: 84 additions & 0 deletions src/models/light_gbm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import os
import sys, datetime
import pickle
import lightgbm as lgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Generate a timestamp for this run so every execution gets its own log file
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file = f"logs/light_gbm_{timestamp}.out"

# open() does not create missing parent directories, so make sure the log
# directory exists before redirecting (it may be absent on a fresh checkout).
os.makedirs(os.path.dirname(log_file), exist_ok=True)

# Redirect stdout and stderr to the log file. Both streams share ONE
# line-buffered handle: two independent handles on the same path each keep
# their own buffer, so their output could reach the file out of order.
sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)

def pickle_deserialize_object(filename):
    """Read ``filename`` as a pickle file and return the object it holds.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files produced by a trusted source.
    """
    with open(filename, mode='rb') as pickle_file:
        restored = pickle.load(pickle_file)
    return restored

def pickle_serialize_object(filename, obj):
    """Write ``obj`` to ``filename`` as a pickle, overwriting the file."""
    with open(filename, mode='wb') as pickle_file:
        pickle.dump(obj, file=pickle_file)

def main():
    """Grid-search a LightGBM classifier and report its scores.

    Loads the pickled training, validation and test inputs, runs a 3-fold
    grid search over the LightGBM hyper-parameters on the training data,
    then writes accuracy, classification report and confusion matrix for
    the validation and test sets to ``results/lightgbm.txt``.
    """
    # Inputs are spread over three directories.
    train_dir = 'data/processed'
    transformed_dir = 'data/processed/transformed'
    split_dir = 'data/processed/split_data'

    X_train_pca = pickle_deserialize_object(os.path.join(train_dir, 'X_train_pca.pkl'))
    y_train_resampled = pickle_deserialize_object(os.path.join(train_dir, 'y_train_resampled.pkl'))
    X_val_pca = pickle_deserialize_object(os.path.join(transformed_dir, 'X_val_pca.pkl'))
    X_test_pca = pickle_deserialize_object(os.path.join(transformed_dir, 'X_test_pca.pkl'))
    y_val = pickle_deserialize_object(os.path.join(split_dir, 'y_val.pkl'))
    y_test = pickle_deserialize_object(os.path.join(split_dir, 'y_test.pkl'))

    # Candidate hyper-parameters; single-element lists pin a value.
    search_space = {
        'n_estimators': [50, 100],
        'learning_rate': [0.01],
        'num_leaves': [31],
        'max_depth': [10, 20],
        'min_child_samples': [100, 200],
        'force_col_wise': [True],
    }

    # Fixed seed on the estimator, exhaustive search with 3-fold CV.
    tuner = GridSearchCV(
        lgb.LGBMClassifier(random_state=1),
        search_space,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    tuner.fit(X_train_pca, y_train_resampled)
    best_lgbm = tuner.best_estimator_

    # Validation metrics
    val_predictions = best_lgbm.predict(X_val_pca)
    val_metrics = (
        accuracy_score(y_val, val_predictions),
        classification_report(y_val, val_predictions),
        confusion_matrix(y_val, val_predictions),
    )

    # Test metrics
    test_predictions = best_lgbm.predict(X_test_pca)
    test_metrics = (
        accuracy_score(y_test, test_predictions),
        classification_report(y_test, test_predictions),
        confusion_matrix(y_test, test_predictions),
    )

    val_accuracy, val_report, val_matrix = val_metrics
    test_accuracy, test_report, test_matrix = test_metrics

    # Build the whole report in memory, then write it in a single call.
    report_text = "".join([
        f"Validation Accuracy: {val_accuracy:.2f}\n",
        "Validation Classification Report:\n",
        val_report + '\n',
        "Validation Confusion Matrix:\n",
        str(val_matrix) + '\n\n',
        f"Test Accuracy: {test_accuracy:.2f}\n",
        "Test Classification Report:\n",
        test_report + '\n',
        "Test Confusion Matrix:\n",
        str(test_matrix) + '\n',
    ])
    with open('results/lightgbm.txt', 'w') as out:
        out.write(report_text)


if __name__ == "__main__":
    main()
Empty file removed src/models/lightgbm.py
Empty file.
82 changes: 82 additions & 0 deletions src/models/logistic_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import os
import sys, datetime
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Generate a timestamp for this run so every execution gets its own log file
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file = f"logs/logistic_regression_{timestamp}.out"

# open() does not create missing parent directories, so make sure the log
# directory exists before redirecting (it may be absent on a fresh checkout).
os.makedirs(os.path.dirname(log_file), exist_ok=True)

# Redirect stdout and stderr to the log file. Both streams share ONE
# line-buffered handle: two independent handles on the same path each keep
# their own buffer, so their output could reach the file out of order.
sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)

def pickle_deserialize_object(filename):
    """Unpickle and return the object saved in ``filename``.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files produced by a trusted source.
    """
    with open(filename, mode='rb') as stream:
        obj = pickle.load(stream)
    return obj

def pickle_serialize_object(filename, obj):
    """Store ``obj`` as a pickle at ``filename``, truncating any old file."""
    with open(filename, mode='wb') as stream:
        pickle.dump(obj, file=stream)

def main():
    """Grid-search a logistic-regression classifier and report its scores.

    Loads the pickled training, validation and test inputs, runs a 3-fold
    grid search over the logistic-regression hyper-parameters on the
    training data, then writes accuracy, classification report and
    confusion matrix for the validation and test sets to
    ``results/logistic_regression.txt``.
    """
    # Deserialize the input
    input_dir = 'data/processed'
    X_train_pca = pickle_deserialize_object(os.path.join(input_dir, 'X_train_pca.pkl'))
    y_train_resampled = pickle_deserialize_object(os.path.join(input_dir, 'y_train_resampled.pkl'))

    input_dir2 = 'data/processed/transformed'
    X_val_pca = pickle_deserialize_object(os.path.join(input_dir2, 'X_val_pca.pkl'))
    X_test_pca = pickle_deserialize_object(os.path.join(input_dir2, 'X_test_pca.pkl'))

    input_dir3 = 'data/processed/split_data'
    y_val = pickle_deserialize_object(os.path.join(input_dir3, 'y_val.pkl'))
    y_test = pickle_deserialize_object(os.path.join(input_dir3, 'y_test.pkl'))

    # Define the parameter grid for GridSearchCV as a list of sub-grids, so
    # that each penalty is only combined with the parameters it actually uses.
    # 'saga' supports all penalties including 'elasticnet'.
    shared = {'solver': ['saga'], 'max_iter': [100, 200, 500]}
    strengths = [0.1, 1.0, 10.0]
    param_grid = [
        {**shared, 'penalty': ['l1', 'l2'], 'C': strengths},
        # 'elasticnet' requires an explicit l1_ratio in [0, 1]; with the
        # default (None) every fit raises ValueError and the candidate is
        # scored as NaN, so it could never be selected.
        {**shared, 'penalty': ['elasticnet'], 'C': strengths,
         'l1_ratio': [0.25, 0.5, 0.75]},
        # Without a penalty C has no effect, so it is left at its default
        # instead of refitting the same model once per C value.
        # NOTE: the string 'none' matches scikit-learn 1.0.2 as pinned in
        # requirements.txt; later releases spell this penalty=None.
        {**shared, 'penalty': ['none']},
    ]

    # Initialize and fit LogisticRegression with GridSearchCV
    lr = LogisticRegression(random_state=1)
    grid_search = GridSearchCV(lr, param_grid, cv=3, n_jobs=-1, verbose=1)
    grid_search.fit(X_train_pca, y_train_resampled)

    # Get the best estimator (refitted on the full training data)
    best_lr = grid_search.best_estimator_

    # Evaluate on validation set
    y_val_pred = best_lr.predict(X_val_pca)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_classification_report = classification_report(y_val, y_val_pred)
    val_confusion_matrix = confusion_matrix(y_val, y_val_pred)

    # Evaluate on test set
    y_test_pred = best_lr.predict(X_test_pca)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_classification_report = classification_report(y_test, y_test_pred)
    test_confusion_matrix = confusion_matrix(y_test, y_test_pred)

    # Write results to a file
    output_filename = 'results/logistic_regression.txt'
    with open(output_filename, 'w') as f:
        f.write(f"Validation Accuracy: {val_accuracy:.2f}\n")
        f.write("Validation Classification Report:\n")
        f.write(val_classification_report + '\n')
        f.write("Validation Confusion Matrix:\n")
        f.write(str(val_confusion_matrix) + '\n\n')

        f.write(f"Test Accuracy: {test_accuracy:.2f}\n")
        f.write("Test Classification Report:\n")
        f.write(test_classification_report + '\n')
        f.write("Test Confusion Matrix:\n")
        f.write(str(test_confusion_matrix) + '\n')


if __name__ == "__main__":
    main()
22 changes: 22 additions & 0 deletions src/models/mlp.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
import os
import sys, datetime
import pickle
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Generate a timestamp for this run so every execution gets its own log file
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file = f"logs/mlp_{timestamp}.out"

# open() does not create missing parent directories, so make sure the log
# directory exists before redirecting (it may be absent on a fresh checkout).
os.makedirs(os.path.dirname(log_file), exist_ok=True)

# Redirect stdout and stderr to the log file. Both streams share ONE
# line-buffered handle: two independent handles on the same path each keep
# their own buffer, so their output could reach the file out of order.
sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)

def pickle_deserialize_object(filename):
    """Open ``filename`` in binary mode and return its unpickled content.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files produced by a trusted source.
    """
    with open(filename, mode='rb') as handle:
        content = pickle.load(handle)
    return content
Expand Down Expand Up @@ -63,6 +72,12 @@ def main():
val_classification_report_tuned = classification_report(y_val, y_val_pred_tuned)
val_confusion_matrix_tuned = confusion_matrix(y_val, y_val_pred_tuned)

# Evaluate on test data
y_test_pred = mlp.predict(X_test_pca)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_classification_report = classification_report(y_test, y_test_pred)
test_confusion_matrix = confusion_matrix(y_test, y_test_pred)

# Write results to a file
output_filename = 'results/mlp.txt'
try:
Expand All @@ -80,6 +95,13 @@ def main():
f.write(val_classification_report_tuned + '\n')
f.write("Validation Confusion Matrix after tuning:\n")
f.write(str(val_confusion_matrix_tuned) + '\n\n')

f.write(f"Test Accuracy: {test_accuracy:.2f}\n")
f.write("Test Classification Report:\n")
f.write(test_classification_report + '\n')
f.write("Test Confusion Matrix:\n")
f.write(str(test_confusion_matrix) + '\n\n')

except Exception as e:
print(f"Error writing to file: {e}")

Expand Down
9 changes: 9 additions & 0 deletions src/models/random_forest.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,18 @@
import os
import sys, datetime
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

# Generate a timestamp for this run so every execution gets its own log file
timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
log_file = f"logs/random_forest_{timestamp}.out"

# open() does not create missing parent directories, so make sure the log
# directory exists before redirecting (it may be absent on a fresh checkout).
os.makedirs(os.path.dirname(log_file), exist_ok=True)

# Redirect stdout and stderr to the log file. Both streams share ONE
# line-buffered handle: two independent handles on the same path each keep
# their own buffer, so their output could reach the file out of order.
sys.stdout = sys.stderr = open(log_file, 'a', buffering=1)

def pickle_deserialize_object(filename):
    """Load the pickle stored at ``filename`` and hand back the object.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files produced by a trusted source.
    """
    with open(filename, mode='rb') as reader:
        result = pickle.load(reader)
    return result
Expand Down

0 comments on commit a5c0447

Please sign in to comment.