From 8a912fce6687b1eb49135470a5dfa0e23da13a46 Mon Sep 17 00:00:00 2001
From: Harsh Bandhey
Date: Tue, 14 May 2024 11:07:04 -0700
Subject: [PATCH] checked and corrected legacy eda runner

---
 .gitignore                               |  1 +
 requirements.txt                         |  1 +
 streamline/legacy/EDAJobSubmit.py        |  8 ++------
 streamline/runners/dataprocess_runner.py | 14 +++++++++-----
 streamline/utils/parser_helpers.py       |  4 ++--
 test.cfg                                 | 12 ++++++------
 6 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4eae9859..d8a3f1e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,4 @@ run_configs/cedars_amd.cfg
 data/AMD_Final/*
 data/PLCO/*
 run_configs/cedars_plco.cfg
+test*.cfg
diff --git a/requirements.txt b/requirements.txt
index c1db4957..5540ecd6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,6 +18,7 @@ xgboost
 lightgbm
 catboost
 gplearn
+group-lasso
 ipython
 fpdf
 scikit-XCS
diff --git a/streamline/legacy/EDAJobSubmit.py b/streamline/legacy/EDAJobSubmit.py
index 22ff4d57..a8326f5e 100644
--- a/streamline/legacy/EDAJobSubmit.py
+++ b/streamline/legacy/EDAJobSubmit.py
@@ -6,18 +6,14 @@
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(str(Path(SCRIPT_DIR).parent.parent))
 
-from streamline.dataprep.data_process import DataProcess
-from streamline.dataprep.kfold_partitioning import KFoldPartitioner
 from streamline.utils.dataset import Dataset
-from streamline.utils.parser_helpers import process_cli_param
-
+from streamline.dataprep.data_process import DataProcess
 
 def run_cluster(argv):
     param_path = argv[1]
     with open(param_path, "rb") as input_file:
         params = pickle.load(input_file)
-    params = open(param_path)
-    locals().update(params)
+    globals().update(params)
 
     dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type)
 
diff --git a/streamline/runners/dataprocess_runner.py b/streamline/runners/dataprocess_runner.py
index b37f3f8c..bf40b710 100644
--- a/streamline/runners/dataprocess_runner.py
+++ b/streamline/runners/dataprocess_runner.py
@@ -274,13 +274,17 @@ def save_metadata(self):
     #     return cluster_params
 
     def get_cluster_params(self, dataset_path):
+        extra_kwargs = locals()
+        extra_kwargs.pop('self')
         job_ref = str(time.time())
         params = {}
         for param in dir(self):
-            if not param.startswith("__"):
+            if not (param.startswith("__") or 'bound method' in str(getattr(self, param))):
                 params[param] = getattr(self, param)
-        params[dataset_path] = dataset_path
-        pickle.dump(params, open(self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle', 'wb'))
+        for param in extra_kwargs:
+            params[param] = extra_kwargs[param]
+        with open(self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle', 'wb') as f:
+            pickle.dump(params, f)
         return job_ref
 
@@ -302,7 +306,7 @@ def submit_slurm_cluster_job(self, dataset_path):
 
         file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/EDAJobSubmit.py'
 
-        command = ' '.join(['srun', 'python', file_path] + (self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'))
+        command = ' '.join(['srun', 'python', file_path] + [self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'])
         sh_file.write(command + '\n')
         sh_file.close()
         os.system('sbatch ' + job_name)
@@ -324,7 +328,7 @@ def submit_lsf_cluster_job(self, dataset_path):
                       '/logs/P1_' + job_ref + '.e\n')
 
         file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/EDAJobSubmit.py'
-        command = ' '.join(['python', file_path] + (self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'))
+        command = ' '.join(['python', file_path] + [self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'])
         sh_file.write(command + '\n')
         sh_file.close()
         os.system('bsub < ' + job_name)
diff --git a/streamline/utils/parser_helpers.py b/streamline/utils/parser_helpers.py
index 90e7e11c..f570f6ad 100644
--- a/streamline/utils/parser_helpers.py
+++ b/streamline/utils/parser_helpers.py
@@ -51,9 +51,9 @@ def str2bool(v):
 
 def save_config(output_path, experiment_name, config_dict):
     if not os.path.exists(config_dict['output_path']):
-        os.mkdir(str(config_dict['output_path']))
+        os.makedirs(str(config_dict['output_path']))
     if not os.path.exists(str(config_dict['output_path']) + '/' + config_dict['experiment_name']):
-        os.mkdir(str(config_dict['output_path']) + '/' + str(config_dict['experiment_name']))
+        os.makedirs(str(config_dict['output_path']) + '/' + str(config_dict['experiment_name']))
     with open(output_path + '/' + experiment_name + '/runparams.pickle', 'wb') as file:
         pickle.dump(config_dict, file, protocol=pickle.HIGHEST_PROTOCOL)
 
diff --git a/test.cfg b/test.cfg
index e564214d..447a2fbd 100644
--- a/test.cfg
+++ b/test.cfg
@@ -21,10 +21,10 @@ dataset_for_rep = './data/DemoData/hcc_data_custom.csv'
 
 [essential run parameters - phases to run - phases 1-9]
 # If True, automatically runs all phases below up until and including do_report, automatically running 'compare_dataset' only if multiple target datasets included
-do_till_report = True
+do_till_report = False
 
 # Individual phases (do_report and do_rep_report are both part of phase 9)
-do_eda = False
+do_eda = True
 do_dataprep = False
 do_feat_imp = False
 do_feat_sel = False
@@ -32,9 +32,9 @@ do_model = False
 do_stats = False
 do_compare_dataset = False
 do_report = False
-do_replicate = True
-do_rep_report = True
-do_cleanup = True
+do_replicate = False
+do_rep_report = False
+do_cleanup = False
 
 [general - phase 1]
 cv_partitions = 3
@@ -102,7 +102,7 @@ del_old_cv = False
 
 [multiprocessing]
 run_parallel = True
-run_cluster = False
+run_cluster = "SLURMOld"
 reserved_memory = 4
 queue = 'defq'
 