From 8a912fce6687b1eb49135470a5dfa0e23da13a46 Mon Sep 17 00:00:00 2001
From: Harsh Bandhey
Date: Tue, 14 May 2024 11:07:04 -0700
Subject: [PATCH] checked and corrected legacy eda runner

---
 .gitignore                               |  1 +
 requirements.txt                         |  1 +
 streamline/legacy/EDAJobSubmit.py        |  8 ++------
 streamline/runners/dataprocess_runner.py | 14 +++++++++-----
 streamline/utils/parser_helpers.py       |  4 ++--
 test.cfg                                 | 12 ++++++------
 6 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/.gitignore b/.gitignore
index 4eae9859..d8a3f1e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,4 @@ run_configs/cedars_amd.cfg
 data/AMD_Final/*
 data/PLCO/*
 run_configs/cedars_plco.cfg
+test*.cfg
diff --git a/requirements.txt b/requirements.txt
index c1db4957..5540ecd6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,6 +18,7 @@ xgboost
 lightgbm
 catboost
 gplearn
+group-lasso
 ipython
 fpdf
 scikit-XCS
diff --git a/streamline/legacy/EDAJobSubmit.py b/streamline/legacy/EDAJobSubmit.py
index 22ff4d57..a8326f5e 100644
--- a/streamline/legacy/EDAJobSubmit.py
+++ b/streamline/legacy/EDAJobSubmit.py
@@ -6,18 +6,14 @@
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 sys.path.append(str(Path(SCRIPT_DIR).parent.parent))
 
-from streamline.dataprep.data_process import DataProcess
-from streamline.dataprep.kfold_partitioning import KFoldPartitioner
 from streamline.utils.dataset import Dataset
-from streamline.utils.parser_helpers import process_cli_param
-
+from streamline.dataprep.data_process import DataProcess
 
 def run_cluster(argv):
     param_path = argv[1]
     with open(param_path, "rb") as input_file:
         params = pickle.load(input_file)
-    params = open(param_path)
-    locals().update(params)
+    globals().update(params)
 
     dataset = Dataset(dataset_path, outcome_label, match_label, instance_label, outcome_type)
 
diff --git a/streamline/runners/dataprocess_runner.py b/streamline/runners/dataprocess_runner.py
index b37f3f8c..bf40b710 100644
--- a/streamline/runners/dataprocess_runner.py
+++ b/streamline/runners/dataprocess_runner.py
@@ -274,13 +274,17 @@ def save_metadata(self):
     #     return cluster_params
 
     def get_cluster_params(self, dataset_path):
+        extra_kwargs = locals()
+        extra_kwargs.pop('self')
         job_ref = str(time.time())
         params = {}
         for param in dir(self):
-            if not param.startswith("__"):
+            if not (param.startswith("__") or 'bound method' in str(getattr(self, param))):
                 params[param] = getattr(self, param)
-        params[dataset_path] = dataset_path
-        pickle.dump(params, open(self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle', 'wb'))
+        for param in extra_kwargs:
+            params[param] = extra_kwargs[param]
+        with open(self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle', 'wb') as f:
+            pickle.dump(params, f)
         return job_ref
 
@@ -302,7 +306,7 @@ def submit_slurm_cluster_job(self, dataset_path):
 
         file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/EDAJobSubmit.py'
 
-        command = ' '.join(['srun', 'python', file_path] + (self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'))
+        command = ' '.join(['srun', 'python', file_path] + [self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'])
         sh_file.write(command + '\n')
         sh_file.close()
         os.system('sbatch ' + job_name)
@@ -324,7 +328,7 @@ def submit_lsf_cluster_job(self, dataset_path):
                       '/logs/P1_' + job_ref + '.e\n')
 
         file_path = str(Path(__file__).parent.parent.parent) + "/streamline/legacy" + '/EDAJobSubmit.py'
-        command = ' '.join(['python', file_path] + (self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'))
+        command = ' '.join(['python', file_path] + [self.output_path + '/' + self.experiment_name + '/jobs/P1_' + job_ref + '_params.pickle'])
         sh_file.write(command + '\n')
         sh_file.close()
         os.system('bsub < ' + job_name)
diff --git a/streamline/utils/parser_helpers.py b/streamline/utils/parser_helpers.py
index 90e7e11c..f570f6ad 100644
--- a/streamline/utils/parser_helpers.py
+++ b/streamline/utils/parser_helpers.py
@@ -51,9 +51,9 @@ def str2bool(v):
 
 def save_config(output_path, experiment_name, config_dict):
     if not os.path.exists(config_dict['output_path']):
-        os.mkdir(str(config_dict['output_path']))
+        os.makedirs(str(config_dict['output_path']))
     if not os.path.exists(str(config_dict['output_path']) + '/' + config_dict['experiment_name']):
-        os.mkdir(str(config_dict['output_path']) + '/' + str(config_dict['experiment_name']))
+        os.makedirs(str(config_dict['output_path']) + '/' + str(config_dict['experiment_name']))
     with open(output_path + '/' + experiment_name + '/runparams.pickle', 'wb') as file:
         pickle.dump(config_dict, file, protocol=pickle.HIGHEST_PROTOCOL)
 
diff --git a/test.cfg b/test.cfg
index e564214d..447a2fbd 100644
--- a/test.cfg
+++ b/test.cfg
@@ -21,10 +21,10 @@ dataset_for_rep = './data/DemoData/hcc_data_custom.csv'
 
 [essential run parameters - phases to run - phases 1-9]
 # If True, automatically runs all phases below up until and including do_report, automatically running 'compare_dataset' only if multiple target datasets included
-do_till_report = True
+do_till_report = False
 
 # Individual phases (do_report and do_rep_report are both part of phase 9)
-do_eda = False
+do_eda = True
 do_dataprep = False
 do_feat_imp = False
 do_feat_sel = False
@@ -32,9 +32,9 @@ do_model = False
 do_stats = False
 do_compare_dataset = False
 do_report = False
-do_replicate = True
-do_rep_report = True
-do_cleanup = True
+do_replicate = False
+do_rep_report = False
+do_cleanup = False
 
 [general - phase 1]
 cv_partitions = 3
@@ -102,7 +102,7 @@ del_old_cv = False
 
 [multiprocessing]
 run_parallel = True
-run_cluster = False
+run_cluster = "SLURMOld"
 reserved_memory = 4
 queue = 'defq'
 