main.py
"""Main module of this repository: Parses the arguments and calls the chosen handler"""
from argparse import ArgumentParser, Namespace
import sys
from handler.debug_datasets import debug_datasets_handler
from handler.combine import combine_handler
from handler.cluster import cluster_handler
from handler.arff import arff_handler
from handler.feat_select import feat_select_handler
from handler.pipeline import pipeline_handler
from handler.best_clustering import best_clustering_handler
from handler.counts import counts_handler
from handler.plot_clustering import plot_clustering_handler
from handler.utils import DEBUG_IDENTIFIER


def add_do_debug_arg(parser: ArgumentParser):
    """Adds the do-debug argument to a parser"""
    parser.add_argument(
        '--do-debug', required=False, action='store_true',
        help='Whether we are currently debugging or not'
    )


def add_cohort_arg(parser: ArgumentParser):
    """Adds the cohort argument to a parser"""
    parser.add_argument(
        '--cohort', type=str, required=True,
        help='Whether the cohort is ADNI or ANM; choices: adni, anm'
    )


def add_iteration_arg(parser: ArgumentParser):
    """Adds the iteration argument to a parser"""
    parser.add_argument(
        '--iteration', type=int, required=True,
        help='The clustering iteration (i.e. how many clusterings have been made so far)'
    )


def add_dataset_arg(parser: ArgumentParser):
    """Adds the dataset argument to a parser"""
    parser.add_argument(
        '--dataset', type=str, required=True,
        help='The data set to clean or cluster'
    )


def add_cluster_method_arg(parser: ArgumentParser):
    """Adds the cluster method argument to a parser"""
    parser.add_argument(
        '--cluster-method', type=str, required=True,
        help='Which clustering method to use; choices: rbf, nearest_neighbors'
    )


def add_n_kept_feats_arg(parser: ArgumentParser):
    """Adds the number of kept features argument to the parser"""
    parser.add_argument(
        '--n-kept-feats', type=int, required=False,
        help='The number of features that have been kept. Exclude this argument to use all the features'
    )


def add_n_clusters_arg(parser: ArgumentParser):
    """Adds the number of clusters argument to the parser"""
    parser.add_argument(
        '--n-clusters', type=int, required=True,
        help='The number of clusters to use'
    )


def add_clustering_path_arg(parser: ArgumentParser):
    """Adds the clustering path argument to the parser"""
    parser.add_argument(
        '--clustering-path', type=str, required=True,
        help='Path to a clustering'
    )


def add_file_path_args(parser: ArgumentParser):
    """Adds the arguments that are used for constructing file paths to data"""
    add_cohort_arg(parser=parser)
    add_iteration_arg(parser=parser)
    add_dataset_arg(parser=parser)
    add_n_kept_feats_arg(parser=parser)
    add_n_clusters_arg(parser=parser)
    add_cluster_method_arg(parser=parser)


def parse_args(argv) -> Namespace:
    """Gets the arguments for this repo"""
    parser: ArgumentParser = ArgumentParser()
    subparsers = parser.add_subparsers(dest='handler_type', title='handler_type')
    subparsers.required = True

    # Configure the debug-datasets handler
    debug_datasets_parser: ArgumentParser = subparsers.add_parser('debug-datasets')
    add_cohort_arg(parser=debug_datasets_parser)
    add_dataset_arg(parser=debug_datasets_parser)

    # Configure the combine handler
    combine_parser: ArgumentParser = subparsers.add_parser('combine')
    add_cohort_arg(parser=combine_parser)
    add_dataset_arg(parser=combine_parser)
    combine_parser.add_argument(
        '--mri-path', type=str, required=True, nargs='?', default=None,
        help='Path to the feature-selected MRI data to combine with the rest of the data. If not set, the full data set is used.'
    )
    add_do_debug_arg(parser=combine_parser)

    # Configure the cluster handler
    cluster_parser: ArgumentParser = subparsers.add_parser('cluster')
    add_file_path_args(parser=cluster_parser)
    add_do_debug_arg(parser=cluster_parser)

    # Configure the arff handler
    arff_parser: ArgumentParser = subparsers.add_parser('arff')
    add_file_path_args(parser=arff_parser)
    add_do_debug_arg(parser=arff_parser)
    arff_parser.add_argument(
        '--clustering-score', type=float, required=True,
        help='The clustering score of the previous clustering'
    )

    # Configure the feat-select handler
    feat_select_parser: ArgumentParser = subparsers.add_parser('feat-select')
    add_file_path_args(parser=feat_select_parser)
    add_do_debug_arg(parser=feat_select_parser)

    # Configure the pipeline handler
    pipeline_parser: ArgumentParser = subparsers.add_parser('pipeline')
    add_cohort_arg(parser=pipeline_parser)
    add_dataset_arg(parser=pipeline_parser)
    add_n_clusters_arg(parser=pipeline_parser)
    add_do_debug_arg(parser=pipeline_parser)
    add_cluster_method_arg(parser=pipeline_parser)
    pipeline_parser.add_argument(
        '--n-iterations', type=int, required=True,
        help='The number of iterations to reduce the features and re-cluster'
    )
    pipeline_parser.add_argument(
        '--do-continue', required=False, action='store_true',
        help='Whether to continue the pipeline from its latest iteration or not'
    )

    # Configure the best-clustering handler
    best_clustering_parser: ArgumentParser = subparsers.add_parser('best-clustering')
    add_cohort_arg(parser=best_clustering_parser)
    add_dataset_arg(parser=best_clustering_parser)

    # Configure the counts handler
    counts_parser: ArgumentParser = subparsers.add_parser('counts')
    add_clustering_path_arg(parser=counts_parser)
    counts_parser.add_argument(
        '--feat-map-path', type=str, required=True,
        help='Path to the mapping from PTID to the variant to count'
    )

    # Configure the plot-clustering handler
    plot_clustering_parser: ArgumentParser = subparsers.add_parser('plot-clustering')
    add_clustering_path_arg(parser=plot_clustering_parser)
    plot_clustering_parser.add_argument(
        '--data-path', type=str, required=True,
        help='Path to the data for the clustering plot'
    )

    args: Namespace = parser.parse_args(argv)
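    # When debugging, prefix the data set name so the handlers operate on the smaller debug version of the data set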
    if hasattr(args, 'do_debug') and args.do_debug is True:
        assert hasattr(args, 'dataset')
        args.dataset = DEBUG_IDENTIFIER + args.dataset
    return args


def main(argv: list):
    """Main function of this module"""
    args: Namespace = parse_args(argv)
    if args.handler_type == 'debug-datasets':
        # Create a smaller version (fewer columns) of a data set for debugging
        debug_datasets_handler(cohort=args.cohort, dataset=args.dataset)
    elif args.handler_type == 'combine':
        # Combine the phenotypes, MRI data, and gene expression data into a single data set
        combine_handler(cohort=args.cohort, dataset=args.dataset, mri_path=args.mri_path, do_debug=args.do_debug)
    elif args.handler_type == 'cluster':
        # Obtain the cluster labels for the ARFF
        cluster_handler(
            cohort=args.cohort, dataset=args.dataset, cluster_method=args.cluster_method, n_clusters=args.n_clusters,
            iteration=args.iteration, n_kept_feats=args.n_kept_feats
        )
    elif args.handler_type == 'arff':
        # Make the ARFF to be used with WEKA
        arff_handler(
            cohort=args.cohort, dataset=args.dataset, cluster_method=args.cluster_method, n_clusters=args.n_clusters,
            iteration=args.iteration, n_kept_feats=args.n_kept_feats, clustering_score=args.clustering_score
        )
    elif args.handler_type == 'feat-select':
        # Select the features from the ARFF using WEKA
        feat_select_handler(
            cohort=args.cohort, dataset=args.dataset, cluster_method=args.cluster_method, n_clusters=args.n_clusters,
            iteration=args.iteration, n_kept_feats=args.n_kept_feats
        )
    elif args.handler_type == 'pipeline':
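        # Run the full pipeline: iteratively reduce the features and re-cluster for the given number of iterations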
        pipeline_handler(
            cohort=args.cohort, dataset=args.dataset, cluster_method=args.cluster_method, n_clusters=args.n_clusters,
            n_iterations=args.n_iterations, do_continue=args.do_continue
        )
    elif args.handler_type == 'best-clustering':
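        # Identify the best clustering for the given cohort and data set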
        best_clustering_handler(cohort=args.cohort, dataset=args.dataset)
    elif args.handler_type == 'counts':
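        # Count variants per cluster using the mapping from PTID to variant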
        counts_handler(clustering_path=args.clustering_path, feat_map_path=args.feat_map_path)
    elif args.handler_type == 'plot-clustering':
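        # Plot the clustering using the data at the given path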
        plot_clustering_handler(clustering_path=args.clustering_path, data_path=args.data_path)


if __name__ == '__main__':
    # Pass in the program arguments minus the name of the program
    main(sys.argv[1:])