representation_experiment.py
"""
PsychOrNot - Experiment 1
"""
from typing import List, Tuple
import itertools
import argparse
import logging
import json
import pandas as pd
import numpy as np
import sklearn.feature_extraction.text
import sklearn.pipeline
import sklearn.ensemble
import sklearn.model_selection
import sklearn.metrics
import sklearn.svm
import sklearn.neighbors
import sklearn.linear_model
# HYPERPARAMETER_OPTIONS contains the hyperparameter options to do a grid search over:
HYPERPARAMETER_OPTIONS = {
    "architecture": ['svm', 'rf', 'knn', 'lr'],
    # Which set of n-grams to include
    "ngram_range": [(1, 1), (2, 2), (3, 3), (4, 4), (1, 2), (1, 3)],
    "normalization": ["none", "tf", "tfidf"],  # The normalization to apply
}
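# hyperparameter_grid() below expands these options into every combination,
# 4 architectures * 6 n-gram ranges * 3 normalizations = 72 configurations,
# e.g. {"architecture": "svm", "ngram_range": (1, 1), "normalization": "none"}.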
# AMINO_ACIDS is the vocabulary used to construct the features.
# These correspond to the FASTA format symbols, except for
# pyrrolysine and selenocysteine (which do not appear in the dataset)
# and special characters, which are ignored.
AMINO_ACIDS = "AGSTNQVILMFYWHPKREDC"
# LABEL_MAP is the mapping from class names to class indices.
LABEL_MAP = {'Thermo': 0, 'Psychro': 1}
# RANDOM_SEED is the initial RNG state used to divide the dataset into training-test sets
# and cross-validation folds.
RANDOM_SEED = 1
# HOLDOUT_SET_SIZE is the fraction of points to set aside for the final holdout test set
# (unused in this experiment). In our experiment, we set aside 10% of the points for the holdout set.
HOLDOUT_SET_SIZE = 0.1
# CROSSVALIDATION_FOLDS is the number of folds to use.
CROSSVALIDATION_FOLDS = 10
# EXPERIMENT_METRICS are the accuracy metrics shown when the experiment is finished (see representation-results.csv)
EXPERIMENT_METRICS = ['train-accuracy', 'validation-accuracy']
# SORT_METRIC is the accuracy metric to rank results by (only used for displaying results)
SORT_METRIC = 'validation-accuracy'
# To ensure features appear in the same order,
# we provide a helper to compute the vocabulary for the n-gram representation:
def feature_names_for_ngram_range(vocabulary: str, ngram_range: Tuple[int, int]) -> List[str]:
    """
    feature_names_for_ngram_range returns the list of possible n-grams
    for the given vocabulary and every n in (ngram_range), inclusive.

    >>> feature_names_for_ngram_range("ab", (1, 2))
    ['a', 'b', 'aa', 'ab', 'ba', 'bb']
    """
    features = []
    for n in range(ngram_range[0], ngram_range[1] + 1):
        features += [''.join(c)
                     for c in itertools.product(vocabulary, repeat=n)]
    return features
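# Note that the feature count grows as len(AMINO_ACIDS) ** n: the (1, 3) range
# above yields 20 + 400 + 8000 = 8420 possible n-gram features per sequence.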
def make_classifier(sweep: dict):
    """
    make_classifier returns the classifier used to compare the representations,
    chosen by the 'architecture' key of the sweep.
    """
    arch = sweep['architecture']
    if arch == 'rf':
        return sklearn.ensemble.RandomForestClassifier(max_depth=3)
    elif arch == 'svm':
        return sklearn.svm.SVC()
    elif arch == 'knn':
        return sklearn.neighbors.KNeighborsClassifier()
    elif arch == 'lr':
        return sklearn.linear_model.LogisticRegression()
    raise ValueError(f"Unknown architecture: {arch}")
def hyperparameter_grid(choices: dict) -> List[dict]:
    """
    hyperparameter_grid returns the list of all possible combinations of the
    hyperparameter choices in (choices), one dict per combination.
    """
    return [dict(zip(choices, x)) for x in itertools.product(*choices.values())]
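# For example (a toy grid, not one used in the experiment):
#   hyperparameter_grid({"a": [1, 2], "b": ["x"]})
#   == [{"a": 1, "b": "x"}, {"a": 2, "b": "x"}]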
def run_sweep(sweep: dict, X_train: pd.Series, y_train: pd.Series,
              X_validation: pd.Series, y_validation: pd.Series) -> dict:
    """
    run_sweep runs one fold of the experiment for a given hyperparameter configuration
    and returns the train and validation classification reports.
    """
    results = {}
    logging.debug(f"Evaluating with hyperparameters: {sweep}")
    # Build the representation: character n-gram counts over the fixed amino-acid
    # vocabulary, optionally followed by tf or tf-idf normalization.
    transform_steps = [('vectorize', sklearn.feature_extraction.text.CountVectorizer(
        analyzer='char',
        lowercase=False,
        ngram_range=sweep["ngram_range"],
        vocabulary=feature_names_for_ngram_range(
            AMINO_ACIDS, sweep["ngram_range"])
    ))]
    if sweep['normalization'] != 'none':
        transform_steps.append(('normalize', sklearn.feature_extraction.text.TfidfTransformer(
            use_idf=(sweep['normalization'] == 'tfidf')
        )))
    pipeline = sklearn.pipeline.Pipeline(transform_steps)
    X_train_t = pipeline.fit_transform(X_train).toarray()
    clf = make_classifier(sweep)
    clf.fit(X_train_t, y_train)
    # Training-set metrics (useful for gauging overfitting).
    y_pred = clf.predict(X_train_t)
    results['train'] = sklearn.metrics.classification_report(
        y_train, y_pred, output_dict=True)
    # Validation-set metrics on the held-out fold.
    X_validation_t = pipeline.transform(X_validation).toarray()
    y_pred = clf.predict(X_validation_t)
    results['validation'] = sklearn.metrics.classification_report(
        y_validation, y_pred, output_dict=True)
    return results
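# The returned dict nests sklearn's classification_report output, so after
# pd.json_normalize(..., sep="-") in main() the per-split accuracies become the
# flat columns 'train-accuracy' and 'validation-accuracy' listed in EXPERIMENT_METRICS.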
def main(args):
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    dataset = pd.read_csv(args.datapath, names=(
        "Classification", "Label", "Sequence"))
    X, y = dataset['Sequence'], dataset['Label'].map(LABEL_MAP)
    # Set aside the holdout set (unused here) so cross-validation only sees
    # the remaining points.
    X_cv, X_holdout, y_cv, y_holdout = sklearn.model_selection.train_test_split(
        X, y, shuffle=True, test_size=HOLDOUT_SET_SIZE, random_state=RANDOM_SEED)
    kfold = sklearn.model_selection.StratifiedKFold(
        n_splits=CROSSVALIDATION_FOLDS, shuffle=True, random_state=RANDOM_SEED)
    folds = []
    for sweep_idx, sweep in enumerate(hyperparameter_grid(HYPERPARAMETER_OPTIONS)):
        for fold_idx, (train_indices, validation_indices) in enumerate(kfold.split(X_cv, y_cv)):
            logging.debug(f"Running fold {fold_idx}")
            X_train, y_train = X_cv.iloc[train_indices], y_cv.iloc[train_indices]
            X_validation, y_validation = X_cv.iloc[validation_indices], y_cv.iloc[validation_indices]
            fold_results = run_sweep(
                sweep, X_train, y_train, X_validation, y_validation)
            fold_results['fold'] = fold_idx
            fold_results['params'] = sweep
            fold_results['sweep_id'] = sweep_idx
            folds.append(pd.json_normalize(fold_results, sep="-"))
    results = pd.concat(folds)
    logging.info(f"Saving results to {args.results}")
    results.to_csv(args.results)
    logging.info(f"Sweeps sorted by {SORT_METRIC}:")
    # Group the results by sweep, compute the mean and standard deviation of the
    # metrics in EXPERIMENT_METRICS, then sort by descending mean of SORT_METRIC
    # (the validation accuracy by default).
    result_frame = results.groupby(['sweep_id'])[EXPERIMENT_METRICS].agg(
        [np.mean, np.std]).sort_values((SORT_METRIC, 'mean'), ascending=False)
    # For convenience's sake, include the hyperparameter configuration with each sweep:
    param_frame = results.groupby('sweep_id').first(
    )[results.columns[results.columns.str.startswith('params')]]
    print(result_frame.join(param_frame))
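# The printed frame has one row per sweep_id with MultiIndex columns such as
# ('validation-accuracy', 'mean') and ('validation-accuracy', 'std'), joined with
# the 'params-*' columns describing that sweep's hyperparameters.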
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Compare representations for the PsychOrNot classifier.")
    parser.add_argument("--datapath", default="dataset/database-pdb.csv",
                        help="The path to the .CSV with the experiment data.")
    parser.add_argument("--results", default="representation-results.csv",
                        help="Where to save the results of the representation comparison.")
    parser.add_argument("--verbose", action="store_true",
                        help="Log debug output (per-fold progress) to the console.")
    args = parser.parse_args()
    main(args)