tune.py
import sys
from dataclasses import replace
from pathlib import Path
from typing import List, Tuple, Union

import click


def get_local_path():
    debug_local = True  # to use the local version
    local = (Path(".") / "yspecies").resolve()
    if debug_local and local.exists():
        # sys.path.insert(0, Path(".").as_posix())
        sys.path.insert(0, local.as_posix())
        print("extending paths with local yspecies")
        print(sys.path)
    return local


@click.group()
@click.option('--debug/--no-debug', default=False)
def cli(debug):
    click.echo('Debug mode is %s' % ('on' if debug else 'off'))
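

# Example invocations of the two subcommands defined below (depending on the click
# version, the second may be exposed as "tune-all" rather than "tune_all"):
#   python tune.py tune --trait lifespan --trials 200 --folds 5
#   python tune.py tune_all --trials 10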


def tune_imp(trait: str, metrics: str, trials: int, folds: int, hold_outs: int, repeats: int,
             not_validated_species: Union[bool, List[str]], threads: int, debug_local: bool):
    from loguru import logger
    local = get_local_path()

    from pathlib import Path
    from yspecies.config import Locations
    locations: Locations = Locations("./") if Path("./data").exists() else Locations("../")
    logger.add(locations.logs / "tune_errors.log", backtrace=True, diagnose=True)
    logger.add(locations.logs / "tune.log", rotation="12:00")  # a new file is created each day at noon
    logger.info(f"starting hyper-parameter optimization script with {trials} trials, {folds} folds and {hold_outs} hold-outs!")

    importance_type = "split"
    life_history = ["lifespan", "mass_kg", "mtGC", "metabolic_rate", "temperature", "gestation_days"]

    from yspecies.config import DataLoader
    from yspecies.preprocess import FeatureSelection
    import pprint
    pp = pprint.PrettyPrinter(indent=4)

    # ### Loading data ###
    # Load data from the species/genes/expressions selected by the select_samples.py notebook
    default_selection = FeatureSelection(
        samples=["tissue", "species"],  # samples metadata to include
        species=[],  # species metadata other than the Y label to include
        exclude_from_training=["species"],  # exclude some fields from LightGBM training
        to_predict=trait,  # column to predict
        categorical=["tissue"],
        select_by="shap",
        importance_type=importance_type,
        feature_perturbation="tree_path_dependent"
    )
    loader = DataLoader(locations, default_selection)
    selections = loader.load_life_history()
    to_select = selections[trait]
    optimize(folds, hold_outs, locations, metrics, repeats, to_select, trait, trials)


def optimize(folds, hold_outs, locations, metrics, repeats, to_select, trait, trials):
    from sklearn.pipeline import Pipeline
    from yspecies.workflow import Repeat, Collect
    from yspecies.preprocess import FeatureSelection, DataExtractor
    from yspecies.partition import DataPartitioner, PartitionParameters
    from yspecies.selection import ShapSelector
    from yspecies.tuning import Tune
    from yspecies.explanations import FeatureSummary, FeatureResults
    import optuna
    from optuna import Trial

    # ## Setting up ShapSelector ##
    # Deciding on selection parameters (which fields to include, exclude, predict)
    partition_params = PartitionParameters(folds, hold_outs, 2, 42)
    selection = FeatureSelection(
        samples=["tissue", "species"],  # samples metadata to include
        species=[],  # species metadata other than the Y label to include
        exclude_from_training=["species"],  # exclude some fields from LightGBM training
        to_predict=trait,  # column to predict
        categorical=["tissue"],
        select_by="shap",
        importance_type="split"
    )

    url = "sqlite:///" + str((locations.interim.optimization / f"{trait}.sqlite").absolute())
    print('loading (if exists) study from ' + url)
    storage = optuna.storages.RDBStorage(
        url=url
        # engine_kwargs={'check_same_thread': False}
    )
    study = optuna.multi_objective.study.create_study(directions=['maximize', 'minimize', 'maximize'],
                                                      storage=storage,
                                                      study_name=f"{trait}_{metrics}", load_if_exists=True)
    study.get_pareto_front_trials()

    def objective_parameters(trial: Trial) -> dict:
        return {
            'objective': 'regression',
            'metric': ['mae', 'mse', 'huber'],  # a list keeps the metric order deterministic
            'verbosity': -1,
            'boosting_type': trial.suggest_categorical('boosting_type', ['dart', 'gbdt']),
            'lambda_l1': trial.suggest_uniform('lambda_l1', 0.01, 3.0),
            'lambda_l2': trial.suggest_uniform('lambda_l2', 0.01, 3.0),
            'max_leaves': trial.suggest_int("max_leaves", 15, 25),
            'max_depth': trial.suggest_int('max_depth', 3, 8),
            'feature_fraction': trial.suggest_uniform('feature_fraction', 0.3, 1.0),
            'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.3, 1.0),
            'learning_rate': trial.suggest_uniform('learning_rate', 0.01, 0.1),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 3, 8),
            'drop_rate': trial.suggest_uniform('drop_rate', 0.1, 0.3),
            "verbose": -1
        }
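
    # A sampled configuration might look like this (illustrative values only):
    # {'objective': 'regression', 'metric': ['mae', 'mse', 'huber'], 'verbosity': -1,
    #  'boosting_type': 'dart', 'lambda_l1': 0.5, 'lambda_l2': 1.2, 'max_leaves': 20,
    #  'max_depth': 6, 'feature_fraction': 0.8, 'bagging_fraction': 0.7,
    #  'learning_rate': 0.05, 'min_data_in_leaf': 5, 'drop_rate': 0.2, 'verbose': -1}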
    optimization_parameters = objective_parameters

    from yspecies.workflow import SplitReduce

    def side(i: int):
        print(i)
        return i

    prepare_partition = SplitReduce(
        outputs=DataPartitioner(),
        split=lambda x: [(x[0], replace(partition_params, seed=side(x[2])))],
        reduce=lambda x, output: (output[0], x[1])
    )
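    # prepare_partition pairs the extracted data with partition parameters whose seed
    # is set per repeat (x[2] carries the repeat index; side() merely prints and
    # returns it), then hands the resulting partitions on for SHAP-based selection.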

    partition_and_cv = Pipeline(
        [
            ("prepare partition", prepare_partition),
            ("shap_computation", ShapSelector())  # ('crossvalidator', CrossValidator())
        ]
    )

    def get_objectives(results: List[FeatureResults]) -> Tuple[float, float, float]:
        summary = FeatureSummary(results)
        return (summary.metrics_average.R2, summary.metrics_average.huber, summary.kendall_tau_abs_mean)
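
    # The tuple order matches the study's directions above: R2 is maximized, huber
    # loss is minimized, and the mean absolute Kendall tau is maximized.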
    partition_and_cv_repeat = Pipeline([
        ("repeat_cv_pipe", Repeat(partition_and_cv, repeats, lambda x, i: [x[0], x[1], i])),
        ("collect_mean", Collect(fold=lambda outputs: get_objectives(outputs)))
    ])

    p = Pipeline([
        ('extractor', DataExtractor()),
        ('tune', Tune(partition_and_cv_repeat, study=study, n_trials=trials, parameters_space=optimization_parameters))
    ])

    from yspecies.tuning import MultiObjectiveResults
    results: MultiObjectiveResults = p.fit_transform(to_select)
    best = results.best_trials

    import json
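    # Each Pareto-optimal trial is written to locations.metrics.optimization/<trait>/<i>.json,
    # roughly of this shape (illustrative values):
    # {"number": 17, "params": {"boosting_type": "dart", ...},
    #  "metrics": {"R2": 0.71, "huber": 2.3, "kendall_tau": 0.42}}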
    for i, t in enumerate(best):
        trait_path = locations.metrics.optimization / trait
        if not trait_path.exists():
            trait_path.mkdir(parents=True)
        path = trait_path / f"{i}.json"
        print(f"writing parameters to {path}")
        with open(path, 'w') as f:
            params = t.params
            values = t.values
            to_write = {"number": t.number, "params": params,
                        "metrics": {"R2": values[0], "huber": values[1], "kendall_tau": values[2]}}
            json.dump(to_write, f, sort_keys=True, indent=4)
    print(f"FINISHED HYPER-OPTIMIZING {trait}")


# @click.group(invoke_without_command=True)
@cli.command()
@click.option('--trait', default="lifespan", help='trait name')
@click.option('--metrics', default="r2_huber_kendall", help='metrics names')
@click.option('--trials', default=200, help='number of trials in hyperparameter optimization')
@click.option('--folds', default=5, help='number of folds in cross-validation')
@click.option('--hold_outs', default=1, help='number of hold-outs in cross-validation')
@click.option('--repeats', default=5, help='number of times to repeat validation')
@click.option('--not_validated_species', default=True, help='not_validated_species')
@click.option('--threads', default=1, help='number of threads (1 by default); -1 tries to utilize all cores, which can be dangerous memory-wise')
@click.option('--debug_local', default=True, help='debug local')
def tune(trait: str, metrics: str, trials: int, folds: int, hold_outs: int, repeats: int,
         not_validated_species: Union[bool, List[str]], threads: int, debug_local: bool):
    return tune_imp(trait, metrics, trials, folds, hold_outs, repeats, not_validated_species, threads, debug_local)


@cli.command()
@click.option('--life_history', default=["lifespan", "mass_kg", "gestation_days", "mtGC", "metabolic_rate", "temperature"], help='life-history traits list')
@click.option('--metrics', default="r2_huber_kendall", help='metrics names')
@click.option('--trials', default=10, help='number of trials in hyperparameter optimization')
@click.option('--folds', default=5, help='number of folds in cross-validation')
@click.option('--hold_outs', default=1, help='number of hold-outs in cross-validation')
@click.option('--repeats', default=5, help='number of times to repeat validation')
@click.option('--not_validated_species', default=True, help='not_validated_species')
@click.option('--threads', default=1, help='number of threads (1 by default); -1 tries to utilize all cores, which can be dangerous memory-wise')
@click.option('--debug_local', default=True, help='debug local')
def tune_all(life_history: List[str],
             metrics: str,
             trials: int,
             folds: int,
             hold_outs: int,
             repeats: int,
             not_validated_species: Union[bool, List[str]],
             threads: int,
             debug_local: bool):
    for trait in life_history:
        print(f"tuning {trait} with {trials} trials")
        tune_imp(trait, metrics, trials, folds, hold_outs, repeats, not_validated_species, threads, debug_local)


if __name__ == "__main__":
    cli()