# utils.py
# Imports.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin


class CustomRemover(BaseEstimator, TransformerMixin):
    """Transformer that drops a fixed list of unneeded columns from a DataFrame."""

    def __init__(self, useless_attribs):
        self.useless_attribs = useless_attribs

    def fit(self, X, y=None):
        # Nothing to learn; the columns to drop are given at construction time.
        return self

    def transform(self, X):
        # Work on a copy so the caller's DataFrame is left untouched.
        X_copy = X.copy()
        X_copy = X_copy.drop(self.useless_attribs, axis=1)
        return X_copy
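

# Illustrative usage sketch (not part of the original module): how CustomRemover
# might be applied to a small DataFrame. The column names below are made up.
def _example_custom_remover():
    import pandas as pd

    df = pd.DataFrame({
        "keep_me": [1, 2, 3],
        "drop_me": ["a", "b", "c"],
    })
    remover = CustomRemover(useless_attribs=["drop_me"])
    # fit() is a no-op; transform() returns a copy of df without "drop_me".
    return remover.fit_transform(df)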


class CatGrouper(BaseEstimator, TransformerMixin):
    """Transformer that groups rare categories of the given columns into "Other".

    A category is kept if its relative frequency in the fit data is at least
    `threshold`; every other value is mapped to "Other" at transform time.
    """

    def __init__(self, cols, threshold=0.1):
        self.cols = cols
        self.threshold = threshold
        self.cat_groups = {}

    def fit(self, X, y=None):
        for attrib in self.cols:
            # Relative frequency of each category.
            vc = X[attrib].value_counts(normalize=True)
            thres = vc < self.threshold
            # Remember only the categories that are frequent enough to keep.
            keep = vc[np.logical_not(thres)].index
            self.cat_groups[attrib] = list(keep)
        return self

    def _map_func(self, v, attrib):
        if v not in self.cat_groups[attrib]:
            v = "Other"
        return v

    def transform(self, X):
        X_copy = X.copy()
        for attrib in self.cols:
            # Map every category not seen as frequent during fit to "Other".
            X_copy[attrib] = X_copy[attrib].apply(self._map_func, attrib=attrib)
        return X_copy
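

# Illustrative usage sketch (not part of the original module): grouping rare
# categories of a toy column into "Other". The data and column name are made up.
def _example_cat_grouper():
    import pandas as pd

    df = pd.DataFrame({"color": ["red"] * 8 + ["blue"] + ["green"]})
    grouper = CatGrouper(cols=["color"], threshold=0.2)
    # "blue" and "green" each have frequency 0.1 < 0.2, so both become "Other".
    return grouper.fit_transform(df)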


def get_pred_ranked_avg(models, ranking, X_test_prep, y_test=None):
    """Weighted average of per-model predictions, one weight per model in `ranking`.

    `models` maps a model name to a list of per-fold fitted models. The
    "xgboost_evalml" entry is handled separately: it predicts on a hard-coded
    feature subset and is appended last, so it must also come last in `ranking`.
    """
    assert len(models) == len(ranking)
    all_y_test_pred = []
    for m, model_folds in models.items():
        model_preds = []
        if m == "xgboost_evalml":
            # Hard-coded feature indices used by the EvalML XGBoost pipeline.
            ind = [0, 2, 3, 5, 8, 9, 10, 11, 15, 17, 22, 24, 25, 28, 29]
            X_test_prep_select = X_test_prep[:, ind]
        for model_fold in model_folds:
            if m == "xgboost_evalml":
                y_test_pred_evalml = model_fold.predict(X_test_prep_select)
            else:
                y_test_pred = model_fold.predict(X_test_prep)
                model_preds.append(y_test_pred)
        if m != "xgboost_evalml":
            # Average this model's predictions over its folds.
            model_preds = np.mean(np.array(model_preds), axis=0)
            all_y_test_pred.append(model_preds)
    all_y_test_pred = np.array(all_y_test_pred)
    # Concatenate our ensemble with EvalML's XGBoost.
    all_y_test_pred = np.concatenate(
        (all_y_test_pred, np.expand_dims(y_test_pred_evalml, 0)), axis=0
    )
    # Rank-weighted sum across models.
    return np.sum(all_y_test_pred * ranking[:, np.newaxis], 0)
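

# Illustrative usage sketch (not part of the original module): calling
# get_pred_ranked_avg with stand-in models. _ConstantModel is a made-up dummy
# exposing the predict() interface the function expects; in the real pipeline
# the values of `models` are lists of fitted per-fold estimators, with the
# "xgboost_evalml" entry last so its weight lines up with the last ranking entry.
def _example_get_pred_ranked_avg():
    class _ConstantModel:
        def __init__(self, value):
            self.value = value

        def predict(self, X):
            return np.full(X.shape[0], self.value)

    # 30 features so the hard-coded EvalML feature indices are valid.
    X_test_prep = np.random.rand(4, 30)
    models = {
        "random_forest": [_ConstantModel(1.0), _ConstantModel(3.0)],  # two folds
        "xgboost_evalml": [_ConstantModel(2.0)],
    }
    ranking = np.array([0.6, 0.4])  # one weight per model, same order as `models`
    # Expected result: 0.6 * 2.0 + 0.4 * 2.0 = 2.0 for every row.
    return get_pred_ranked_avg(models, ranking, X_test_prep)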