#!/usr/bin/env python3
# Copyright 2019-2022 VMware, Inc.
# SPDX-License-Identifier: BSD-3-Clause
import time
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_is_fitted, check_X_y, check_array
###############################################################################
###############################################################################
class DuetClassifier(BaseEstimator, ClassifierMixin):
"""
A Duet classifier.
The two main building blocks of Duet are two classifiers.
The first classifier is a small Coarse-Grained (cg) model - Random Forest
(RF) that is trained using the entire training dataset and which we use
to compute the labeled data predictability*.
The second classifier is a Fine-Grained (fg) model - XGBoost that is
trained using only a predictability-driven fraction of the training
dataset. During classification, all data instances are classified by
the RF, and only the hard data instances (i.e., cases for which the RF
is not sufficiently confident) are forwarded to the XGBoost for
reclassification.
    *The predictability of an instance is defined as 1 - 0.5*d, where d is
    the l2 distance between the class distribution vector predicted by the
    RF (i.e., predict_proba) and the 'perfect' one (i.e., 1 in the correct
    class and 0 in all others).
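    For example, with three classes, a true label in the first class, and an
    RF distribution vector of [0.5, 0.3, 0.2], the predictability is
    1 - 0.5*||[0.5, 0.3, 0.2] - [1, 0, 0]||_2 ~= 1 - 0.5*0.616 ~= 0.69.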
For more information read: Efficient_Multi-Class_Classification_with_Duet.pdf
Parameters
----------
    duet_cg_train_using_feature_subset : list or None, optional (default=None)
        List of feature columns to use for training the cg (RF) model, given
        as DataFrame column names or integer indices (when None, all columns
        are used).
    duet_fg_train_sample_weight_balance : boolean, optional (default=False)
        Use sample weights when training the fg (XGBoost) model. The
        original class weights of the dataset are preserved in the
        predictability-driven fraction used for training the fg (XGBoost)
        model (i.e., the total weight of each class sums to its original
        total class weight in the dataset).
    duet_fg_extend_data_with_cg_distribution : boolean, optional (default=False)
        When True, the dataset used for training/classification by the fg
        (XGBoost) model is extended with the class distribution vector
        predicted by the cg (RF) classifier. That is, the class distribution
        vector is appended as additional features of the dataset used for
        training the fg (XGBoost) model.
    duet_fg_train_data_filter_type : string, optional (default='l2')
        The distance metric used to compute the predictability of the data
        instances. Currently only 'l2' is supported.
    duet_fg_train_dataset_fraction : float, optional (default=0.25)
        A value in (0, 1]. Indicates the fraction of the data that is used
        for training the fg (XGBoost) model. If duet_subsample_only is set
        to True, indicates the data fraction for dataset sub-sampling.
    duet_fg_test_confidence : float, optional (default=0.95)
        A value in [0, 1]. Indicates the confidence threshold (i.e., the
        top-1 probability in the distribution vector) above which an
        instance is not passed to the fg (XGBoost) classifier (i.e., it is
        classified only by the cg (RF) classifier). Used only when
        duet_fg_test=False.
duet_verbose : boolean, optional (default=False)
Verbose printing for debug. Prints warnings and the fraction of the data
that is used for the training and classification by the fg (XGBoost) classifier.
duet_random_np_seed : int, optional (default=42)
Random seed for the numpy package used in Duet.
    cg_rf_params : dict or None, optional (default={'max_leaf_nodes': 1000})
        Parameters for the cg (RF) classifier.
        The default max_leaf_nodes parameter is used to avoid over-fitting
        by the cg (RF) model.
fg_xgb_params : dict or None, optional (default=None)
Parameters for the fg XGBoost classifier.
    duet_subsample_only : boolean, optional (default=False)
        When True, use Duet only for sub-sampling the dataset. fit returns
        the sub-sampled dataset (X', y') according to the
        duet_fg_train_dataset_fraction value (see the usage sketch after
        fit below).
    duet_fg_test : boolean, optional (default=False)
        If True, all data is classified only by the fg (XGBoost) classifier.
Attributes
----------
    classes_ : array of shape (n_classes,)
        The class labels.
    cg_clf_ : RandomForestClassifier
        The cg (Random Forest) classifier.
    fg_clf_ : xgb.XGBClassifier
        The fg (XGBoost) classifier.
    fg_clf_fitted_ : bool
        Remembers whether fit was called for the fg model within Duet.
    fit_time_, predict_time_ : dict
        Temporary. Per-model timing measurements used for debugging.
Example program
---------------
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
cg_rf_params = {
'n_estimators': 20,
'max_leaf_nodes': 100,
}
fg_xgb_params = {
'n_estimators': 1000,
'max_depth': 8,
'learning_rate': 0.01,
}
parameters = {
'duet_fg_train_dataset_fraction': 0.1,
'duet_fg_test_confidence': 0.99,
'cg_rf_params': cg_rf_params,
'fg_xgb_params': fg_xgb_params
}
duet = DuetClassifier()
duet.set_params(**parameters)
duet.fit(X_train, y_train)
y_predicted = duet.predict(X_test)
print(classification_report(y_test, y_predicted, digits=5))
Notes
-----
    The parameters controlling the size of the dataset for the fg training
    (duet_fg_train_dataset_fraction) and the cg confidence level
    (duet_fg_test_confidence) should be tuned specifically for each dataset
    (e.g., by grid search).
    The parameters for the RF and XGBoost classifiers should also be tuned.
    The parameters that work well for the corresponding monolithic models
    are a good starting point.
"""
###########################################################################
###########################################################################
def __init__(self,
### duet parameters
duet_cg_train_using_feature_subset=None,
duet_fg_train_sample_weight_balance=False,
duet_fg_extend_data_with_cg_distribution=False,
duet_fg_train_data_filter_type='l2',
duet_fg_train_dataset_fraction=0.25,
duet_fg_test_confidence=0.95,
duet_verbose=False,
duet_random_np_seed=42,
### arguments for internal classifiers
cg_rf_params={'max_leaf_nodes': 1000},
fg_xgb_params=None,
### use duet for dataset subsampling?
duet_subsample_only=False,
### test using only fg model?
duet_fg_test=False
):
### duet parameters
self.duet_cg_train_using_feature_subset = duet_cg_train_using_feature_subset
self.duet_fg_train_sample_weight_balance = duet_fg_train_sample_weight_balance
self.duet_fg_extend_data_with_cg_distribution = duet_fg_extend_data_with_cg_distribution
self.duet_fg_train_data_filter_type = duet_fg_train_data_filter_type
self.duet_fg_train_dataset_fraction = duet_fg_train_dataset_fraction
self.duet_fg_test_confidence = duet_fg_test_confidence
self.duet_verbose = duet_verbose
self.duet_random_np_seed = duet_random_np_seed
        ### kwargs for internal classifiers
self.cg_rf_params = cg_rf_params
self.fg_xgb_params = fg_xgb_params
### use duet as filter only
self.duet_subsample_only = duet_subsample_only
### test using only fg model?
self.duet_fg_test = duet_fg_test
###########################################################################
###########################################################################
def verify_duet_parameters(self, X, y):
        ### categorical parameters are verified explicitly below
if self.duet_fg_train_sample_weight_balance not in [True, False]:
raise Exception("Illegal duet_fg_train_sample_weight_balance value. Should be in [True, False]")
if self.duet_fg_extend_data_with_cg_distribution not in [True, False]:
raise Exception("Illegal duet_fg_extend_data_with_cg_distribution value. Should be in [True, False]")
        if self.duet_fg_train_dataset_fraction <= 0 or self.duet_fg_train_dataset_fraction > 1:
            raise Exception("Illegal duet_fg_train_dataset_fraction value. Should be in (0, 1]")
if self.duet_fg_test_confidence < 0 or self.duet_fg_test_confidence > 1:
raise Exception("Illegal duet_fg_test_confidence value. Should be in [0, 1]")
if self.duet_verbose not in [True, False]:
raise Exception("Illegal duet_verbose value. Should be in [True, False]")
if self.duet_cg_train_using_feature_subset is not None:
### empty is not allowed
if not len(self.duet_cg_train_using_feature_subset):
raise Exception("Illegal duet_cg_train_using_feature_subset (err1): {}\nShould be None or specify unique columns".format(self.duet_cg_train_using_feature_subset))
### duplicates are not allowed
if len(self.duet_cg_train_using_feature_subset) != len(set(self.duet_cg_train_using_feature_subset)):
raise Exception("Illegal duet_cg_train_using_feature_subset (err2): {}\nShould be None or specify unique columns".format(self.duet_cg_train_using_feature_subset))
### translate column names (if X is a dataframe) to indices
if isinstance(X, pd.DataFrame):
if all(elem in X.columns for elem in self.duet_cg_train_using_feature_subset):
self.duet_cg_train_using_feature_subset = [X.columns.get_loc(i) for i in self.duet_cg_train_using_feature_subset]
### verify legal column values
if not set(self.duet_cg_train_using_feature_subset).issubset(set(range(X.shape[1]))):
raise Exception("Illegal duet_cg_train_using_feature_subset (err3): {}\nShould be None or specify unique columns".format(self.duet_cg_train_using_feature_subset))
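    ### a minimal usage sketch of the feature-subset option (the column
    ### names here are hypothetical, for illustration only); when X is a
    ### DataFrame, names are translated to column indices above:
    ###   duet = DuetClassifier(
    ###       duet_cg_train_using_feature_subset=['pkt_len', 'proto'])
    ###   duet.fit(X_train_df, y_train)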
###########################################################################
###########################################################################
def fit(self, X, y):
"""
Fit estimator.
Parameters
----------
X : array-like or sparse matrix, shape=(n_samples, n_features)
The input samples.
y : array-like, shape=(n_samples,)
The input sample labels.
Returns
-------
self : object
"""
### set numpy seed
np.random.seed(self.duet_random_np_seed)
        ### duet measurements
        self.fit_time_ = {}
        self.predict_time_ = {}
        if self.duet_subsample_only:
            ### keep the original containers so fit can return the sub-sample
            Xc, yc = X, y
### input verification - required by scikit
X, y = check_X_y(X, y)
### duet parameters input checks
self.verify_duet_parameters(X, y)
### store the classes seen during fit - required by scikit
self.classes_ = unique_labels(y)
### init coarse-grained (cg) - random forest
self.cg_clf_ = RandomForestClassifier()
if self.cg_rf_params is None:
            print("\nWarning: no kwargs for the coarse-grained model.\n")
else:
self.cg_clf_.set_params(**self.cg_rf_params)
start = time.time()
### train cg - then, classify training data by cg and obtain classification distribution
        if self.duet_cg_train_using_feature_subset is None:
self.cg_clf_.fit(X, y)
cg_train_dataset_classifications_distribution = self.cg_clf_.predict_proba(X)
else:
### train cg by features subset specified by self.duet_cg_train_using_feature_subset
self.cg_clf_.fit(X[:, self.duet_cg_train_using_feature_subset], y)
cg_train_dataset_classifications_distribution = self.cg_clf_.predict_proba(X[:, self.duet_cg_train_using_feature_subset])
end = time.time()
self.fit_time_['cg'] = end-start
### filter data
train_filters = {
'l2': self.l2_filter
}
if self.duet_fg_train_data_filter_type not in train_filters:
raise Exception("\nUnknown filter type: {}\n".format(self.duet_fg_train_data_filter_type))
filtered_data = train_filters[self.duet_fg_train_data_filter_type](cg_train_dataset_classifications_distribution, y)
        if self.duet_subsample_only:
return Xc[filtered_data], yc[filtered_data]
### train the fine-grained (fg)?
if np.sum(filtered_data) > 0:
### useful stat
if self.duet_verbose:
print("\nTraining a fine-grained classifier (XGBoost) with {}[%] of the data\n".format(100*np.sum(filtered_data)/len(filtered_data)))
### extend X_train with cg confidence for the fg training?
if self.duet_fg_extend_data_with_cg_distribution:
X = np.concatenate((X, cg_train_dataset_classifications_distribution), axis=1)
### init fg - xgboost
self.fg_clf_ = xgb.XGBClassifier()
if self.fg_xgb_params is None:
### useful stat
if self.duet_verbose:
                    print("\nWarning: no kwargs for the fine-grained model.\n")
else:
self.fg_clf_.set_params(**self.fg_xgb_params)
start = time.time()
### train fg + balance
if self.duet_fg_train_sample_weight_balance:
                filtered_instances_per_class = np.bincount(y[filtered_data]).astype('float')
                ### classes absent from the filtered subset get weight 0 here;
                ### np.take below never selects them, as their labels do not appear
                filtered_class_weights = np.max(filtered_instances_per_class) * np.reciprocal(filtered_instances_per_class, out=np.zeros_like(filtered_instances_per_class), where=(filtered_instances_per_class > 0))
                filtered_sample_weights = np.take(filtered_class_weights, y[filtered_data])
self.fg_clf_.fit(X[filtered_data], y[filtered_data], sample_weight=filtered_sample_weights)
### train fg + no balance
else:
self.fg_clf_.fit(X[filtered_data], y[filtered_data])
### set the fg_clf as fitted
self.fg_clf_fitted_ = True
end = time.time()
self.fit_time_['fg'] = end-start
else:
### useful stat
if self.duet_verbose:
print("\nWarning: no training data for the fine-grained model.\n")
### set fg_clf as non-fitted
self.fg_clf_fitted_ = False
### a call to fit should return the classifier - required by scikit
return self
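    ### a minimal usage sketch of the subsample-only mode: with
    ### duet_subsample_only=True, fit returns the predictability-driven
    ### subset (X', y') instead of the fitted estimator, e.g.:
    ###   sampler = DuetClassifier(duet_subsample_only=True,
    ###                            duet_fg_train_dataset_fraction=0.1)
    ###   X_sub, y_sub = sampler.fit(X_train, y_train)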
###########################################################################
###########################################################################
def predict_basic(self, X, proba=False, return_filter=False):
"""
Predict labels for X rows.
Parameters
----------
X : array-like or sparse matrix, shape=(n_samples, n_features)
The input samples.
        proba : bool
            If True, class distributions (predict_proba) are returned
            instead of labels.
        return_filter : bool
            If True, also returns a boolean array of size (n_samples,)
            indicating whether each sample was classified by the cg
            model (False) or the fg model (True).
        Returns
        -------
        y : ndarray of class labels or class distributions for X,
            shape=(n_samples,) or shape=(n_samples, n_classes),
            or a tuple (y, filter).
        """
        ### configured for subsampling only?
        if self.duet_subsample_only:
            raise Exception("\nCannot predict since: self.duet_subsample_only == True\n")
        ### set numpy seed
        np.random.seed(self.duet_random_np_seed)
        ### check that fit has been called - required by scikit
        check_is_fitted(self)
### input verification - required by scikit
X_test = check_array(X)
### no fg model
if not self.fg_clf_fitted_:
### useful stat
if self.duet_verbose:
print("\nWarning: no fine-grained model. Predict only based on the coarse-grained model\n")
            if self.duet_cg_train_using_feature_subset is None:
if proba:
pp = self.cg_clf_.predict_proba(X_test)
if return_filter:
return (pp, np.zeros(len(pp), dtype=bool))
else:
return pp
else:
p = self.cg_clf_.predict(X_test)
if return_filter:
return (p, np.zeros(len(p), dtype=bool))
else:
return p
else:
if proba:
pp = self.cg_clf_.predict_proba(X_test[:, self.duet_cg_train_using_feature_subset])
if return_filter:
return (pp, np.zeros(len(pp), dtype=bool))
else:
return pp
else:
p = self.cg_clf_.predict(X_test[:, self.duet_cg_train_using_feature_subset])
if return_filter:
return (p, np.zeros(len(p), dtype=bool))
else:
return p
### fg model exists
else:
start = time.time()
            if self.duet_cg_train_using_feature_subset is None:
### cg classifications distribution
classifications_distribution = self.cg_clf_.predict_proba(X_test)
else:
### cg classifications distribution
classifications_distribution = self.cg_clf_.predict_proba(X_test[:, self.duet_cg_train_using_feature_subset])
### cg classification
classifications = self.classes_.take(np.argmax(classifications_distribution, axis=1), axis=0)
'''
inconsistency = self.cg_clf_.predict(X_test) != classifications
if any(inconsistency):
raise Exception("\nPredict error: predict_proba inconsistency\n")
'''
end = time.time()
self.predict_time_['cg'] = end-start
### calculate classification confidence level
classification_confidence = np.amax(classifications_distribution, axis=1)
            if self.duet_fg_test:
                ### all
                filtered_data = np.ones(len(X_test), dtype=bool)
else:
### low confidence only
filtered_data = classification_confidence <= self.duet_fg_test_confidence
### predict by the fg model?
if np.sum(filtered_data) > 0:
### useful stat
if self.duet_verbose:
print("\nPredict {}[%] of the data by the fine-grained model\n".format(100*np.sum(filtered_data)/len(filtered_data)))
### extend X_test with cg confidence for the fg prediction?
if self.duet_fg_extend_data_with_cg_distribution:
X_test = np.concatenate((X_test, classifications_distribution), axis=1)
'''
inconsistency = self.classes_.take(np.argmax(classifications_distribution, axis=1), axis=0) != classifications
if any(inconsistency):
raise Exception("\nPredict error: predict_proba inconsistency\n")
'''
            else:
                ### useful stat
                if self.duet_verbose:
                    print("\nWarning: no test data for prediction by the fine-grained model\n")
if proba:
### fg classifications distribution?
if np.sum(filtered_data) > 0:
classifications_distribution[filtered_data] = self.compile_predict_proba(classifications_distribution[filtered_data], self.fg_clf_.predict_proba(X_test[filtered_data]))
if return_filter:
return (classifications_distribution, filtered_data)
else:
return classifications_distribution
else:
### fg classifications
start = time.time()
classifications[filtered_data] = self.fg_clf_.predict(X_test[filtered_data])
end = time.time()
self.predict_time_['fg'] = end-start
if return_filter:
return (classifications, filtered_data)
else:
return classifications
###########################################################################
###########################################################################
def predict(self, X, return_filter=False):
return self.predict_basic(X, False, return_filter)
def predict_proba(self, X, return_filter=False):
return self.predict_basic(X, True, return_filter)
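    ### a minimal usage sketch of return_filter: the returned mask marks the
    ### samples that were reclassified by the fg (XGBoost) model, e.g.:
    ###   y_pred, fg_mask = duet.predict(X_test, return_filter=True)
    ###   fg_fraction = np.mean(fg_mask)  # fraction handled by the fg model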
    def compile_predict_proba(self, cg_predict_proba, fg_predict_proba):
        ### hook for combining the cg and fg class distributions; the default
        ### simply keeps the fg distribution for the reclassified instances
        return fg_predict_proba
###########################################################################
###########################################################################
def l2_filter(self, cg_train_dataset_classifications_distribution, cg_train_dataset_labels):
predictability = []
for distribution, label in zip(cg_train_dataset_classifications_distribution, cg_train_dataset_labels):
vec = np.zeros(len(distribution))
vec[np.where(self.classes_ == label)[0][0]] = 1
predictability.append(1 - 0.5*np.linalg.norm(np.subtract(distribution, vec), 2))
return self.fg_train_data_filter(np.asarray(predictability), cg_train_dataset_labels)
###########################################################################
###########################################################################
def fg_train_data_filter(self, predictability, cg_train_dataset_labels):
### number of total instances
num_total_instances = len(predictability)
### number of fg instances
num_fg_instances = int(self.duet_fg_train_dataset_fraction*num_total_instances)
### per-class low-predictability instances upper limit
num_fg_class_instances = max(int((0.5*num_fg_instances)/len(self.classes_)), 1)
### init to false array
indices = np.zeros(num_total_instances, dtype=bool)
### balanced sampling from each class
for cls in self.classes_:
### class indices
cls_indices = np.flatnonzero(cg_train_dataset_labels == cls)
indices = np.logical_or(indices, self.fg_train_data_filter_h(predictability, cls_indices, num_fg_class_instances))
        ### global balanced sampling over the remaining quota
        indices_to_quota = np.flatnonzero(np.logical_not(indices))
        indices_to_quota_len = num_fg_instances - np.sum(indices)
        indices = np.logical_or(indices, self.fg_train_data_filter_h(predictability, indices_to_quota, indices_to_quota_len))
'''
import matplotlib.pyplot as plt
n, bins, patches = plt.hist([predictability[indices], predictability], 10, stacked=False, log=True)
plt.show()
'''
### return the selected instances marked as 'True'
return indices
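    ### a small worked sketch of the two-phase selection above (assumed
    ### numbers, for illustration): with 1000 instances, 4 classes, and
    ### duet_fg_train_dataset_fraction=0.25, num_fg_instances is 250; phase
    ### one selects up to max(int(0.5*250/4), 1) = 31 low-predictability
    ### instances per class, and phase two fills the remaining quota from
    ### the instances not yet selected.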
###########################################################################
###########################################################################
    def fg_train_data_filter_h(self, predictability, relevant_indices, num_instances):
        ### init to false array
        indices = np.zeros(len(predictability), dtype=bool)
        ### nothing to select: guard against an empty index set or an
        ### exhausted quota, which would break the binning below
        if num_instances <= 0 or len(relevant_indices) == 0:
            return indices
        ### calculate the bin number of each instance - instance_predictability_bins
        predictability_bins = np.linspace(min(predictability[relevant_indices]), max(predictability[relevant_indices]), 10)
        instance_predictability_bins = np.digitize(predictability[relevant_indices], predictability_bins, right=True)
### calculate the size of each bin
bin_count = np.bincount(instance_predictability_bins).astype('float')
        ### binary-search for the per-bin upper bound on the instance count
        l = 0
        r = num_instances
        while l <= r:
            k = l + (r - l)//2
            current_sum = sum([min(k, i) for i in bin_count])
            if current_sum == num_instances:
                l = r + 1
            elif current_sum < num_instances:
                l = k + 1
            else:
                r = k - 1
### now k is the number of instances we take from each bin
for b in np.unique(instance_predictability_bins):
### skip empty bins
if bin_count[b] > 0:
### take entire bin
if bin_count[b] <= k:
np.put(indices, relevant_indices[np.flatnonzero(instance_predictability_bins == b)], np.ones(int(bin_count[b]), dtype=bool))
### sample from bin
else:
sample_from = np.flatnonzero(instance_predictability_bins == b)
sampled = np.random.choice(sample_from, int(k), replace=False)
np.put(indices, relevant_indices[sampled], np.ones(int(k), dtype=bool))
return indices
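    ### a small worked sketch of the per-bin quota search above (assumed
    ### numbers, for illustration): with bin_count=[5, 2, 9] and
    ### num_instances=10, the search settles on k=4, since
    ### sum(min(4, c)) = 4 + 2 + 4 = 10 == num_instances; bins with at most
    ### k instances are taken whole, and larger bins are sampled down to k.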
###########################################################################
###########################################################################