forked from rmunro/pytorch_active_learning
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathuncertainty_sampling.py
197 lines (128 loc) · 7.23 KB
/
uncertainty_sampling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/usr/bin/env python
"""UNCERTAINTY SAMPLING
Uncertainty Sampling examples for Active Learning in PyTorch
This is an open source example to accompany Chapter 3 from the book:
"Human-in-the-Loop Machine Learning"
It contains four Active Learning strategies:
1. Least Confidence Sampling
2. Margin of Confidence Sampling
3. Ratio of Confidence Sampling
4. Entropy-based Sampling
"""
import torch
import math
from random import shuffle
__author__ = "Robert Munro"
__license__ = "MIT"
__version__ = "1.0.1"
class UncertaintySampling():
"""Active Learning methods to sample for uncertainty
"""
def __init__(self, verbose=False):
self.verbose = verbose
def least_confidence(self, prob_dist, sorted=False):
"""
Returns the uncertainty score of an array using
least confidence sampling in a 0-1 range where 1 is the most uncertain
Assumes probability distribution is a pytorch tensor, like:
tensor([0.0321, 0.6439, 0.0871, 0.2369])
Keyword arguments:
prob_dist -- a pytorch tensor of real numbers between 0 and 1 that total to 1.0
sorted -- if the probability distribution is pre-sorted from largest to smallest
"""
if sorted:
simple_least_conf = prob_dist.data[0] # most confident prediction
else:
simple_least_conf = torch.max(prob_dist) # most confident prediction
num_labels = prob_dist.numel() # number of labels
normalized_least_conf = (1 - simple_least_conf) * (num_labels / (num_labels -1))
return normalized_least_conf.item()
def margin_confidence(self, prob_dist, sorted=False):
"""
Returns the uncertainty score of a probability distribution using
margin of confidence sampling in a 0-1 range where 1 is the most uncertain
Assumes probability distribution is a pytorch tensor, like:
tensor([0.0321, 0.6439, 0.0871, 0.2369])
Keyword arguments:
prob_dist -- a pytorch tensor of real numbers between 0 and 1 that total to 1.0
sorted -- if the probability distribution is pre-sorted from largest to smallest
"""
if not sorted:
prob_dist, _ = torch.sort(prob_dist, descending=True) # sort probs so largest is first
difference = (prob_dist.data[0] - prob_dist.data[1]) # difference between top two props
margin_conf = 1 - difference
return margin_conf.item()
def ratio_confidence(self, prob_dist, sorted=False):
"""
Returns the uncertainty score of a probability distribution using
ratio of confidence sampling in a 0-1 range where 1 is the most uncertain
Assumes probability distribution is a pytorch tensor, like:
tensor([0.0321, 0.6439, 0.0871, 0.2369])
Keyword arguments:
prob_dist -- pytorch tensor of real numbers between 0 and 1 that total to 1.0
sorted -- if the probability distribution is pre-sorted from largest to smallest
"""
if not sorted:
prob_dist, _ = torch.sort(prob_dist, descending=True) # sort probs so largest is first
ratio_conf = prob_dist.data[1] / prob_dist.data[0] # ratio between top two props
return ratio_conf.item()
def entropy_based(self, prob_dist):
"""
Returns the uncertainty score of a probability distribution using
entropy
Assumes probability distribution is a pytorch tensor, like:
tensor([0.0321, 0.6439, 0.0871, 0.2369])
Keyword arguments:
prob_dist -- a pytorch tensor of real numbers between 0 and 1 that total to 1.0
sorted -- if the probability distribution is pre-sorted from largest to smallest
"""
log_probs = prob_dist * torch.log2(prob_dist) # multiply each probability by its base 2 log
raw_entropy = 0 - torch.sum(log_probs)
normalized_entropy = raw_entropy / math.log2(prob_dist.numel())
return normalized_entropy.item()
def softmax(self, scores, base=math.e):
"""Returns softmax array for array of scores
Converts a set of raw scores from a model (logits) into a
probability distribution via softmax.
The probability distribution will be a set of real numbers
such that each is in the range 0-1.0 and the sum is 1.0.
Assumes input is a pytorch tensor: tensor([1.0, 4.0, 2.0, 3.0])
Keyword arguments:
prediction -- a pytorch tensor of any positive/negative real numbers.
base -- the base for the exponential (default e)
"""
exps = (base**scores.to(dtype=torch.float)) # exponential for each value in array
sum_exps = torch.sum(exps) # sum of all exponentials
prob_dist = exps / sum_exps # normalize exponentials
return prob_dist
def get_samples(self, model, unlabeled_data, method, feature_method, number=5, limit=10000):
"""Get samples via the given uncertainty sampling method from unlabeled data
Keyword arguments:
model -- current Machine Learning model for this task
unlabeled_data -- data that does not yet have a label
method -- method for uncertainty sampling (eg: least_confidence())
feature_method -- the method for extracting features from your data
number -- number of items to sample
limit -- sample from only this many predictions for faster sampling (-1 = no limit)
Returns the number most uncertain items according to least confidence sampling
"""
samples = []
if limit == -1 and len(unlabeled_data) > 10000 and self.verbose: # we're drawing from *a lot* of data this will take a while
print("Get predictions for a large amount of unlabeled data: this might take a while")
else:
# only apply the model to a limited number of items
shuffle(unlabeled_data)
unlabeled_data = unlabeled_data[:limit]
with torch.no_grad():
v=0
for item in unlabeled_data:
text = item[1]
feature_vector = feature_method(text)
hidden, logits, log_probs = model(feature_vector, return_all_layers=True)
prob_dist = torch.exp(log_probs) # the probability distribution of our prediction
score = method(prob_dist.data[0]) # get the specific type of uncertainty sampling
item[3] = method.__name__ # the type of uncertainty sampling used
item[4] = score
samples.append(item)
samples.sort(reverse=True, key=lambda x: x[4])
return samples[:number:]