-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoptimizers.py
180 lines (156 loc) · 7.35 KB
/
optimizers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
# optimizers.py
from abc import ABC, abstractmethod
from collections import Counter
import numpy as np
from typing import List
class Optimizer(ABC):
"""
Optimizer that aims to *maximize* a given function.
"""
def score(self, feats: List[int]):
"""
:param feats: List[int] feature vector indices (i.e., sparse representation of a feature vector)
:return: floating-point score
"""
i = 0
score = 0.0
while i < len(feats):
score += self.access(feats[i])
i += 1
return score
@abstractmethod
def apply_gradient_update(self, gradient: Counter, batch_size: int):
pass
@abstractmethod
def access(self, i: int):
pass
@abstractmethod
def get_final_weights(self):
pass
#
class SGDOptimizer(Optimizer):
"""
SGD optimizer implementation, designed to have the same interface as the Adagrad optimizers
Attributes:
weights: numpy array containing initial settings of the weights. Usually initialize to the 0 vector unless
you have a very good reason not to.
alpha: step size
"""
def __init__(self, init_weights: np.ndarray, alpha):
self.weights = init_weights
self.alpha = alpha
def apply_gradient_update(self, gradient: Counter, batch_size: int):
"""
Take a sparse representation of the gradient and make an update, normalizing by the batch size to keep
hyperparameters constant as the batch size is varied
:param gradient: Counter containing the gradient values (i.e., sparse representation of the gradient)
:param batch_size: how many examples the gradient was computed on
:return: nothing, modifies weights in-place
"""
for i in gradient.keys():
self.weights[i] = self.weights[i] + self.alpha * gradient[i]
def access(self, i: int):
"""
:param i: index of the weight to access
:return: value of that weight
"""
return self.weights[i]
def get_final_weights(self):
return self.weights
#
class L1RegularizedAdagradTrainer(Optimizer):
"""
Wraps a weight vector and applies the Adagrad update using second moments of features to make custom step sizes.
This version incorporates L1 regularization: while this regularization should be applied to squash the feature vector
on every gradient update, we instead evaluate the regularizer lazily only when the particular feature is touched
(either by gradient update or by access). approximate lets you turn this off for faster access, but regularization is
now applied somewhat inconsistently.
See section 5.1 of http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf for more details
"""
def __init__(self, init_weights, lamb=1e-8, eta=1.0, use_regularization=False, approximate=True):
"""
:param init_weights: a numpy array of the correct dimension, usually initialized to 0
:param lamb: float lambda constant for the regularizer. Values above 0.01 will often cause all features to be zeroed out.
:param eta: float step size. Values from 0.01 to 10 often work well.
:param use_regularization:
:param approximate: turns off gradient updates on access, only uses them when weights are written to.
So regularization is applied inconsistently, but it makes things faster.
"""
self.weights = init_weights
self.lamb = lamb
self.eta = eta
self.use_regularization = use_regularization
self.approximate = approximate
self.curr_iter = 0
self.last_iter_touched = [0 for i in range(0, self.weights.shape[0])]
self.diag_Gt = np.zeros_like(self.weights, dtype=float)
def apply_gradient_update(self, gradient: Counter, batch_size: int):
"""
Take a sparse representation of the gradient and make an update, normalizing by the batch size to keep
hyperparameters constant as the batch size is varied
:param gradient Counter containing the gradient values (i.e., sparse representation of the gradient)
:param batch_size: how many examples the gradient was computed on
:return: nothing, modifies weights in-place
"""
batch_size_multiplier = 1.0 / batch_size
self.curr_iter += 1
for i in gradient.keys():
xti = self.weights[i]
# N.B. We negate the gradient here because the Adagrad formulas are all for minimizing
# and we're trying to maximize, so think of it as minimizing the negative of the objective
# which has the opposite gradient
# See section 5.1 in http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf for more details
# eta is the step size, lambda is the regularization
gti = -gradient[i] * batch_size_multiplier
old_eta_over_Htii = self.eta / (1 + np.sqrt(self.diag_Gt[i]))
self.diag_Gt[i] += gti * gti
Htii = 1 + np.sqrt(self.diag_Gt[i])
eta_over_Htii = self.eta / Htii
new_xti = xti - eta_over_Htii * gti
# Apply the regularizer for every iteration since touched
iters_since_touched = self.curr_iter - self.last_iter_touched[i]
self.last_iter_touched[i] = self.curr_iter
self.weights[i] = np.sign(new_xti) * max(0, np.abs(new_xti) - self.lamb * eta_over_Htii - (iters_since_touched - 1) * self.lamb * old_eta_over_Htii)
def access(self, i: int):
"""
:param i: index of the weight to access
:return: value of that weight
"""
if not self.approximate and self.last_iter_touched[i] != self.curr_iter:
xti = self.weights[i]
Htii = 1 + np.sqrt(self.diag_Gt[i])
eta_over_Htii = self.eta / Htii
iters_since_touched = self.curr_iter - self.last_iter_touched[i]
self.last_iter_touched[i] = self.curr_iter
self.weights[i] = np.sign(xti) * max(0, np.abs(xti) - iters_since_touched * self.lamb * self.eta * eta_over_Htii);
return self.weights[i]
def get_final_weights(self):
"""
:return: a numpy array containing the final weight vector values -- manually calls access to force each weight to
have an up-to-date value.
"""
for i in range(0, self.weights.shape[0]):
self.access(i)
return self.weights
class UnregularizedAdagradTrainer(Optimizer):
"""
Applies the Adagrad update with no regularization. Will be substantially faster than the L1 regularized version
due to less computation required to update each feature. Same interface as the regularized version.
"""
def __init__(self, init_weights, eta=1.0):
self.weights = init_weights
self.eta = eta
self.diag_Gt = np.zeros_like(self.weights, dtype=float)
def apply_gradient_update(self, gradient: Counter, batch_size: int):
batch_size_multiplier = 1.0 / batch_size
for i in gradient.keys():
xti = self.weights[i]
gti = -gradient[i] * batch_size_multiplier
self.diag_Gt[i] += gti * gti
Htii = 1 + np.sqrt(self.diag_Gt[i])
eta_over_Htii = self.eta / Htii
self.weights[i] = xti - eta_over_Htii * gti
def access(self, i: int):
return self.weights[i]
def get_final_weights(self):
return self.weights