# carbm_nnet.py
"""
Code provided by Kevin Swersky, Danny Tarlow, Ilya Sutskever, Ruslan Salakhutdinov, Rich Zemel and Ryan Adams.
Permission is granted for anyone to copy, use, modify, or distribute this
program and accompanying programs and documents for any purpose, provided
this copyright notice is retained and prominently displayed, along with
a note saying that the original programs are available from our
web page.
The programs and documents are distributed without any warranty, express or
implied. As the programs were written for research purposes only, they have
not been tested to the degree that would be advisable in any important
application. All use of these programs is entirely at the user's own risk.
This code implements the methods described in the paper:
Cardinality Restricted Boltzmann Machines. NIPS 2012.
"""
import nnet_utils as nu
import numpy as np
import display_filters as d
import load_datasets as ld
import pylab
import chain_alg_wrapper as caw
import time
import sys
import os
def carbm_nnet(W, bh, O, bo, data, targets, kmin, kmax, compute_grad=False, l2=0, r=None, ca=None):
"""
One layer feed-forward CaRBM neural network. This code may act unstable from time to time
due to the message passing being in exp space instead of log space. Please email
the authors if you desperately need this fixed.
Dead units may be a problem so pretraining is recommended. KL sparsity could also be used.
data - NxD matrix of features.
targets - NxC matrix of class labels (encoded as 1-hot row vectors).
W - input weights.
bh - input biases.
O - output weights.
bo - output biases.
kmin - Minimum number of active units.
kmax - Maximum number of active units.
compute_grad - Return the gradient if true.
l2 - L2 penalty strength.
r - Finite difference step length (if computing gradient).
ca - A ChainAlg object, this is just useful to pass in if you don't want one constructed
each time this function is called.
"""
    N,D = data.shape
    C = targets.shape[1]
    nh = W.shape[1]

    if (ca is None):
        ca = caw.ChainAlg(W.shape[1], kmin, kmax, N)

    node_potentials = np.dot(data,W)+bh
    exp_node_potentials = np.exp(node_potentials)
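
    #As described in the CaRBM paper, the chain algorithm computes the hidden-unit
    #marginals under the cardinality constraint kmin <= sum(h) <= kmax by message
    #passing along a chain over the cumulative count of active units.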
    #The marginals are the nonlinearities
    #The samples are useless for the neural net, but it's built into the inference code
    marginals, samples = ca.infer(exp_node_potentials, storage=0)

    #Make predictions
    total_output = np.dot(marginals,O)+bo

    #Compute the objective
    obj = np.sum(np.sum(total_output*targets,1) - nu.logsumexp(total_output,1))/N

    #L2 penalties
    obj -= 0.5*l2*np.sum(O**2)
    obj -= 0.5*l2*np.sum(W**2)
    if compute_grad:
        class_probs = np.exp(total_output - np.tile(nu.logsumexp(total_output,1)[:,np.newaxis],(1,C)))
        error_signal = targets - class_probs

        #Derivatives wrt output weights and biases
        dO = np.dot(marginals.T,error_signal)/N
        dbo = np.sum(error_signal,0)/N

        #Very accurate approximate marginals via one-sided finite-difference approximations
        #(using only one additional call to inference)
        #See "Implicit Differentiation by Perturbation", Justin Domke (NIPS 2010)
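        #Concretely, with v = dL/d(marginals), the loss gradient wrt the node
        #potentials is approximated by
        #  dL/d(node_potentials) ~= (mu(node_potentials + r*v) - mu(node_potentials))/r,
        #where mu(.) denotes the marginals returned by the chain inference. The
        #chain rule through node_potentials = data*W + bh then yields dW and dbh.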
        d_loss_d_marginals = np.dot(error_signal,O.T)

        #This step length is recommended in Domke's paper.
        if (r is None):
            eps = 2.220446049250313e-16
            r = np.sqrt(eps)*(1+np.abs(node_potentials).max())/(np.abs(d_loss_d_marginals).max())

        new_node_potentials = node_potentials + r*d_loss_d_marginals
        new_exp_node_potentials = np.exp(new_node_potentials)
        perturbed_marginals, junknew = ca.infer(new_exp_node_potentials, storage=1)
        dW = np.dot(data.T,(perturbed_marginals-marginals)/r)/N
        dbh = np.sum((perturbed_marginals-marginals)/r,0)/N

        #L2 penalty gradients
        dW -= l2*W
        dO -= l2*O

        #We return the negative obj and gradients since we will be doing minimization
        return -obj, {'gW':-dW, 'gbh':-dbh, 'gO':-dO, 'gbo':-dbo}

    #Without the gradient, just return the negative objective
    return -obj
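
# The following is a minimal sketch, not part of the original code, showing how the
# perturbation-based gradients above could be sanity-checked against a centered
# finite difference. The function name, the toy problem sizes, and the random data
# are assumptions made purely for illustration; it relies only on carbm_nnet and
# chain_alg_wrapper as imported above.
def _check_carbm_nnet_gradient(eps=1e-5):
    n, d, nh, c, kmin, kmax = 5, 8, 12, 3, 2, 4
    data = np.random.rand(n, d)
    targets = np.eye(c)[np.random.randint(c, size=n)]   #random 1-hot labels
    W = 0.1*np.random.randn(d, nh)
    bh = np.zeros(nh)
    O = 0.1*np.random.randn(nh, c)
    bo = np.zeros(c)
    ca = caw.ChainAlg(nh, kmin, kmax, n)
    obj, G = carbm_nnet(W, bh, O, bo, data, targets, kmin, kmax,
                        compute_grad=True, ca=ca)
    #Perturb one output weight and compare the analytic gradient with a centered
    #finite difference of the (negative) objective. The gradient wrt O is exact
    #given the marginals, so the two numbers should agree closely.
    Op, Om = O.copy(), O.copy()
    Op[0,0] += eps
    Om[0,0] -= eps
    objp, _ = carbm_nnet(W, bh, Op, bo, data, targets, kmin, kmax, compute_grad=True, ca=ca)
    objm, _ = carbm_nnet(W, bh, Om, bo, data, targets, kmin, kmax, compute_grad=True, ca=ca)
    print('analytic %g vs. numeric %g' % (G['gO'][0,0], (objp - objm)/(2*eps)))
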
def carbm_nnet_classify(W, bh, O, bo, data, kmin, kmax, ca=None):
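    """
    Predict class labels with a trained CaRBM neural network: run the chain
    inference to obtain the constrained hidden marginals, apply the output layer,
    and return the index of the highest-scoring class for each row of data.
    """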
    N,D = data.shape
    nh = W.shape[1]

    if (ca is None):
        ca = caw.ChainAlg(W.shape[1], kmin, kmax, N)

    node_potentials = np.dot(data,W)+bh
    exp_node_potentials = np.exp(node_potentials)

    #The marginals are the nonlinearities
    #The samples are useless for the neural net, but it's built into the inference code
    marginals, samples = ca.infer(exp_node_potentials, storage=0)

    #Make predictions
    total_output = np.dot(marginals,O)+bo

    return total_output.argmax(1)

def train_carbm_nnet(data, targets, **kwargs):
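    """
    Train a one-layer CaRBM neural network with minibatch stochastic gradient
    descent and momentum. Keyword arguments (with defaults): init_params (None),
    num_hid (100), eta (0.1), mo (0), kmin (10), kmax (10), num_iters (10),
    batch_size (100), l2 (1e-5). Returns the learned parameters W,bh,O,bo.
    """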
    #The default settings are NOT optimally tuned, so hyperparameter tuning is required.
    init_params = kwargs.get('init_params',None)
    num_hid = kwargs.get('num_hid',100)
    eta = kwargs.get('eta',0.1)
    mo = kwargs.get('mo',0)
    kmin = kwargs.get('kmin',10)
    kmax = kwargs.get('kmax',10)
    num_iters = kwargs.get('num_iters',10)
    batch_size = kwargs.get('batch_size',100)
    l2 = kwargs.get('l2',1e-5)

    if kmin is not None and kmax is not None:
        ca = caw.ChainAlg(num_hid,kmin,kmax,batch_size)
    else:
        ca = None

    #These are used for momentum; they are not gradients like those defined above.
    dW = 0
    dbh = 0
    dO = 0
    dbo = 0

    N,D = data.shape
    C = targets.shape[1]
    num_batches = np.ceil(np.double(N)/batch_size)

    if init_params is None:
        W = 0.1*np.random.randn(D,num_hid)
        bh = np.zeros(num_hid)
        O = 0.1*np.random.randn(num_hid,C)
        bo = np.zeros(C)
    else:
        W = init_params['W']
        bh = init_params['bh']
        O = init_params['O']
        bo = init_params['bo']
    for i in range(num_iters):
        obj = 0
        randIndices = np.random.permutation(N)
        iteration_obj = 0
        for batch in range(int(num_batches)):
            print 'Iteration %d batch %d of %d\r' % (i+1,batch+1,num_batches),
            batch_X = data[randIndices[np.mod(range(batch*batch_size,(batch+1)*batch_size),N)]]
            batch_y = targets[randIndices[np.mod(range(batch*batch_size,(batch+1)*batch_size),N)]]
            obj,G = carbm_nnet(W,bh,O,bo,
                               batch_X,batch_y,
                               kmin,kmax,
                               compute_grad=True,
                               l2=l2,
                               ca=ca)
            iteration_obj += obj/num_batches
            gW = G['gW']
            gbh = G['gbh']
            gO = G['gO']
            gbo = G['gbo']
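            #Momentum update: each velocity is a decaying sum of past gradients,
            #and the parameters then move along the velocity.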
            dW = mo*dW - eta*gW
            dbh = mo*dbh - eta*gbh
            dO = mo*dO - eta*gO
            dbo = mo*dbo - eta*gbo

            W = W + dW
            bh = bh + dbh
            O = O + dO
            bo = bo + dbo

        print 'Iteration %d Complete. Objective value: %s' % (i+1,iteration_obj)

    return W,bh,O,bo

if __name__ == '__main__':
    np.random.seed(1)
    X,y,Xtest,ytest = ld.load_mnist(range(10))[:4]

    kmin = 10
    kmax = 10
    W,bh,O,bo = train_carbm_nnet(X,y,mo=0.5,kmin=kmin,kmax=kmax)

    yhat = carbm_nnet_classify(W,bh,O,bo,Xtest,kmin,kmax)
    err = (yhat != ytest.argmax(1)).mean()
    print 'Classification error: %s' % err

    pylab.ioff()
    d.print_aligned(W)
    pylab.show(block=True)