train.c
#include <stdio.h>
#include <stdlib.h>
#include "structs.h"
#include "meth.h"
#include "util.h"
// compute weighted input in layer l
Matrix * weightedInput(Matrix * w_l, Matrix * b_l, Matrix * a_prev) {
    Matrix * weighted = dot(w_l, a_prev);
    Matrix * plusBias = add(weighted, b_l);
    freeMatrix(weighted);
    return plusBias;
}
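
/*
 * lastError() evaluates the output-layer error
 *     delta^L = hadamard(softMax(z^L) - y, sigP(z^L)),
 * the elementwise product of the output error (a^L - y) with the
 * activation derivative at the output layer's weighted input.
 */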
// compute error in last layer of network relative to weighted input and expected output
Matrix * lastError(Matrix * z, Matrix * y) {
    Matrix * a_L = softMax(z);
    Matrix * neg_y = scale(-1, y);
    Matrix * diff = add(a_L, neg_y);
    Matrix * sigPrimeZ = sigP(z);
    Matrix * err = hadamard(diff, sigPrimeZ);
    freeMatrix(a_L);
    freeMatrix(neg_y);
    freeMatrix(diff);
    freeMatrix(sigPrimeZ);
    return err;
}
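
/*
 * error() back-propagates the error one layer:
 *     delta^l = hadamard(transpose(w^(l+1)) * delta^(l+1), sigP(z^l))
 */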
// compute error in lth layer in terms of error in (l+1)th layer
Matrix * error(Matrix * z, Matrix * w_l_next, Matrix * d_next) {
    Matrix * weightT = transpose(w_l_next);
    Matrix * w_dot_delta = dot(weightT, d_next);
    Matrix * sigPrimeZ = sigP(z);
    Matrix * err = hadamard(w_dot_delta, sigPrimeZ);
    freeMatrix(weightT);
    freeMatrix(w_dot_delta);
    freeMatrix(sigPrimeZ);
    return err;
}
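
/*
 * weightGradient() gives the cost gradient with respect to layer l's weights,
 *     dC/dw^l = delta^l * transpose(a^(l-1)),
 * while delta^l itself is the gradient with respect to the biases (which is
 * why it is added directly to gradientNet->b[l] in train() below).
 */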
// calculate a weight gradient given delta and previous activation
Matrix * weightGradient(Matrix * d, Matrix * a_prev) {
    Matrix * a_trans = transpose(a_prev);
    Matrix * d_dot_a_trans = dot(d, a_trans);
    freeMatrix(a_trans);
    return d_dot_a_trans;
}
// train a given network on a given dataset using batch gradient descent
void train(NeuralNetwork * n, DataSet * training, int batchSize, float learningRate, int epochs) {
    // init a net to accumulate gradients for all weights and biases (identical structure)
    NeuralNetwork * gradientNet = initNN(n->numberOfLayers, paramCopy(n->params, n->numberOfLayers));
    int L = n->numberOfLayers - 1; // number of weight matrices / bias vectors
    // allocate arrays of z and delta vectors to hold the weighted inputs and errors, respectively
    Matrix ** z = malloc(L * sizeof(Matrix *));
    Matrix ** delta = malloc(L * sizeof(Matrix *));
    int b, p, l, e; // loop variables
    Matrix *prevAct, *oldBiases, *oldWeights, *gradient; // temporary matrix pointers
    // in case initNN() does not return zeroed parameters, start the accumulators at zero
    for (l = 0; l < L; l++) {
        zero(gradientNet->w[l]);
        zero(gradientNet->b[l]);
    }
    // for every epoch
    for (e = 0; e < epochs; e++) {
        printf("Epoch %d\n", e);
        shuffle(training); // shuffle the training data
        // for every batch
        for (b = 0; b < training->size; b += batchSize) {
            // for every pair within the batch (stop early if the final batch is partial)
            for (p = b; p < b + batchSize && p < training->size; p++) {
                // for every layer, propagate forward
                for (l = 0; l < L; l++) {
                    // compute the weighted input to layer l from the previous layer's activation
                    prevAct = l == 0 ? training->inputs[p] : sig(z[l - 1]);
                    z[l] = weightedInput(n->w[l], n->b[l], prevAct);
                    if (l > 0) freeMatrix(prevAct);
                }
                // moving backward through network (they call it BACKpropagate for a reason)
                for (l = L - 1; l >= 0; l--) {
                    if (l == L - 1) {
                        // compute error at last layer
                        delta[l] = lastError(z[l], training->outputs[p]);
                    } else {
                        // recursively calculate error at lth layer
                        delta[l] = error(z[l], n->w[l + 1], delta[l + 1]);
                        // next layer error no longer needed
                        freeMatrix(delta[l + 1]);
                    }
                    // add to sum of bias gradients
                    oldBiases = gradientNet->b[l];
                    gradientNet->b[l] = add(gradientNet->b[l], delta[l]);
                    freeMatrix(oldBiases);
                    // add to sum of weight gradients
                    prevAct = l == 0 ? training->inputs[p] : sig(z[l - 1]);
                    oldWeights = gradientNet->w[l];
                    gradient = weightGradient(delta[l], prevAct);
                    gradientNet->w[l] = add(gradientNet->w[l], gradient);
                    freeMatrix(gradient);
                    freeMatrix(oldWeights);
                    if (l > 0) freeMatrix(prevAct);
                    freeMatrix(z[l]); // this layer z no longer needed
                }
                freeMatrix(delta[0]); // free first layer error
            }
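            // The update below is the averaged mini-batch gradient-descent step
            //     w^l <- w^l - (learningRate / batchSize) * sum over the batch of dC/dw^l
            //     b^l <- b^l - (learningRate / batchSize) * sum over the batch of delta^l
            // using the sums accumulated in gradientNet above.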
            // make averaged changes to weights / biases
            for (l = 0; l < L; l++) {
                gradient = scale(-learningRate / batchSize, gradientNet->w[l]);
                oldWeights = n->w[l];
                n->w[l] = add(n->w[l], gradient);
                freeMatrix(oldWeights);
                freeMatrix(gradient);
                gradient = scale(-learningRate / batchSize, gradientNet->b[l]);
                oldBiases = n->b[l];
                n->b[l] = add(n->b[l], gradient);
                freeMatrix(oldBiases);
                freeMatrix(gradient);
                // reset gradient net values
                zero(gradientNet->w[l]);
                zero(gradientNet->b[l]);
            }
        }
    }
    free(z);
    free(delta);
    freeNetwork(gradientNet);
}
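
/*
 * Hypothetical usage sketch (not part of this file): the layer-size array, the
 * exact meaning of initNN()'s second argument, and the DataSet construction are
 * assumptions about the rest of the project, shown only to illustrate train():
 *
 *     NeuralNetwork * net = initNN(3, layerSizes); // e.g. a 784-30-10 network
 *     DataSet * data = ...;                        // filled in by the project's loader
 *     train(net, data, 10, 3.0f, 30);              // batch size 10, learning rate 3.0, 30 epochs
 *     freeNetwork(net);
 */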