2 * mlp-mnist is an implementation of a multi-layer neural network.
4 * Copyright (c) 2008 Idiap Research Institute, http://www.idiap.ch/
5 * Written by Francois Fleuret <francois.fleuret@idiap.ch>
7 * This file is part of mlp-mnist.
9 * mlp-mnist is free software: you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License version 3 as
11 * published by the Free Software Foundation.
13 * mlp-mnist is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with mlp-mnist. If not, see <http://www.gnu.org/licenses/>.
// Target magnitude for output units: class labels are encoded as
// +/- 0.95 rather than +/- 1, keeping the targets strictly inside the
// range of the (presumably saturating) activation function sigma.
27 const scalar_t MultiLayerPerceptron::output_amplitude = 0.95;
29 // I won't comment the natural elegance of C++ in that kind of
30 // situation. IT SUCKS.
// Copy constructor: deep-copies the layer geometry, frozen-layer flags
// and weights of `mlp`, and allocates fresh activation buffers.
// NOTE(review): several original lines are not visible in this view
// (e.g. the zero-initialization of _nb_activations/_nb_weights and the
// closing braces of the loop) -- confirm against the full file.
32 MultiLayerPerceptron::MultiLayerPerceptron(const MultiLayerPerceptron &mlp) {
33 _nb_layers = mlp._nb_layers;
34 _layer_sizes = new int[_nb_layers];
// There is one frozen flag per weight layer, i.e. _nb_layers - 1 of them.
35 _frozen_layers = new bool[_nb_layers-1];
// NOTE(review): copies _nb_layers bools into an array of _nb_layers - 1
// elements -- a one-element out-of-bounds write; the size should be
// (_nb_layers - 1) * sizeof(bool). Also, the loop below overwrites every
// _frozen_layers[n] with false, so this copy appears to be dead anyway.
36 memcpy((void *) _frozen_layers, (void *) mlp._frozen_layers, _nb_layers * sizeof(bool));
37 _activations_index = new int[_nb_layers];
38 _weights_index = new int[_nb_layers];
43 for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = mlp._layer_sizes[n];
// Build the flat-index tables: _activations_index[n] is the offset of
// layer n in the flat activation arrays, _weights_index[n] the offset of
// the weights feeding layer n+1 (each unit has _layer_sizes[n] weights
// plus one bias, hence the (_layer_sizes[n] + 1) factor).
45 for(int n = 0; n < _nb_layers; n++) {
46 _activations_index[n] = _nb_activations;
47 _nb_activations += _layer_sizes[n];
48 if(n < _nb_layers - 1) {
49 _frozen_layers[n] = false;
50 _weights_index[n] = _nb_weights;
51 _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
55 _activations = new scalar_t[_nb_activations];
56 _pre_sigma_activations = new scalar_t[_nb_activations];
58 _weights = new scalar_t[_nb_weights];
// Deep-copy the weight vector itself.
59 memcpy((void *) _weights, (void *) mlp._weights, _nb_weights * sizeof(scalar_t));
// Builds an MLP from an explicit list of layer sizes. All layers start
// unfrozen; weights are allocated but NOT initialized here (callers use
// init_random_weights()).
// NOTE(review): the zero-initialization of _nb_activations/_nb_weights
// and the loop's closing braces are not visible in this view -- confirm
// against the full file.
62 MultiLayerPerceptron::MultiLayerPerceptron(int nb_layers, int *layer_sizes) {
63 _nb_layers = nb_layers;
64 _layer_sizes = new int[_nb_layers];
// One frozen flag per weight layer (between consecutive unit layers).
65 _frozen_layers = new bool[_nb_layers-1];
66 _activations_index = new int[_nb_layers];
67 _weights_index = new int[_nb_layers];
72 for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = layer_sizes[n];
// Compute flat offsets per layer; each unit of layer n+1 gets
// _layer_sizes[n] weights plus one bias term.
74 for(int n = 0; n < _nb_layers; n++) {
75 _activations_index[n] = _nb_activations;
76 _nb_activations += _layer_sizes[n];
77 if(n < _nb_layers - 1) {
78 _frozen_layers[n] = false;
79 _weights_index[n] = _nb_weights;
80 _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
84 _activations = new scalar_t[_nb_activations];
85 _pre_sigma_activations = new scalar_t[_nb_activations];
87 _weights = new scalar_t[_nb_weights];
// Deserializing constructor: reads the network geometry and weights in
// the text format written by save().
// NOTE(review): the statement reading _nb_layers from the stream (and
// the matching layer-count consistency check around original line 122)
// is not visible in this view; as shown, _nb_layers would be used
// uninitialized -- confirm against the full file.
90 MultiLayerPerceptron::MultiLayerPerceptron(istream &is) {
93 _layer_sizes = new int[_nb_layers];
94 _frozen_layers = new bool[_nb_layers - 1];
95 _activations_index = new int[_nb_layers];
96 _weights_index = new int[_nb_layers];
101 for(int n = 0; n < _nb_layers; n++) is >> _layer_sizes[n];
// Same flat-offset bookkeeping as the other constructors.
103 for(int n = 0; n < _nb_layers; n++) {
104 _activations_index[n] = _nb_activations;
105 _nb_activations += _layer_sizes[n];
106 if(n < _nb_layers - 1) {
107 _frozen_layers[n] = false;
108 _weights_index[n] = _nb_weights;
109 _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
113 _activations = new scalar_t[_nb_activations];
114 _pre_sigma_activations = new scalar_t[_nb_activations];
116 _weights = new scalar_t[_nb_weights];
117 for(int l = 0; l < _nb_layers - 1; l++) {
// NOTE(review): error path only prints to cerr; the object is left in a
// partially-loaded state rather than throwing -- confirm this is intended.
122 cerr << "Inconsistent layer number during loading!\n";
// Weights are stored source-unit-major on disk (j outer, i inner) but
// laid out destination-unit-major in memory, matching save() below.
126 for(int j = 0; j < _layer_sizes[l]; j++)
127 for(int i = 0; i < _layer_sizes[l+1]; i++)
128 is >> _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j];
// Releases every array allocated by the constructors.
// NOTE(review): `delete[] _weights;` (presumably original line 136) is
// not visible in this view -- confirm it exists in the full file, else
// the weight buffer leaks.
132 MultiLayerPerceptron::~MultiLayerPerceptron() {
134 delete[] _activations;
135 delete[] _pre_sigma_activations;
137 delete[] _layer_sizes;
138 delete[] _frozen_layers;
139 delete[] _weights_index;
140 delete[] _activations_index;
// Serializes the network as text: layer count, then layer sizes on one
// line, then the weights of each layer (source-unit-major, matching the
// reading order of the istream constructor).
143 void MultiLayerPerceptron::save(ostream &os) {
144 os << _nb_layers << "\n";
145 for(int n = 0; n < _nb_layers; n++) os << _layer_sizes[n] << (n < _nb_layers - 1 ? " " : "\n");
146 for(int l = 0; l < _nb_layers - 1; l++) {
148 for(int j = 0; j < _layer_sizes[l]; j++) {
// One output line per source unit j: its weight to every destination
// unit i of the next layer.
149 for(int i = 0; i < _layer_sizes[l+1]; i++)
150 os << _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j] << (i < _layer_sizes[l+1] - 1 ? " " : "\n");
// Debug helper: dumps each first-hidden-layer unit's incoming weights to
// /tmp/hidden_NNN.dat in a gnuplot-friendly "x y weight" format.
// Hard-codes a 28-pixel-wide input image (MNIST) via the j%28 / j/28
// arithmetic, and assumes layer-0 weights start at offset 0 in _weights.
// NOTE(review): the declaration of `buffer` (presumably a char array at
// original line 157) is not visible in this view.
155 void MultiLayerPerceptron::save_data() {
156 for(int i = 0; i < _layer_sizes[1]; i++) {
158 sprintf(buffer, "/tmp/hidden_%03d.dat", i);
159 ofstream stream(buffer);
160 for(int j = 0; j < _layer_sizes[0]; j++) {
// Blank line at the start of each image row (gnuplot grid format).
161 if(j%28 == 0) stream << "\n";
162 stream << j%28 << " " << j/28 << " " << _weights[i * (_layer_sizes[0] + 1) + j] << "\n";
// Initializes every weight (biases included) with an independent
// zero-mean Gaussian sample of standard deviation `stdd`.
167 void MultiLayerPerceptron::init_random_weights(scalar_t stdd) {
168 for(int w = 0; w < _nb_weights; w++) _weights[w] = normal_sample() * stdd;
// Backpropagation for a single sample: fills gradient_1s with the
// gradient of the per-sample squared error w.r.t. every weight. Stops
// descending once all remaining lower layers are frozen (nb_unfrozen
// bookkeeping), to avoid useless work.
// NOTE(review): the declarations/initializations of nb_unfrozen,
// `correct`, `s` and several closing braces are not visible in this
// view -- confirm against the full file. Also note the variable-length
// array below is a GCC extension, not standard C++.
171 void MultiLayerPerceptron::compute_gradient_1s(ImageSet *is, int p, scalar_t *gradient_1s) {
172 scalar_t dactivations[_nb_activations];
// Forward pass first, so _activations/_pre_sigma_activations are fresh.
174 compute_activations_1s(is, p);
177 for(int l = 0; l < _nb_layers - 1; l++) if(!_frozen_layers[l]) nb_unfrozen++;
// d(error)/d(output_i) = 2 * (output_i - target_i), with targets at
// +/- output_amplitude depending on the sample's label.
179 for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
181 if(is->label(p) == i) correct = output_amplitude;
182 else correct = - output_amplitude;
183 dactivations[_activations_index[_nb_layers - 1] + i] = 2 * (_activations[_activations_index[_nb_layers - 1] + i] - correct);
// Walk layers top-down, propagating dactivations and producing the
// weight gradient for each unfrozen weight layer.
186 for(int l = _nb_layers - 2; (l >= 0) && (nb_unfrozen > 0); l--) {
187 int ai0 = _activations_index[l], ai1 = _activations_index[l+1],
188 wi0 = _weights_index[l], ls0p1 = _layer_sizes[l] + 1;
// For each source unit j: accumulate the backpropagated error `s` and
// write the gradient of each weight (chain rule through dsigma).
191 for(j = 0; j < _layer_sizes[l]; j++) {
193 for(int i = 0; i < _layer_sizes[l+1]; i++) {
194 scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
195 s += alpha * _weights[wi0 + i * ls0p1 + j];
196 gradient_1s[wi0 + i * ls0p1 + j] = alpha * _activations[ai0 + j];
198 dactivations[ai0 + j] = s;
// After the loop j == _layer_sizes[l], the bias slot: its "activation"
// is implicitly 1, so the gradient is just alpha.
201 for(int i = 0; i < _layer_sizes[l+1]; i++) {
202 scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
203 gradient_1s[wi0 + i * ls0p1 + j] = alpha;
205 if(!_frozen_layers[l]) nb_unfrozen--;
// Full-batch gradient: sums the per-sample gradients over every picture
// of the set. `gradient` must hold _nb_weights entries.
// NOTE(review): the VLA below is a GCC extension, and _nb_weights can be
// large -- this lives on the stack; confirm that is acceptable here.
209 void MultiLayerPerceptron::compute_gradient(ImageSet *is, scalar_t *gradient) {
210 scalar_t gradient_1s[_nb_weights];
211 for(int w = 0; w < _nb_weights; w++) gradient[w] = 0.0;
212 for(int p = 0; p < is->nb_pics(); p++) {
213 compute_gradient_1s(is, p, gradient_1s);
214 for(int w = 0; w < _nb_weights; w++) gradient[w] += gradient_1s[w];
// Finite-difference gradient check: central difference with step eps on
// every weight in turn. O(#weights * cost(error)) -- debugging only.
// NOTE(review): the lines saving `ref = _weights[w]` and restoring
// `_weights[w] = ref` afterwards (original lines 222/227) are not
// visible in this view -- confirm the weight is restored in the full
// file, otherwise this perturbs the network.
218 void MultiLayerPerceptron::compute_numerical_gradient(ImageSet *is, scalar_t *gradient) {
219 const scalar_t eps = 1e-3;
220 scalar_t error_plus, error_minus, ref;
221 for(int w = 0; w < _nb_weights; w++) {
223 _weights[w] = ref + eps;
224 error_plus = error(is);
225 _weights[w] = ref - eps;
226 error_minus = error(is);
// Central difference: (E(w+eps) - E(w-eps)) / (2 eps).
228 gradient[w] = (error_plus - error_minus) / (2 * eps);
// Dumps a gradient vector, one component per line (debugging aid, pairs
// with compute_numerical_gradient for gradient checking).
232 void MultiLayerPerceptron::print_gradient(ostream &os, scalar_t *gradient) {
233 for(int w = 0; w < _nb_weights; w++) os << gradient[w] << "\n";
// Sets the weights to origin + lambda * gradient, layer by layer,
// skipping frozen layers (their weights are left untouched). Used both
// for line search (origin != _weights) and for in-place steps
// (origin == _weights, as in one_step_basic_gradient).
236 void MultiLayerPerceptron::move_on_line(scalar_t *origin, scalar_t *gradient, scalar_t lambda) {
237 for(int l = 0; l < _nb_layers-1; l++) if(!_frozen_layers[l]) {
238 for(int i = 0; i < (_layer_sizes[l] + 1) * _layer_sizes[l+1]; i++)
239 _weights[_weights_index[l] + i] =
240 origin[_weights_index[l] + i] + lambda * gradient[_weights_index[l] + i];
// One epoch of plain stochastic gradient descent: for each sample,
// compute its gradient and step the weights by -dt along it, printing a
// 50-character text progress bar on stdout as it goes.
// NOTE(review): the declaration of `j` and some surrounding lines are
// not visible in this view -- confirm against the full file.
244 void MultiLayerPerceptron::one_step_basic_gradient(ImageSet *is, scalar_t dt) {
245 scalar_t gradient_1s[_nb_weights];
246 for(int p = 0; p < is->nb_pics(); p++) {
// Progress bar: n X's done out of 50 ticks, '+' every 5th remaining tick.
248 int n = (p*50)/is->nb_pics();
250 for(j = 0; j < n; j++) cout << "X";
251 for(; j < 50; j++) cout << ((j%5 == 0) ? "+" : "-");
// In-place step: origin is the current weight vector itself.
255 compute_gradient_1s(is, p, gradient_1s);
256 move_on_line(_weights, gradient_1s, -dt);
// Erase the progress bar ('\r') and flush so it repaints next epoch.
258 cout << " \r"; cout.flush();
// One step of full-batch descent along search direction xi with a line
// search, then updates the conjugate-gradient state vectors g and h
// (the gamma formula below resembles a Polak-Ribiere update).
// NOTE(review): a large fraction of this function's original lines
// (line-search loop body, error evaluations, the accumulation of
// gg/gxi/xixi, the update of g[w] and xi[w]) is not visible in this
// view -- the comments here are inferred from what remains; confirm
// against the full file before relying on them.
261 void MultiLayerPerceptron::one_step_global_gradient(ImageSet *is, scalar_t *xi, scalar_t *g, scalar_t *h) {
// Snapshot the current weights: the line search moves relative to it.
262 scalar_t origin[_nb_weights];
263 for(int w = 0; w < _nb_weights; w++) origin[w] = _weights[w];
273 move_on_line(origin, xi, l);
// Refine the step length l by successively halved probes dl until the
// probe size drops below 1e-6.
277 scalar_t dl = - l / 4;
279 while(abs(dl) > 1e-6) {
280 move_on_line(origin, xi, l);
285 move_on_line(origin, xi, l);
// New gradient at the accepted point becomes the next xi.
291 compute_gradient(is, xi);
293 scalar_t gg = 0, gxi = 0, xixi = 0;
297 for(int w = 0; w < _nb_weights; w++) {
// Conjugate-direction mixing coefficient.
303 scalar_t gamma = (xixi + gxi)/gg;
305 // Set gamma to 0 for standard gradient descente
308 for(int w = 0; w < _nb_weights; w++) {
310 h[w] = g[w] + gamma * h[w];
// Training loop with early stopping: runs SGD epochs on the training
// set and stops after the validation error has increased (relative to
// the previous epoch) 10 times. Progress is printed each epoch.
// NOTE(review): the do/while opening, the increment of nb_increases and
// l, and the start of the cout chain are not visible in this view --
// confirm against the full file.
315 void MultiLayerPerceptron::train(ImageSet *training_set, ImageSet *validation_set) {
316 scalar_t prev_validation_error = 1.0, validation_error = 1.0, training_error;
317 int l = 0, nb_increases = 0;
// Leftover debugging scaffolding for dumping the net each epoch, kept
// commented out by the original author.
319 // #warning horrible debugging
321 // char buffer[1024];
322 // sprintf(buffer, "tmp_%04d.mlp", l);
323 // ofstream stream(buffer);
327 prev_validation_error = validation_error;
// Conjugate-gradient variant left disabled in favor of plain SGD with a
// fixed learning rate of 1e-2.
328 // one_step_global_gradient(training_set, xi, g, h);
329 one_step_basic_gradient(training_set, 1e-2);
330 training_error = error(training_set);
331 validation_error = error(validation_set);
333 << " TRAINING " << training_error << " (" << classification_error(training_set)*100 << "%)"
334 << " VALIDATION " << validation_error << " (" << classification_error(validation_set)*100 << "%)";
// Count validation-error increases (ignoring epoch 0, where there is no
// meaningful previous value).
335 if(l > 0 && validation_error >= prev_validation_error) {
337 cout << " [" << nb_increases << "]";
342 } while(nb_increases < 10);
// Forward pass for picture p: loads the pixels into layer 0 (normalized
// from [0,255] to [-1,1]), then propagates layer by layer, storing both
// the pre-nonlinearity sums (_pre_sigma_activations) and the activated
// values (_activations) for later use by backpropagation.
// NOTE(review): the lines initializing `s` with the bias term, applying
// sigma, and advancing the pa/b/b2 pointers between layers are not
// visible in this view -- confirm against the full file.
346 void MultiLayerPerceptron::compute_activations_1s(ImageSet *is, int p) {
347 ASSERT(_layer_sizes[0] == is->width() * is->height(), "The first layer has to have as many units as there are pixels!");
349 scalar_t *a = _activations;
350 scalar_t *w = _weights;
// Input layer: each pixel mapped linearly from [0,255] to [-1,1].
352 for(int y = 0; y < is->height(); y++) for(int x = 0; x < is->width(); x++)
353 *(a++) = 2.0 * (scalar_t(is->pixel(p, x, y)) / 255.0) - 1.0;
355 scalar_t *pa = _pre_sigma_activations + _activations_index[1];
356 scalar_t *b = _activations, *b2 = 0;
// Hidden/output layers: weighted sum over the previous layer's units
// via the running pointers w (weights) and b2 (source activations).
358 for(int l = 0; l < _nb_layers - 1; l++) {
359 for(int i = 0; i < _layer_sizes[l+1]; i++) {
362 for(int j = 0; j < _layer_sizes[l]; j++) s += *(w++) * *(b2++);
// Runs the forward pass on every picture and copies the output-layer
// activations into `responses`, laid out as
// responses[p * nb_outputs + i] for picture p and output unit i.
371 void MultiLayerPerceptron::test(ImageSet *is, scalar_t *responses) {
372 for(int p = 0; p < is->nb_pics(); p++) {
373 compute_activations_1s(is, p);
374 for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++)
375 responses[p * _layer_sizes[_nb_layers - 1] + i] = _activations[_activations_index[_nb_layers - 1] + i];
// Sum of squared errors over the whole set, with per-class targets of
// +/- output_amplitude (positive for the true label, negative for the
// others) -- the quantity whose gradient compute_gradient_1s computes.
// NOTE(review): the declaration/initialization of the `error`
// accumulator, of `correct`, and the return statement are not visible in
// this view -- confirm against the full file.
379 scalar_t MultiLayerPerceptron::error(ImageSet *is) {
382 for(int p = 0; p < is->nb_pics(); p++) {
383 compute_activations_1s(is, p);
384 for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
386 if(is->label(p) == i) correct = output_amplitude;
387 else correct = - output_amplitude;
388 error += sq(_activations[_activations_index[_nb_layers - 1] + i] - correct);
// Fraction of misclassified pictures: predicts the class whose output
// unit has the maximum activation (argmax) and compares with the label.
// NOTE(review): the declarations of nb_error, i_max and max (and the
// assignment i_max = i inside the if) are not visible in this view --
// confirm against the full file.
395 scalar_t MultiLayerPerceptron::classification_error(ImageSet *is) {
398 for(int p = 0; p < is->nb_pics(); p++) {
399 compute_activations_1s(is, p);
// Argmax over output units; i_max < 0 forces the first unit to be taken.
402 for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
403 if(i_max < 0 || _activations[_activations_index[_nb_layers - 1] + i] > max) {
405 max = _activations[_activations_index[_nb_layers - 1] + i];
408 if(is->label(p) != i_max) nb_error++;
// Error rate in [0,1].
411 return scalar_t(nb_error)/scalar_t(is->nb_pics());