2 * mlp-mnist is an implementation of a multi-layer neural network.
4 * Copyright (c) 2006 École Polytechnique Fédérale de Lausanne,
7 * Written by Francois Fleuret <francois@fleuret.org>
9 * This file is part of mlp-mnist.
11 * mlp-mnist is free software: you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License version 3 as
13 * published by the Free Software Foundation.
15 * mlp-mnist is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with mlp-mnist. If not, see <http://www.gnu.org/licenses/>.
// Target magnitude for the output units: training drives the output for the
// correct class toward +output_amplitude and all others toward
// -output_amplitude (see compute_gradient_1s and error below). Presumably
// kept below 1.0 so a saturating sigma can actually reach the target.
29 const scalar_t MultiLayerPerceptron::output_amplitude = 0.95;
31 // I won't comment the natural elegance of C++ in that kind of
32 // situation. IT SUCKS.
// Copy constructor: deep-copies the layer geometry and the weights of `mlp`,
// and allocates fresh (uninitialized) activation buffers -- activation values
// themselves are not copied.
34 MultiLayerPerceptron::MultiLayerPerceptron(const MultiLayerPerceptron &mlp) {
35 _nb_layers = mlp._nb_layers;
36 _layer_sizes = new int[_nb_layers];
// There are _nb_layers-1 weight layers, hence _nb_layers-1 frozen flags.
37 _frozen_layers = new bool[_nb_layers-1];
// NOTE(review): this copies _nb_layers bools into a buffer that only holds
// _nb_layers-1 -- a one-element overrun; the size should presumably be
// (_nb_layers - 1) * sizeof(bool). Also the loop below resets every flag to
// false, clobbering whatever is copied here -- confirm which is intended.
38 memcpy((void *) _frozen_layers, (void *) mlp._frozen_layers, _nb_layers * sizeof(bool));
// _activations_index[n] / _weights_index[n] are the starting offsets of
// layer n in the flat _activations / _weights arrays.
39 _activations_index = new int[_nb_layers];
40 _weights_index = new int[_nb_layers];
45 for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = mlp._layer_sizes[n];
// Accumulate the flat-array offsets. _nb_activations and _nb_weights are
// presumably zeroed on a line elided from this view -- confirm.
47 for(int n = 0; n < _nb_layers; n++) {
48 _activations_index[n] = _nb_activations;
49 _nb_activations += _layer_sizes[n];
50 if(n < _nb_layers - 1) {
51 _frozen_layers[n] = false;
52 _weights_index[n] = _nb_weights;
// +1 accounts for the per-unit bias weight of layer n+1.
53 _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
57 _activations = new scalar_t[_nb_activations];
58 _pre_sigma_activations = new scalar_t[_nb_activations];
60 _weights = new scalar_t[_nb_weights];
// Deep copy of the weights (geometry is identical, so sizes match).
61 memcpy((void *) _weights, (void *) mlp._weights, _nb_weights * sizeof(scalar_t));
// Builds a fresh MLP with nb_layers layers of the given sizes. Weights are
// allocated but NOT initialized here -- call init_random_weights afterwards.
64 MultiLayerPerceptron::MultiLayerPerceptron(int nb_layers, int *layer_sizes) {
65 _nb_layers = nb_layers;
66 _layer_sizes = new int[_nb_layers];
// One frozen flag per weight layer (there are _nb_layers-1 of them).
67 _frozen_layers = new bool[_nb_layers-1];
68 _activations_index = new int[_nb_layers];
69 _weights_index = new int[_nb_layers];
74 for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = layer_sizes[n];
// Accumulate flat-array offsets per layer. _nb_activations and _nb_weights
// are presumably zeroed on a line elided from this view -- confirm.
76 for(int n = 0; n < _nb_layers; n++) {
77 _activations_index[n] = _nb_activations;
78 _nb_activations += _layer_sizes[n];
79 if(n < _nb_layers - 1) {
80 _frozen_layers[n] = false;
81 _weights_index[n] = _nb_weights;
// +1 is the bias weight feeding each unit of layer n+1.
82 _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
86 _activations = new scalar_t[_nb_activations];
87 _pre_sigma_activations = new scalar_t[_nb_activations];
89 _weights = new scalar_t[_nb_weights];
// Deserializing constructor: reads the geometry and the weights in the
// format written by save(). _nb_layers is presumably read from `is` on a
// line elided from this view -- confirm before the allocations below.
92 MultiLayerPerceptron::MultiLayerPerceptron(istream &is) {
95 _layer_sizes = new int[_nb_layers];
96 _frozen_layers = new bool[_nb_layers - 1];
97 _activations_index = new int[_nb_layers];
98 _weights_index = new int[_nb_layers];
103 for(int n = 0; n < _nb_layers; n++) is >> _layer_sizes[n];
// Same offset bookkeeping as the other constructors: _activations_index /
// _weights_index hold each layer's start in the flat arrays.
105 for(int n = 0; n < _nb_layers; n++) {
106 _activations_index[n] = _nb_activations;
107 _nb_activations += _layer_sizes[n];
108 if(n < _nb_layers - 1) {
109 _frozen_layers[n] = false;
110 _weights_index[n] = _nb_weights;
111 _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
115 _activations = new scalar_t[_nb_activations];
116 _pre_sigma_activations = new scalar_t[_nb_activations];
118 _weights = new scalar_t[_nb_weights];
// Per weight layer: a consistency check (the comparison itself is on an
// elided line) then the weight matrix, stored j-major to match save().
120 for(int l = 0; l < _nb_layers - 1; l++) {
124 cerr << "Inconsistent layer number during loading!\n";
// Weight of input j -> output i; row stride is _layer_sizes[l]+1 (bias slot).
128 for(int j = 0; j < _layer_sizes[l]; j++)
129 for(int i = 0; i < _layer_sizes[l+1]; i++)
130 is >> _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j];
// Destructor: releases all buffers allocated by the constructors.
// NOTE(review): no delete[] of _weights is visible here (a line appears to be
// elided between the activation and _layer_sizes deletes) -- confirm the
// weight buffer is freed too, otherwise it leaks.
134 MultiLayerPerceptron::~MultiLayerPerceptron() {
136 delete[] _activations;
137 delete[] _pre_sigma_activations;
139 delete[] _layer_sizes;
140 delete[] _frozen_layers;
141 delete[] _weights_index;
142 delete[] _activations_index;
// Serializes the network in the text format expected by the istream
// constructor: layer count, then the layer sizes on one line, then each
// weight layer's matrix (same j-major traversal as the loading loop).
145 void MultiLayerPerceptron::save(ostream &os) {
146 os << _nb_layers << "\n";
147 for(int n = 0; n < _nb_layers; n++) os << _layer_sizes[n] << (n < _nb_layers - 1 ? " " : "\n");
148 for(int l = 0; l < _nb_layers - 1; l++) {
150 for(int j = 0; j < _layer_sizes[l]; j++) {
// Row stride _layer_sizes[l]+1 leaves room for the bias weight per unit.
151 for(int i = 0; i < _layer_sizes[l+1]; i++)
152 os << _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j] << (i < _layer_sizes[l+1] - 1 ? " " : "\n");
// Debug dump: writes the incoming weights of each first-hidden-layer unit to
// /tmp/hidden_NNN.dat in gnuplot-style "x y w" triplets. The j%28 / j/28
// arithmetic hard-codes 28-pixel-wide input images (MNIST) -- TODO confirm,
// and note this only makes sense for layer 0 (offset _weights_index[0] == 0
// is implicitly assumed by the raw indexing below).
157 void MultiLayerPerceptron::save_data() {
158 for(int i = 0; i < _layer_sizes[1]; i++) {
// `buffer` is declared on a line elided from this view; sprintf into a
// fixed buffer is safe here only because the format is bounded.
160 sprintf(buffer, "/tmp/hidden_%03d.dat", i);
161 ofstream stream(buffer);
162 for(int j = 0; j < _layer_sizes[0]; j++) {
// Blank line between pixel rows so gnuplot splits scan lines.
163 if(j%28 == 0) stream << "\n";
164 stream << j%28 << " " << j/28 << " " << _weights[i * (_layer_sizes[0] + 1) + j] << "\n";
// Initializes every weight with an independent zero-mean Gaussian sample of
// standard deviation `stdd` (normal_sample is an external helper).
169 void MultiLayerPerceptron::init_random_weights(scalar_t stdd) {
170 for(int w = 0; w < _nb_weights; w++) _weights[w] = normal_sample() * stdd;
// Backpropagation for a single picture p: fills gradient_1s with the
// gradient of the squared error (targets +/- output_amplitude, see error())
// with respect to every weight. gradient_1s must hold _nb_weights entries.
173 void MultiLayerPerceptron::compute_gradient_1s(ImageSet *is, int p, scalar_t *gradient_1s) {
// dactivations[k] = dE/d(activation k). NOTE(review): this is a VLA, a GCC
// extension, not standard C++ -- fine for this codebase, but not portable.
174 scalar_t dactivations[_nb_activations];
// Forward pass first so _activations/_pre_sigma_activations are current.
176 compute_activations_1s(is, p);
// Count unfrozen weight layers so the backward sweep can stop once all of
// them have been processed (no need to backprop below the lowest one).
179 for(int l = 0; l < _nb_layers - 1; l++) if(!_frozen_layers[l]) nb_unfrozen++;
// Seed the output layer: dE/da = 2*(a - target) for the squared error.
181 for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
183 if(is->label(p) == i) correct = output_amplitude;
184 else correct = - output_amplitude;
185 dactivations[_activations_index[_nb_layers - 1] + i] = 2 * (_activations[_activations_index[_nb_layers - 1] + i] - correct);
// Backward sweep over weight layers, top to bottom.
188 for(int l = _nb_layers - 2; (l >= 0) && (nb_unfrozen > 0); l--) {
189 int ai0 = _activations_index[l], ai1 = _activations_index[l+1],
190 wi0 = _weights_index[l], ls0p1 = _layer_sizes[l] + 1;
// For each source unit j: accumulate dE/da_j and the weight gradients.
193 for(j = 0; j < _layer_sizes[l]; j++) {
195 for(int i = 0; i < _layer_sizes[l+1]; i++) {
// alpha = dE/d(pre-sigma input of unit i) = dE/da_i * sigma'(z_i).
196 scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
197 s += alpha * _weights[wi0 + i * ls0p1 + j];
198 gradient_1s[wi0 + i * ls0p1 + j] = alpha * _activations[ai0 + j];
200 dactivations[ai0 + j] = s;
// After the loop j == _layer_sizes[l]: the bias slot. Its "activation" is
// the constant 1, so the gradient is just alpha.
203 for(int i = 0; i < _layer_sizes[l+1]; i++) {
204 scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
205 gradient_1s[wi0 + i * ls0p1 + j] = alpha;
207 if(!_frozen_layers[l]) nb_unfrozen--;
// Full-batch gradient: sums the per-picture gradients over the whole set.
// `gradient` must hold _nb_weights entries.
211 void MultiLayerPerceptron::compute_gradient(ImageSet *is, scalar_t *gradient) {
// Scratch buffer for one picture's gradient (VLA -- GCC extension).
212 scalar_t gradient_1s[_nb_weights];
213 for(int w = 0; w < _nb_weights; w++) gradient[w] = 0.0;
214 for(int p = 0; p < is->nb_pics(); p++) {
215 compute_gradient_1s(is, p, gradient_1s);
216 for(int w = 0; w < _nb_weights; w++) gradient[w] += gradient_1s[w];
// Finite-difference gradient check: central difference with step eps for
// every weight. O(_nb_weights * cost(error)) -- debugging only; compare the
// result against compute_gradient.
220 void MultiLayerPerceptron::compute_numerical_gradient(ImageSet *is, scalar_t *gradient) {
221 const scalar_t eps = 1e-3;
222 scalar_t error_plus, error_minus, ref;
223 for(int w = 0; w < _nb_weights; w++) {
// `ref` is presumably loaded from _weights[w] on a line elided from this
// view, and restored after the difference -- confirm both.
225 _weights[w] = ref + eps;
226 error_plus = error(is);
227 _weights[w] = ref - eps;
228 error_minus = error(is);
230 gradient[w] = (error_plus - error_minus) / (2 * eps);
// Prints one gradient component per line (debug helper, pairs with
// compute_numerical_gradient for eyeball comparison).
234 void MultiLayerPerceptron::print_gradient(ostream &os, scalar_t *gradient) {
235 for(int w = 0; w < _nb_weights; w++) os << gradient[w] << "\n";
// Sets _weights = origin + lambda * gradient, but only for the layers whose
// frozen flag is unset -- frozen layers keep their current weights (note:
// they are NOT reset to origin either).
238 void MultiLayerPerceptron::move_on_line(scalar_t *origin, scalar_t *gradient, scalar_t lambda) {
239 for(int l = 0; l < _nb_layers-1; l++) if(!_frozen_layers[l]) {
240 for(int i = 0; i < (_layer_sizes[l] + 1) * _layer_sizes[l+1]; i++)
241 _weights[_weights_index[l] + i] =
242 origin[_weights_index[l] + i] + lambda * gradient[_weights_index[l] + i];
// One epoch of stochastic (per-picture) gradient descent with fixed step dt,
// drawing a 50-column text progress bar on stdout as it goes.
246 void MultiLayerPerceptron::one_step_basic_gradient(ImageSet *is, scalar_t dt) {
// Per-picture gradient scratch buffer (VLA -- GCC extension).
247 scalar_t gradient_1s[_nb_weights];
248 for(int p = 0; p < is->nb_pics(); p++) {
// Progress bar: n columns done out of 50. `j` is presumably declared on a
// line elided from this view.
250 int n = (p*50)/is->nb_pics();
252 for(j = 0; j < n; j++) cout << "X";
253 for(; j < 50; j++) cout << ((j%5 == 0) ? "+" : "-");
// Descend: step of -dt along this picture's gradient, from the current
// weights (frozen layers are skipped by move_on_line).
257 compute_gradient_1s(is, p, gradient_1s);
258 move_on_line(_weights, gradient_1s, -dt);
// Erase the progress bar.
260 cout << " \r"; cout.flush();
// One conjugate-gradient-style step on the full-batch error: line search
// along direction xi from the current weights, then update the direction
// with a Polak-Ribiere-like gamma. g holds the previous gradient, h the
// previous search direction. Much of this routine's interior (the bracketing
// of the line search, the dot-product accumulation) is elided from this
// view, so the notes below are partial.
263 void MultiLayerPerceptron::one_step_global_gradient(ImageSet *is, scalar_t *xi, scalar_t *g, scalar_t *h) {
// Remember the starting point of the line search (VLA -- GCC extension).
264 scalar_t origin[_nb_weights];
265 for(int w = 0; w < _nb_weights; w++) origin[w] = _weights[w];
275 move_on_line(origin, xi, l);
// Refine the step by successive halving/quartering of dl.
279 scalar_t dl = - l / 4;
// NOTE(review): if scalar_t is floating, unqualified abs() may bind to the
// integer overload depending on headers -- prefer fabs/std::abs. Confirm.
281 while(abs(dl) > 1e-6) {
282 move_on_line(origin, xi, l);
287 move_on_line(origin, xi, l);
// New gradient at the line-search minimum.
293 compute_gradient(is, xi);
// gg = g.g, gxi = g.xi, xixi = xi.xi (accumulated in the elided loop body).
295 scalar_t gg = 0, gxi = 0, xixi = 0;
299 for(int w = 0; w < _nb_weights; w++) {
305 scalar_t gamma = (xixi + gxi)/gg;
307 // Set gamma to 0 for standard gradient descent
// Update conjugate direction: h <- g + gamma * h.
310 for(int w = 0; w < _nb_weights; w++) {
312 h[w] = g[w] + gamma * h[w];
// Training loop with early stopping: repeats epochs of basic gradient
// descent (fixed step 1e-2) and stops after the validation error has
// increased 10 times (non-consecutively -- nb_increases never resets).
317 void MultiLayerPerceptron::train(ImageSet *training_set, ImageSet *validation_set) {
318 scalar_t prev_validation_error = 1.0, validation_error = 1.0, training_error;
319 int l = 0, nb_increases = 0;
// Leftover debugging scaffolding, kept disabled.
321 // #warning horrible debugging
323 // char buffer[1024];
324 // sprintf(buffer, "tmp_%04d.mlp", l);
325 // ofstream stream(buffer);
329 prev_validation_error = validation_error;
// Alternative optimizer, disabled in favor of plain SGD below.
330 // one_step_global_gradient(training_set, xi, g, h);
331 one_step_basic_gradient(training_set, 1e-2);
332 training_error = error(training_set);
333 validation_error = error(validation_set);
// Report squared error and classification error (%) on both sets.
335 << " TRAINING " << training_error << " (" << classification_error(training_set)*100 << "%)"
336 << " VALIDATION " << validation_error << " (" << classification_error(validation_set)*100 << "%)";
// l>0 skips the first epoch, whose prev_validation_error is the dummy 1.0.
337 if(l > 0 && validation_error >= prev_validation_error) {
339 cout << " [" << nb_increases << "]";
344 } while(nb_increases < 10);
// Forward pass for picture p: loads the pixels into the input layer (scaled
// from [0,255] to [-1,1]) then propagates through each weight layer,
// recording pre-sigma sums in _pre_sigma_activations for later backprop.
348 void MultiLayerPerceptron::compute_activations_1s(ImageSet *is, int p) {
349 ASSERT(_layer_sizes[0] == is->width() * is->height(), "The first layer has to have as many units as there are pixels!");
// Walking pointers into the flat activation and weight arrays.
351 scalar_t *a = _activations;
352 scalar_t *w = _weights;
// Input layer: one unit per pixel, row-major, rescaled to [-1, 1].
354 for(int y = 0; y < is->height(); y++) for(int x = 0; x < is->width(); x++)
355 *(a++) = 2.0 * (scalar_t(is->pixel(p, x, y)) / 255.0) - 1.0;
// b points at the current layer's activations, b2 re-scans them per output
// unit; pa collects pre-sigma sums starting at layer 1.
357 scalar_t *pa = _pre_sigma_activations + _activations_index[1];
358 scalar_t *b = _activations, *b2 = 0;
359 for(int l = 0; l < _nb_layers - 1; l++) {
// For each destination unit: dot product of the source layer with its
// weight row (bias handling and the sigma application are on lines elided
// from this view -- confirm there).
361 for(int i = 0; i < _layer_sizes[l+1]; i++) {
364 for(int j = 0; j < _layer_sizes[l]; j++) s += *(w++) * *(b2++);
// Runs the forward pass on every picture and copies the output-layer
// activations into `responses`, laid out picture-major: responses[p * K + i]
// where K is the number of output units.
373 void MultiLayerPerceptron::test(ImageSet *is, scalar_t *responses) {
374 for(int p = 0; p < is->nb_pics(); p++) {
375 compute_activations_1s(is, p);
376 for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++)
377 responses[p * _layer_sizes[_nb_layers - 1] + i] = _activations[_activations_index[_nb_layers - 1] + i];
// Total squared error over the set: for each picture, the target is
// +output_amplitude on the labeled unit and -output_amplitude elsewhere.
// The `error` accumulator's declaration/zeroing and the normalization of the
// returned value (if any) are on lines elided from this view -- confirm.
381 scalar_t MultiLayerPerceptron::error(ImageSet *is) {
384 for(int p = 0; p < is->nb_pics(); p++) {
385 compute_activations_1s(is, p);
386 for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
388 if(is->label(p) == i) correct = output_amplitude;
389 else correct = - output_amplitude;
390 error += sq(_activations[_activations_index[_nb_layers - 1] + i] - correct);
// Fraction of misclassified pictures: predicts the arg-max output unit and
// compares it with the label. i_max/max are (re)initialized per picture on
// lines elided from this view (i_max presumably to -1) -- confirm.
397 scalar_t MultiLayerPerceptron::classification_error(ImageSet *is) {
400 for(int p = 0; p < is->nb_pics(); p++) {
401 compute_activations_1s(is, p);
// Arg-max over the output layer; i_max < 0 forces the first unit to win.
404 for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
405 if(i_max < 0 || _activations[_activations_index[_nb_layers - 1] + i] > max) {
407 max = _activations[_activations_index[_nb_layers - 1] + i];
410 if(is->label(p) != i_max) nb_error++;
// Error rate in [0, 1].
413 return scalar_t(nb_error)/scalar_t(is->nb_pics());