--- /dev/null
+/*
+ * mlp-mnist is an implementation of a multi-layer neural network.
+ *
+ * Copyright (c) 2008 Idiap Research Institute, http://www.idiap.ch/
+ * Written by Francois Fleuret <francois.fleuret@idiap.ch>
+ *
+ * This file is part of mlp-mnist.
+ *
+ * mlp-mnist is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * mlp-mnist is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with mlp-mnist. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "neural.h"
+
+const scalar_t MultiLayerPerceptron::output_amplitude = 0.95;
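+
+// The targets are +/- 0.95 rather than +/- 1.0, presumably so that
+// they lie strictly inside the range of sigma and can be reached
+// without driving the weights to infinity.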
+
+// I won't comment the natural elegance of C++ in that kind of
+// situation. IT SUCKS.
+
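+// Memory layout: the weights of layer l occupy a block of
+// (_layer_sizes[l] + 1) * _layer_sizes[l+1] scalars starting at
+// _weights_index[l], organized as _layer_sizes[l+1] rows of
+// (_layer_sizes[l] + 1) coefficients; the last coefficient of each
+// row is the bias of the corresponding unit of layer l+1.
+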
+MultiLayerPerceptron::MultiLayerPerceptron(const MultiLayerPerceptron &mlp) {
+ _nb_layers = mlp._nb_layers;
+ _layer_sizes = new int[_nb_layers];
+ _frozen_layers = new bool[_nb_layers-1];
+  memcpy((void *) _frozen_layers, (void *) mlp._frozen_layers, (_nb_layers - 1) * sizeof(bool));
+ _activations_index = new int[_nb_layers];
+ _weights_index = new int[_nb_layers];
+
+ _nb_activations = 0;
+ _nb_weights = 0;
+
+ for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = mlp._layer_sizes[n];
+
+ for(int n = 0; n < _nb_layers; n++) {
+ _activations_index[n] = _nb_activations;
+ _nb_activations += _layer_sizes[n];
+ if(n < _nb_layers - 1) {
+ _weights_index[n] = _nb_weights;
+ _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
+ }
+ }
+
+ _activations = new scalar_t[_nb_activations];
+ _pre_sigma_activations = new scalar_t[_nb_activations];
+
+ _weights = new scalar_t[_nb_weights];
+ memcpy((void *) _weights, (void *) mlp._weights, _nb_weights * sizeof(scalar_t));
+}
+
+MultiLayerPerceptron::MultiLayerPerceptron(int nb_layers, int *layer_sizes) {
+ _nb_layers = nb_layers;
+ _layer_sizes = new int[_nb_layers];
+ _frozen_layers = new bool[_nb_layers-1];
+ _activations_index = new int[_nb_layers];
+ _weights_index = new int[_nb_layers];
+
+ _nb_activations = 0;
+ _nb_weights = 0;
+
+ for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = layer_sizes[n];
+
+ for(int n = 0; n < _nb_layers; n++) {
+ _activations_index[n] = _nb_activations;
+ _nb_activations += _layer_sizes[n];
+ if(n < _nb_layers - 1) {
+ _frozen_layers[n] = false;
+ _weights_index[n] = _nb_weights;
+ _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
+ }
+ }
+
+ _activations = new scalar_t[_nb_activations];
+ _pre_sigma_activations = new scalar_t[_nb_activations];
+
+ _weights = new scalar_t[_nb_weights];
+}
+
+MultiLayerPerceptron::MultiLayerPerceptron(istream &is) {
+ is >> _nb_layers;
+
+ _layer_sizes = new int[_nb_layers];
+ _frozen_layers = new bool[_nb_layers - 1];
+ _activations_index = new int[_nb_layers];
+ _weights_index = new int[_nb_layers];
+
+ _nb_activations = 0;
+ _nb_weights = 0;
+
+ for(int n = 0; n < _nb_layers; n++) is >> _layer_sizes[n];
+
+ for(int n = 0; n < _nb_layers; n++) {
+ _activations_index[n] = _nb_activations;
+ _nb_activations += _layer_sizes[n];
+ if(n < _nb_layers - 1) {
+ _frozen_layers[n] = false;
+ _weights_index[n] = _nb_weights;
+ _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
+ }
+ }
+
+ _activations = new scalar_t[_nb_activations];
+ _pre_sigma_activations = new scalar_t[_nb_activations];
+
+ _weights = new scalar_t[_nb_weights];
+
+ for(int l = 0; l < _nb_layers - 1; l++) {
+ int ll;
+ is >> ll;
+ if(l != ll) {
+ cerr << "Inconsistent layer number during loading!\n";
+ cerr.flush();
+ exit(1);
+ }
+    // j goes up to _layer_sizes[l] included, so that the bias
+    // coefficients are restored along with the weights proper
+    for(int j = 0; j < _layer_sizes[l] + 1; j++)
+ for(int i = 0; i < _layer_sizes[l+1]; i++)
+ is >> _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j];
+ }
+}
+
+MultiLayerPerceptron::~MultiLayerPerceptron() {
+ delete[] _weights;
+ delete[] _activations;
+ delete[] _pre_sigma_activations;
+
+ delete[] _layer_sizes;
+ delete[] _frozen_layers;
+ delete[] _weights_index;
+ delete[] _activations_index;
+}
+
+void MultiLayerPerceptron::save(ostream &os) {
+ os << _nb_layers << "\n";
+ for(int n = 0; n < _nb_layers; n++) os << _layer_sizes[n] << (n < _nb_layers - 1 ? " " : "\n");
+ for(int l = 0; l < _nb_layers - 1; l++) {
+ os << l << "\n";
+    // As in the loading code, j goes up to _layer_sizes[l] included
+    // so that the bias coefficients are saved too
+    for(int j = 0; j < _layer_sizes[l] + 1; j++) {
+ for(int i = 0; i < _layer_sizes[l+1]; i++)
+ os << _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j] << (i < _layer_sizes[l+1] - 1 ? " " : "\n");
+ }
+ }
+}
+
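+// Dumps the weights of every first-hidden-layer unit to
+// /tmp/hidden_???.dat as "x y weight" triplets, one 28x28 map per
+// unit, with blank lines between rows as expected by gnuplot's
+// splot. The image geometry is hard-coded for MNIST.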
+void MultiLayerPerceptron::save_data() {
+ for(int i = 0; i < _layer_sizes[1]; i++) {
+ char buffer[256];
+    snprintf(buffer, sizeof(buffer), "/tmp/hidden_%03d.dat", i);
+ ofstream stream(buffer);
+ for(int j = 0; j < _layer_sizes[0]; j++) {
+ if(j%28 == 0) stream << "\n";
+ stream << j%28 << " " << j/28 << " " << _weights[i * (_layer_sizes[0] + 1) + j] << "\n";
+ }
+ }
+}
+
+void MultiLayerPerceptron::init_random_weights(scalar_t stdd) {
+ for(int w = 0; w < _nb_weights; w++) _weights[w] = normal_sample() * stdd;
+}
+
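+// Computes by backpropagation the gradient of the squared error on a
+// single sample with respect to every weight. dactivations[k] holds
+// the derivative of the error with respect to activation k; for each
+// unit of layer l+1, alpha is the derivative with respect to its
+// pre-sigma activation, from which both the weight gradients and the
+// dactivations of layer l follow by the chain rule. The second inner
+// loop handles the bias coefficients (j == _layer_sizes[l]), and the
+// backward pass stops early once every unfrozen layer has been
+// processed.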
+void MultiLayerPerceptron::compute_gradient_1s(ImageSet *is, int p, scalar_t *gradient_1s) {
+ scalar_t dactivations[_nb_activations];
+
+ compute_activations_1s(is, p);
+
+ int nb_unfrozen = 0;
+ for(int l = 0; l < _nb_layers - 1; l++) if(!_frozen_layers[l]) nb_unfrozen++;
+
+ for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
+ scalar_t correct;
+ if(is->label(p) == i) correct = output_amplitude;
+ else correct = - output_amplitude;
+ dactivations[_activations_index[_nb_layers - 1] + i] = 2 * (_activations[_activations_index[_nb_layers - 1] + i] - correct);
+ }
+
+ for(int l = _nb_layers - 2; (l >= 0) && (nb_unfrozen > 0); l--) {
+ int ai0 = _activations_index[l], ai1 = _activations_index[l+1],
+ wi0 = _weights_index[l], ls0p1 = _layer_sizes[l] + 1;
+
+ int j;
+ for(j = 0; j < _layer_sizes[l]; j++) {
+ scalar_t s = 0.0;
+ for(int i = 0; i < _layer_sizes[l+1]; i++) {
+ scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
+ s += alpha * _weights[wi0 + i * ls0p1 + j];
+ gradient_1s[wi0 + i * ls0p1 + j] = alpha * _activations[ai0 + j];
+ }
+ dactivations[ai0 + j] = s;
+ }
+
+ for(int i = 0; i < _layer_sizes[l+1]; i++) {
+ scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
+ gradient_1s[wi0 + i * ls0p1 + j] = alpha;
+ }
+ if(!_frozen_layers[l]) nb_unfrozen--;
+ }
+}
+
+void MultiLayerPerceptron::compute_gradient(ImageSet *is, scalar_t *gradient) {
+ scalar_t gradient_1s[_nb_weights];
+ for(int w = 0; w < _nb_weights; w++) gradient[w] = 0.0;
+ for(int p = 0; p < is->nb_pics(); p++) {
+ compute_gradient_1s(is, p, gradient_1s);
+ for(int w = 0; w < _nb_weights; w++) gradient[w] += gradient_1s[w];
+ }
+}
+
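+// Estimates the gradient by central finite differences,
+// (E(w + eps) - E(w - eps)) / (2 * eps). This costs two full error
+// evaluations per weight, so it is only meant as a debugging check
+// of compute_gradient, along the lines of (hypothetical, from inside
+// the class):
+//
+//   scalar_t ga[_nb_weights], gn[_nb_weights];
+//   compute_gradient(is, ga);
+//   compute_numerical_gradient(is, gn);
+//   for(int w = 0; w < _nb_weights; w++)
+//     cout << ga[w] << " " << gn[w] << "\n";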
+void MultiLayerPerceptron::compute_numerical_gradient(ImageSet *is, scalar_t *gradient) {
+ const scalar_t eps = 1e-3;
+ scalar_t error_plus, error_minus, ref;
+ for(int w = 0; w < _nb_weights; w++) {
+ ref = _weights[w];
+ _weights[w] = ref + eps;
+ error_plus = error(is);
+ _weights[w] = ref - eps;
+ error_minus = error(is);
+ _weights[w] = ref;
+ gradient[w] = (error_plus - error_minus) / (2 * eps);
+ }
+}
+
+void MultiLayerPerceptron::print_gradient(ostream &os, scalar_t *gradient) {
+ for(int w = 0; w < _nb_weights; w++) os << gradient[w] << "\n";
+}
+
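+// Sets the weights to origin + lambda * gradient, skipping the
+// frozen layers. Used both for plain gradient steps (with origin ==
+// _weights) and for the line search below.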
+void MultiLayerPerceptron::move_on_line(scalar_t *origin, scalar_t *gradient, scalar_t lambda) {
+ for(int l = 0; l < _nb_layers-1; l++) if(!_frozen_layers[l]) {
+ for(int i = 0; i < (_layer_sizes[l] + 1) * _layer_sizes[l+1]; i++)
+ _weights[_weights_index[l] + i] =
+ origin[_weights_index[l] + i] + lambda * gradient[_weights_index[l] + i];
+ }
+}
+
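+// One pass of stochastic gradient descent over the set: for every
+// sample, takes a step of fixed size dt against the per-sample
+// gradient (move_on_line with lambda = -dt). The X/+/- characters
+// are just a textual progress bar.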
+void MultiLayerPerceptron::one_step_basic_gradient(ImageSet *is, scalar_t dt) {
+ scalar_t gradient_1s[_nb_weights];
+ for(int p = 0; p < is->nb_pics(); p++) {
+ if(p%1000 == 0) {
+ int n = (p*50)/is->nb_pics();
+ int j;
+ for(j = 0; j < n; j++) cout << "X";
+ for(; j < 50; j++) cout << ((j%5 == 0) ? "+" : "-");
+ cout << "\r";
+ cout.flush();
+ }
+ compute_gradient_1s(is, p, gradient_1s);
+ move_on_line(_weights, gradient_1s, -dt);
+ }
+ cout << " \r"; cout.flush();
+}
+
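+// One step of conjugate gradient descent on the total error: first a
+// line search along the current direction xi (bracket the minimum by
+// doubling lambda, then refine it with a step dl whose sign flips
+// and whose magnitude shrinks by 4 at each stage), then a
+// Polak-Ribiere update of the direction. Following the usual
+// conjugate-gradient bookkeeping, on entry g holds the negated
+// previous gradient and h the previous direction.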
+void MultiLayerPerceptron::one_step_global_gradient(ImageSet *is, scalar_t *xi, scalar_t *g, scalar_t *h) {
+ scalar_t origin[_nb_weights];
+ for(int w = 0; w < _nb_weights; w++) origin[w] = _weights[w];
+
+ scalar_t l = 1e-8;
+ scalar_t e, pe;
+
+ e = error(is);
+
+ do {
+ pe = e;
+ l *= 2;
+ move_on_line(origin, xi, l);
+ e = error(is);
+ } while(e < pe);
+
+ scalar_t dl = - l / 4;
+
+  while(fabs(dl) > 1e-6) {
+ move_on_line(origin, xi, l);
+ e = error(is);
+ do {
+ pe = e;
+ l += dl;
+ move_on_line(origin, xi, l);
+ e = error(is);
+ } while(e < pe);
+ dl = - dl / 4;
+ }
+
+ compute_gradient(is, xi);
+
+ scalar_t gg = 0, gxi = 0, xixi = 0;
+
+ // Polak-Ribiere
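+  //
+  // Since g is the negated previous gradient and xi the new one,
+  //   gamma = (xi . xi + g . xi) / (g . g)
+  //         = (xi . (xi - prev)) / (prev . prev),
+  // which is the Polak-Ribiere formula.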
+
+ for(int w = 0; w < _nb_weights; w++) {
+ gg += sq(g[w]);
+ gxi += g[w] * xi[w];
+ xixi += sq(xi[w]);
+ }
+
+ scalar_t gamma = (xixi + gxi)/gg;
+
+  // Set gamma to 0 for standard gradient descent
+ // gamma = 0.0;
+
+ for(int w = 0; w < _nb_weights; w++) {
+ g[w] = -xi[w];
+ h[w] = g[w] + gamma * h[w];
+ xi[w] = h[w];
+ }
+}
+
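+// Trains with repeated passes of stochastic gradient descent and
+// early stopping: training ends once the validation error has
+// failed to improve ten times.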
+void MultiLayerPerceptron::train(ImageSet *training_set, ImageSet *validation_set) {
+ scalar_t prev_validation_error = 1.0, validation_error = 1.0, training_error;
+ int l = 0, nb_increases = 0;
+ do {
+ prev_validation_error = validation_error;
+    // Alternative: conjugate gradient steps, which require the
+    // direction buffers xi, g and h to be allocated and initialized
+    // one_step_global_gradient(training_set, xi, g, h);
+ one_step_basic_gradient(training_set, 1e-2);
+ training_error = error(training_set);
+ validation_error = error(validation_set);
+ cout << l
+ << " TRAINING " << training_error << " (" << classification_error(training_set)*100 << "%)"
+ << " VALIDATION " << validation_error << " (" << classification_error(validation_set)*100 << "%)";
+ if(l > 0 && validation_error >= prev_validation_error) {
+ nb_increases++;
+ cout << " [" << nb_increases << "]";
+ }
+ cout << "\n";
+ cout.flush();
+ l++;
+ } while(nb_increases < 10);
+}
+
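+// Forward pass on a single sample: the pixels are mapped to
+// [-1.0, 1.0] as the activations of layer 0, then each following
+// layer applies its affine map (weights, then bias, matching the
+// memory layout described above) followed by sigma. The pre-sigma
+// values are kept for the backward pass.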
+void MultiLayerPerceptron::compute_activations_1s(ImageSet *is, int p) {
+ ASSERT(_layer_sizes[0] == is->width() * is->height(), "The first layer has to have as many units as there are pixels!");
+
+ scalar_t *a = _activations;
+ scalar_t *w = _weights;
+
+ for(int y = 0; y < is->height(); y++) for(int x = 0; x < is->width(); x++)
+ *(a++) = 2.0 * (scalar_t(is->pixel(p, x, y)) / 255.0) - 1.0;
+
+ scalar_t *pa = _pre_sigma_activations + _activations_index[1];
+ scalar_t *b = _activations, *b2 = 0;
+
+ for(int l = 0; l < _nb_layers - 1; l++) {
+ for(int i = 0; i < _layer_sizes[l+1]; i++) {
+ scalar_t s = 0;
+ b2 = b;
+ for(int j = 0; j < _layer_sizes[l]; j++) s += *(w++) * *(b2++);
+ s += *(w++);
+ *(pa++) = s;
+ *(a++) = sigma(s);
+ }
+ b = b2;
+ }
+}
+
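+// Fills responses with the output-layer activations of every sample,
+// sample p occupying the p-th row of a row-major
+// nb_pics() x _layer_sizes[_nb_layers - 1] array.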
+void MultiLayerPerceptron::test(ImageSet *is, scalar_t *responses) {
+ for(int p = 0; p < is->nb_pics(); p++) {
+ compute_activations_1s(is, p);
+ for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++)
+ responses[p * _layer_sizes[_nb_layers - 1] + i] = _activations[_activations_index[_nb_layers - 1] + i];
+ }
+}
+
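+// Total squared error over the set, with respect to the same
+// +/- output_amplitude targets that compute_gradient_1s
+// differentiates.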
+scalar_t MultiLayerPerceptron::error(ImageSet *is) {
+ scalar_t error = 0;
+
+ for(int p = 0; p < is->nb_pics(); p++) {
+ compute_activations_1s(is, p);
+ for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
+ scalar_t correct;
+ if(is->label(p) == i) correct = output_amplitude;
+ else correct = - output_amplitude;
+ error += sq(_activations[_activations_index[_nb_layers - 1] + i] - correct);
+ }
+ }
+
+ return error;
+}
+
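+// Fraction of samples whose label differs from the index of the
+// strongest output unit.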
+scalar_t MultiLayerPerceptron::classification_error(ImageSet *is) {
+ int nb_error = 0;
+
+ for(int p = 0; p < is->nb_pics(); p++) {
+ compute_activations_1s(is, p);
+ scalar_t max = -1;
+ int i_max = -1;
+ for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
+ if(i_max < 0 || _activations[_activations_index[_nb_layers - 1] + i] > max) {
+ i_max = i;
+ max = _activations[_activations_index[_nb_layers - 1] + i];
+ }
+ }
+ if(is->label(p) != i_max) nb_error++;
+ }
+
+ return scalar_t(nb_error)/scalar_t(is->nb_pics());
+}