neural.cc

   1 /*
   2  *  mlp-mnist is an implementation of a multi-layer neural network.
   3  *
   4  *  Copyright (c) 2008 Idiap Research Institute, http://www.idiap.ch/
   5  *  Written by Francois Fleuret <francois.fleuret@idiap.ch>
   6  *
   7  *  This file is part of mlp-mnist.
   8  *
   9  *  mlp-mnist is free software: you can redistribute it and/or modify
  10  *  it under the terms of the GNU General Public License version 3 as
  11  *  published by the Free Software Foundation.
  12  *
  13  *  mlp-mnist is distributed in the hope that it will be useful, but
  14  *  WITHOUT ANY WARRANTY; without even the implied warranty of
  15  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16  *  General Public License for more details.
  17  *
  18  *  You should have received a copy of the GNU General Public License
  19  *  along with mlp-mnist.  If not, see <http://www.gnu.org/licenses/>.
  20  *
  21  */
  22
  23 #include <string.h>
  24
  25 #include "neural.h"
  26
  27 const scalar_t MultiLayerPerceptron::output_amplitude = 0.95;
  28
  29 // I won't comment the natural elegance of C++ in that kind of
  30 // situation. IT SUCKS.
  31
  32 MultiLayerPerceptron::MultiLayerPerceptron(const MultiLayerPerceptron &mlp) {
  33   _nb_layers = mlp._nb_layers;
  34   _layer_sizes = new int[_nb_layers];
  35   _frozen_layers = new bool[_nb_layers-1];
  36   memcpy((void *) _frozen_layers, (void *) mlp._frozen_layers, _nb_layers * sizeof(bool));
  37   _activations_index = new int[_nb_layers];
  38   _weights_index = new int[_nb_layers];
  39
  40   _nb_activations = 0;
  41   _nb_weights = 0;
  42
  43   for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = mlp._layer_sizes[n];
  44
  45   for(int n = 0; n < _nb_layers; n++) {
  46     _activations_index[n] = _nb_activations;
  47     _nb_activations += _layer_sizes[n];
  48     if(n < _nb_layers - 1) {
  49       _frozen_layers[n] = false;
  50       _weights_index[n] = _nb_weights;
  51       _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
  52     }
  53   }
  54
  55   _activations = new scalar_t[_nb_activations];
  56   _pre_sigma_activations = new scalar_t[_nb_activations];
  57
  58   _weights = new scalar_t[_nb_weights];
  59   memcpy((void *) _weights, (void *) mlp._weights, _nb_weights * sizeof(scalar_t));
  60 }
  61
  62 MultiLayerPerceptron::MultiLayerPerceptron(int nb_layers, int *layer_sizes) {
  63   _nb_layers = nb_layers;
  64   _layer_sizes = new int[_nb_layers];
  65   _frozen_layers = new bool[_nb_layers-1];
  66   _activations_index = new int[_nb_layers];
  67   _weights_index = new int[_nb_layers];
  68
  69   _nb_activations = 0;
  70   _nb_weights = 0;
  71
  72   for(int n = 0; n < _nb_layers; n++) _layer_sizes[n] = layer_sizes[n];
  73
  74   for(int n = 0; n < _nb_layers; n++) {
  75     _activations_index[n] = _nb_activations;
  76     _nb_activations += _layer_sizes[n];
  77     if(n < _nb_layers - 1) {
  78       _frozen_layers[n] = false;
  79       _weights_index[n] = _nb_weights;
  80       _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
  81     }
  82   }
  83
  84   _activations = new scalar_t[_nb_activations];
  85   _pre_sigma_activations = new scalar_t[_nb_activations];
  86
  87   _weights = new scalar_t[_nb_weights];
  88 }
  89
  90 MultiLayerPerceptron::MultiLayerPerceptron(istream &is) {
  91   is >> _nb_layers;
  92
  93   _layer_sizes = new int[_nb_layers];
  94   _frozen_layers = new bool[_nb_layers - 1];
  95   _activations_index = new int[_nb_layers];
  96   _weights_index = new int[_nb_layers];
  97
  98   _nb_activations = 0;
  99   _nb_weights = 0;
 100
 101   for(int n = 0; n < _nb_layers; n++) is >> _layer_sizes[n];
 102
 103   for(int n = 0; n < _nb_layers; n++) {
 104     _activations_index[n] = _nb_activations;
 105     _nb_activations += _layer_sizes[n];
 106     if(n < _nb_layers - 1) {
 107       _frozen_layers[n] = false;
 108       _weights_index[n] = _nb_weights;
 109       _nb_weights += (_layer_sizes[n] + 1) * _layer_sizes[n+1];
 110     }
 111   }
 112
 113   _activations = new scalar_t[_nb_activations];
 114   _pre_sigma_activations = new scalar_t[_nb_activations];
 115
 116   _weights = new scalar_t[_nb_weights];
 117
 118   for(int l = 0; l < _nb_layers - 1; l++) {
 119     int ll;
 120     is >> ll;
 121     if(l != ll) {
 122       cerr << "Inconsistent layer number during loading!\n";
 123       cerr.flush();
 124       exit(1);
 125     }
 126     for(int j = 0; j < _layer_sizes[l]; j++)
 127       for(int i = 0; i < _layer_sizes[l+1]; i++)
 128         is >> _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j];
 129   }
 130 }
 131
 132 MultiLayerPerceptron::~MultiLayerPerceptron() {
 133   delete[] _weights;
 134   delete[] _activations;
 135   delete[] _pre_sigma_activations;
 136
 137   delete[] _layer_sizes;
 138   delete[] _frozen_layers;
 139   delete[] _weights_index;
 140   delete[] _activations_index;
 141 }
 142
 143 void MultiLayerPerceptron::save(ostream &os) {
 144   os << _nb_layers << "\n";
 145   for(int n = 0; n < _nb_layers; n++) os << _layer_sizes[n] << (n < _nb_layers - 1 ? " " : "\n");
 146   for(int l = 0; l < _nb_layers - 1; l++) {
 147     os << l << "\n";
 148     for(int j = 0; j < _layer_sizes[l]; j++) {
 149       for(int i = 0; i < _layer_sizes[l+1]; i++)
 150         os << _weights[_weights_index[l] + i * (_layer_sizes[l] + 1) + j] << (i < _layer_sizes[l+1] - 1 ? " " : "\n");
 151     }
 152   }
 153 }
 154
 155 void MultiLayerPerceptron::save_data() {
 156   for(int i = 0; i < _layer_sizes[1]; i++) {
 157     char buffer[256];
 158     sprintf(buffer, "/tmp/hidden_%03d.dat", i);
 159     ofstream stream(buffer);
 160     for(int j = 0; j < _layer_sizes[0]; j++) {
 161       if(j%28 == 0) stream << "\n";
 162       stream << j%28 << " " << j/28 << " " << _weights[i * (_layer_sizes[0] + 1) + j] << "\n";
 163     }
 164   }
 165 }
 166
 167 void MultiLayerPerceptron::init_random_weights(scalar_t stdd) {
 168   for(int w = 0; w < _nb_weights; w++) _weights[w] = normal_sample() * stdd;
 169 }
 170
 171 void MultiLayerPerceptron::compute_gradient_1s(ImageSet *is, int p, scalar_t *gradient_1s) {
 172   scalar_t dactivations[_nb_activations];
 173
 174   compute_activations_1s(is, p);
 175
 176   int nb_unfrozen = 0;
 177   for(int l = 0; l < _nb_layers - 1; l++) if(!_frozen_layers[l]) nb_unfrozen++;
 178
 179   for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
 180     scalar_t correct;
 181     if(is->label(p) == i) correct =   output_amplitude;
 182     else                  correct = - output_amplitude;
 183     dactivations[_activations_index[_nb_layers - 1] + i] = 2 * (_activations[_activations_index[_nb_layers - 1] + i] - correct);
 184   }
 185
 186   for(int l = _nb_layers - 2; (l >= 0) && (nb_unfrozen > 0); l--) {
 187     int ai0 = _activations_index[l], ai1 = _activations_index[l+1],
 188       wi0 = _weights_index[l], ls0p1 = _layer_sizes[l] + 1;
 189
 190     int j;
 191     for(j = 0; j < _layer_sizes[l]; j++) {
 192       scalar_t s = 0.0;
 193       for(int i = 0; i < _layer_sizes[l+1]; i++) {
 194         scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
 195         s += alpha * _weights[wi0 + i * ls0p1 + j];
 196         gradient_1s[wi0 + i * ls0p1 + j] = alpha * _activations[ai0 + j];
 197       }
 198       dactivations[ai0 + j] = s;
 199     }
 200
 201     for(int i = 0; i < _layer_sizes[l+1]; i++) {
 202       scalar_t alpha = dactivations[ai1 + i] * dsigma(_pre_sigma_activations[ai1 + i]);
 203       gradient_1s[wi0 + i * ls0p1 + j] = alpha;
 204     }
 205     if(!_frozen_layers[l]) nb_unfrozen--;
 206   }
 207 }
 208
 209 void MultiLayerPerceptron::compute_gradient(ImageSet *is, scalar_t *gradient) {
 210   scalar_t gradient_1s[_nb_weights];
 211   for(int w = 0; w < _nb_weights; w++) gradient[w] = 0.0;
 212   for(int p = 0; p < is->nb_pics(); p++) {
 213     compute_gradient_1s(is, p, gradient_1s);
 214     for(int w = 0; w < _nb_weights; w++) gradient[w] += gradient_1s[w];
 215   }
 216 }
 217
 218 void MultiLayerPerceptron::compute_numerical_gradient(ImageSet *is, scalar_t *gradient) {
 219   const scalar_t eps = 1e-3;
 220   scalar_t error_plus, error_minus, ref;
 221   for(int w = 0; w < _nb_weights; w++) {
 222     ref = _weights[w];
 223     _weights[w] = ref + eps;
 224     error_plus = error(is);
 225     _weights[w] = ref - eps;
 226     error_minus = error(is);
 227     _weights[w] = ref;
 228     gradient[w] = (error_plus - error_minus) / (2 * eps);
 229   }
 230 }
 231
 232 void MultiLayerPerceptron::print_gradient(ostream &os, scalar_t *gradient) {
 233   for(int w = 0; w < _nb_weights; w++) os << gradient[w] << "\n";
 234 }
 235
 236 void MultiLayerPerceptron::move_on_line(scalar_t *origin, scalar_t *gradient, scalar_t lambda) {
 237   for(int l = 0; l < _nb_layers-1; l++) if(!_frozen_layers[l]) {
 238     for(int i = 0; i < (_layer_sizes[l] + 1) * _layer_sizes[l+1]; i++)
 239       _weights[_weights_index[l] + i] =
 240         origin[_weights_index[l] + i] + lambda * gradient[_weights_index[l] + i];
 241   }
 242 }
 243
 244 void MultiLayerPerceptron::one_step_basic_gradient(ImageSet *is, scalar_t dt) {
 245   scalar_t gradient_1s[_nb_weights];
 246   for(int p = 0; p < is->nb_pics(); p++) {
 247     if(p%1000 == 0) {
 248       int n = (p*50)/is->nb_pics();
 249       int j;
 250       for(j = 0; j < n; j++) cout << "X";
 251       for(; j < 50; j++) cout << ((j%5 == 0) ? "+" : "-");
 252       cout << "\r";
 253       cout.flush();
 254     }
 255     compute_gradient_1s(is, p, gradient_1s);
 256     move_on_line(_weights, gradient_1s, -dt);
 257   }
 258   cout << "                                                            \r"; cout.flush();
 259 }
 260
 261 void MultiLayerPerceptron::one_step_global_gradient(ImageSet *is, scalar_t *xi, scalar_t *g, scalar_t *h) {
 262   scalar_t origin[_nb_weights];
 263   for(int w = 0; w < _nb_weights; w++) origin[w] = _weights[w];
 264
 265   scalar_t l = 1e-8;
 266   scalar_t e, pe;
 267
 268   e = error(is);
 269
 270   do {
 271     pe = e;
 272     l *= 2;
 273     move_on_line(origin, xi, l);
 274     e = error(is);
 275   } while(e < pe);
 276
 277   scalar_t dl = - l / 4;
 278
 279   while(abs(dl) > 1e-6) {
 280     move_on_line(origin, xi, l);
 281     e = error(is);
 282     do {
 283       pe = e;
 284       l  += dl;
 285       move_on_line(origin, xi, l);
 286       e = error(is);
 287     } while(e < pe);
 288     dl = - dl / 4;
 289   }
 290
 291   compute_gradient(is, xi);
 292
 293   scalar_t gg = 0, gxi = 0, xixi = 0;
 294
 295   // Polak-Ribiere
 296
 297   for(int w = 0; w < _nb_weights; w++) {
 298     gg += sq(g[w]);
 299     gxi += g[w] * xi[w];
 300     xixi += sq(xi[w]);
 301   }
 302
 303   scalar_t gamma = (xixi + gxi)/gg;
 304
 305   // Set gamma to 0 for standard gradient descente
 306   //   gamma = 0.0;
 307
 308   for(int w = 0; w < _nb_weights; w++) {
 309     g[w] = -xi[w];
 310     h[w] = g[w] + gamma * h[w];
 311     xi[w] = h[w];
 312   }
 313 }
 314
 315 void MultiLayerPerceptron::train(ImageSet *training_set, ImageSet *validation_set) {
 316   scalar_t prev_validation_error = 1.0, validation_error = 1.0, training_error;
 317   int l = 0, nb_increases = 0;
 318   do {
 319 // #warning horrible debugging
 320 //     {
 321 //       char buffer[1024];
 322 //       sprintf(buffer, "tmp_%04d.mlp", l);
 323 //       ofstream stream(buffer);
 324 //       save(stream);
 325 //       stream.flush();
 326 //     }
 327     prev_validation_error = validation_error;
 328     //     one_step_global_gradient(training_set, xi, g, h);
 329     one_step_basic_gradient(training_set, 1e-2);
 330     training_error = error(training_set);
 331     validation_error = error(validation_set);
 332     cout << l
 333          << " TRAINING " << training_error << " (" << classification_error(training_set)*100 << "%)"
 334          << " VALIDATION " << validation_error << " (" << classification_error(validation_set)*100 << "%)";
 335     if(l > 0 && validation_error >= prev_validation_error) {
 336       nb_increases++;
 337       cout << " [" << nb_increases << "]";
 338     }
 339     cout << "\n";
 340     cout.flush();
 341     l++;
 342   } while(nb_increases < 10);
 343
 344 }
 345
 346 void MultiLayerPerceptron::compute_activations_1s(ImageSet *is, int p) {
 347   ASSERT(_layer_sizes[0] == is->width() * is->height(), "The first layer has to have as many units as there are pixels!");
 348
 349   scalar_t *a = _activations;
 350   scalar_t *w = _weights;
 351
 352   for(int y = 0; y < is->height(); y++) for(int x = 0; x < is->width(); x++)
 353     *(a++) = 2.0 * (scalar_t(is->pixel(p, x, y)) / 255.0) - 1.0;
 354
 355   scalar_t *pa = _pre_sigma_activations + _activations_index[1];
 356   scalar_t *b = _activations, *b2 = 0;
 357
 358   for(int l = 0; l < _nb_layers - 1; l++) {
 359     for(int i = 0; i < _layer_sizes[l+1]; i++) {
 360       scalar_t s = 0;
 361       b2 = b;
 362       for(int j = 0; j < _layer_sizes[l]; j++) s += *(w++) * *(b2++);
 363       s += *(w++);
 364       *(pa++) = s;
 365       *(a++) = sigma(s);
 366     }
 367     b = b2;
 368   }
 369 }
 370
 371 void MultiLayerPerceptron::test(ImageSet *is, scalar_t *responses) {
 372   for(int p = 0; p < is->nb_pics(); p++) {
 373     compute_activations_1s(is, p);
 374     for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++)
 375       responses[p * _layer_sizes[_nb_layers - 1] + i] = _activations[_activations_index[_nb_layers - 1] + i];
 376   }
 377 }
 378
 379 scalar_t MultiLayerPerceptron::error(ImageSet *is) {
 380   scalar_t error = 0;
 381
 382   for(int p = 0; p < is->nb_pics(); p++) {
 383     compute_activations_1s(is, p);
 384     for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
 385       scalar_t correct;
 386       if(is->label(p) == i) correct =   output_amplitude;
 387       else                  correct = - output_amplitude;
 388       error += sq(_activations[_activations_index[_nb_layers - 1] + i] - correct);
 389     }
 390   }
 391
 392   return error;
 393 }
 394
 395 scalar_t MultiLayerPerceptron::classification_error(ImageSet *is) {
 396   int nb_error = 0;
 397
 398   for(int p = 0; p < is->nb_pics(); p++) {
 399     compute_activations_1s(is, p);
 400     scalar_t max = -1;
 401     int i_max = -1;
 402     for(int i = 0; i < _layer_sizes[_nb_layers - 1]; i++) {
 403       if(i_max < 0 || _activations[_activations_index[_nb_layers - 1] + i] > max) {
 404         i_max = i;
 405         max = _activations[_activations_index[_nb_layers - 1] + i];
 406       }
 407     }
 408     if(is->label(p) != i_max) nb_error++;
 409   }
 410
 411   return scalar_t(nb_error)/scalar_t(is->nb_pics());
 412 }