2 //////////////////////////////////////////////////////////////////////////////////
3 // This program is free software: you can redistribute it and/or modify //
4 // it under the terms of the version 3 of the GNU General Public License //
5 // as published by the Free Software Foundation. //
7 // This program is distributed in the hope that it will be useful, but //
8 // WITHOUT ANY WARRANTY; without even the implied warranty of //
9 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU //
10 // General Public License for more details. //
12 // You should have received a copy of the GNU General Public License //
13 // along with this program. If not, see <http://www.gnu.org/licenses/>. //
15 // Written by Francois Fleuret //
16 // Copyright (C) Ecole Polytechnique Federale de Lausanne //
17 // Contact <francois.fleuret@epfl.ch> for comments & bug reports //
18 //////////////////////////////////////////////////////////////////////////////////
20 // $Id: classifier.cc,v 1.3 2007-08-23 08:36:50 fleuret Exp $
25 #include "classifier.h"
// Copy constructor — body elided in this extract. NOTE(review): given the
// nrefs checks in operator= and ~DataSet, it presumably shares ds's raw
// storage and increments the reference count — confirm against full source.
27 DataSet::DataSet(const DataSet &ds) {
// Builds an uninitialized data set of ns samples by nf features; bit data
// lives in the shared `raw` block (its allocation bookkeeping is elided here).
31 DataSet::DataSet(int ns, int nf) {
// 32-bit words needed to hold one bit per sample, rounded up.
35 size = (nb_samples + 31)/32;
40 raw->x = new uint32_t[nb_features * size];
41 raw->y = new uint32_t[size];
// x_va[j] points at the start of feature j's packed bit-plane inside raw->x.
44 x_va = new uint32_t *[nb_features];
45 for(int j = 0; j < nb_features; j++) x_va[j] = raw->x + size*j;
// Loads a DataSet from a text stream: a "nb_samples nb_features" header,
// then one row of 0/1 feature values plus a 0/1 label per sample.
// NOTE(review): the reads into `v` and the raw-block setup are elided here.
48 DataSet::DataSet(istream &is) {
49 is >> nb_samples >> nb_features;
// 32-bit words needed to hold one bit per sample, rounded up.
50 size = (nb_samples + 31)/32;
55 raw->x = new uint32_t[nb_features * size];
56 raw->y = new uint32_t[size];
// x_va[j] points at feature j's packed bit-plane inside raw->x.
59 x_va = new uint32_t *[nb_features];
60 for(int j = 0; j < nb_features; j++) x_va[j] = raw->x + size*j;
64 for(int s = 0; s < nb_samples; s++) {
65 // cerr << "s=" << s << "\n"; cerr.flush();
66 for(int f = 0; f < nb_features; f++) {
// Any strictly positive parsed value counts as a set bit.
68 fe_set_bit(s, x_va[f], v > 0);
72 fe_set_bit(s, y_va, v > 0);
// Reached when the stream runs out before all samples are read.
75 cerr << "Error: missing data!\n";
// View of ds restricted to the features chosen by fs. NOTE(review): the
// elided lines presumably share ds.raw and bump its reference count —
// x_va below indexes into raw->x, so raw must alias ds's storage; confirm.
81 DataSet::DataSet(const DataSet &ds, const FeatureSelector &fs) {
82 nb_samples = ds.nb_samples;
83 nb_features = fs.nb_selected_features;
84 size = (nb_samples + 31)/32;
// Feature j of this view is selected feature fs.selected_index[j] of ds.
91 x_va = new uint32_t *[nb_features];
92 for(int j = 0; j < nb_features; j++) x_va[j] = raw->x + size*fs.selected_index[j];
// Deep copy of ds keeping only the samples s with selected_samples[s] set.
95 DataSet::DataSet(const DataSet &ds, bool *selected_samples) {
// Count the kept samples (an elided line presumably zeroes nb_samples first).
97 for(int i = 0; i < ds.nb_samples; i++) if(selected_samples[i]) nb_samples++;
98 nb_features = ds.nb_features;
99 size = (nb_samples + 31)/32;
103 raw->x = new uint32_t[nb_features * size];
104 raw->y = new uint32_t[size];
// Fresh pointer table over this object's own (compacted) bit-planes.
107 x_va = new uint32_t *[nb_features];
108 for(int j = 0; j < nb_features; j++) x_va[j] = raw->x + size*j;
// Repack the kept samples densely; k is the destination index
// (declared and incremented on elided lines — confirm).
111 for(int s = 0; s < ds.nb_samples; s++) if(selected_samples[s]) {
112 fe_set_bit(k, y_va, fe_get_bit(s, ds.y_va));
113 for(int j = 0; j < nb_features; j++) fe_set_bit(k, x_va[j], fe_get_bit(s, ds.x_va[j]));
// Reference-counted assignment. NOTE(review): interior lines elided —
// presumably drops this object's share of `raw` (freeing when nrefs hits
// zero, as checked below) before taking on ds's data — confirm.
118 DataSet &DataSet::operator = (const DataSet &ds) {
122 if(raw->nrefs == 0) {
// Destructor: drops one reference to the shared raw block. NOTE(review):
// the branch below presumably delete[]s raw->x / raw->y (and x_va) when
// the count reaches zero — interior lines elided, confirm.
132 DataSet::~DataSet() {
135 if(raw->nrefs == 0) {
// Shallow copy of ds's metadata and per-feature pointers: the new x_va
// table is freshly allocated, but each entry aliases ds's bit storage
// (no bit data is duplicated here).
142 void DataSet::copy(const DataSet &ds) {
143 nb_samples = ds.nb_samples;
144 nb_features = ds.nb_features;
145 size = (nb_samples + 31)/32;
149 x_va = new uint32_t *[nb_features];
150 for(int j = 0; j < nb_features; j++) x_va[j] = ds.x_va[j];
153 void DataSet::save_ascii(ostream &os) {
154 os << nb_samples << " " << nb_features << "\n";
155 for(int s = 0; s < nb_samples; s++) {
156 for(int f = 0; f < nb_features; f++) {
157 if(fe_get_bit(s, x_va[f])) os << "1"; else os << "0";
158 if(f < nb_features-1) os << " "; else os << "\n";
160 if(fe_get_bit(s, y_va)) os << "1\n"; else os << "0\n";
// Factory: reads a type id from the stream, builds the matching Classifier
// subclass, and lets it deserialize its own parameters via inner_load().
164 Classifier *Classifier::load(istream &is) {
171 case Classifier::ID_LINEAR:
172 result = new LinearClassifier();
// NOTE(review): the unknown-id branch only prints here; the elided lines
// should be checked for whether it aborts or falls through with a bad
// pointer before the inner_load() call below.
177 cerr << "Unknown classifier type!\n";
181 result->inner_load(is);
// Out-of-line empty definition of the base-class destructor.
186 Classifier::~Classifier() {}
188 FeatureSelector::FeatureSelector(int nb) : nb_selected_features(nb),
189 selected_index(new int[nb_selected_features]),
190 weights(new float[nb_selected_features]) { }
// Frees the selector's arrays. NOTE(review): `weights` is also
// new[]-allocated by the constructors; confirm an elided line here
// delete[]s it too, otherwise it leaks.
192 FeatureSelector::~FeatureSelector() {
194 delete[] selected_index;
// Builds a classifier over nf features with freshly allocated (and
// uninitialized) weights; the rest of the initializer list is elided here.
197 LinearClassifier::LinearClassifier(int nf) : nb_features(nf),
198 weights(new float[nb_features]),
// Default constructor: empty model (weights are allocated later by
// inner_load); the rest of the initializer list is elided in this extract.
201 LinearClassifier::LinearClassifier() : nb_features(0),
// NOTE(review): body elided — should delete[] the new[]-allocated weights;
// confirm against the full source.
205 LinearClassifier::~LinearClassifier() {
209 void LinearClassifier::compute_bayesian_weights(int nb_samples, uint32_t *y_va, uint32_t **x_va) {
210 for(int nf = 0; nf < nb_features; nf++) {
211 int n11 = fe_count_and(nb_samples, y_va, x_va[nf]);
212 int n10 = fe_count_and_not(nb_samples, y_va, x_va[nf]);
213 int n01 = fe_count_and_not(nb_samples, x_va[nf], y_va);
214 int n00 = fe_count_and_not_not(nb_samples, y_va, x_va[nf]);
215 if(n00 == 0) n00 = 1; // This is sort of a dirty way to emulate +/- infty
216 if(n01 == 0) n01 = 1;
217 if(n10 == 0) n10 = 1;
218 if(n11 == 0) n11 = 1;
219 weights[nf] = log(float(n11 * n00) / float(n10 * n01));
// Chooses `bias` by sweeping a threshold across the sorted per-sample
// scores and keeping the cut with the lowest empirical error (balanced
// across the two classes when balanced_error is set). nAB counters:
// A = true label bit, B = current predicted bit.
223 void LinearClassifier::compute_bias(int nb_samples, uint32_t *y_va, uint32_t **x_va, bool balanced_error) {
// NOTE(review): C99-style VLA — a GCC extension in C++, and large
// nb_samples lands on the stack; fine for this codebase, not portable.
224 Couple tmp[nb_samples];
226 int n00 = 0, n01 = 0, n10 = 0, n11 = 0;
227 for(int s = 0; s < nb_samples; s++) {
// Sample score = sum of the weights of its set features (the elided
// lines presumably init r and record (s, r) into tmp[s] — confirm).
229 for(int nf = 0; nf < nb_features; nf++) if(fe_get_bit(s, x_va[nf])) r += weights[nf];
// Start with every sample classified positive: only n11/n01 populated.
232 if(fe_get_bit(s, y_va)) n11++; else n01++;
235 qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
// best_error starts above any reachable value (both error formulas <= 1).
237 float error, best_error = 2.0;
239 for(int t = 0; t < nb_samples-1; t++) {
// Moving the threshold past tmp[t] flips that one sample to "negative".
240 if(fe_get_bit(tmp[t].index, y_va)) { n10++; n11--; } else { n01--; n00++; }
// Balanced: average of the two class-conditional error rates.
243 error = (float(n01) / float(n00 + n01) + float(n10) / float(n10 + n11)) / 2.0;
245 error = float(n01+n10) / float(n00 + n01 + n10 + n11);
247 if(error < best_error) {
// Place the threshold halfway between the two adjacent sorted scores.
249 bias = - (tmp[t].value + tmp[t+1].value)/2.0;
254 void LinearClassifier::learn_bayesian(const DataSet &ds, bool balanced_error) {
255 compute_bayesian_weights(ds.nb_samples, ds.y_va, ds.x_va);
256 compute_bias(ds.nb_samples, ds.y_va, ds.x_va, balanced_error);
// Trains the weights with up to n_loop_max full perceptron passes over ds,
// then re-fits the bias on the training responses.
259 void LinearClassifier::learn_perceptron(const DataSet &ds, bool balanced_error) {
260 for(int i = 0; i < nb_features; i++) weights[i] = 0.0;
262 int n_loop_max = 5000;
// One sample per iteration, cycling through the set in order.
264 for(int i = 0; i < n_loop_max * ds.nb_samples; i++) {
265 int ns = i % ds.nb_samples;
// Signed response: +w for a set feature bit, -w for a clear one
// (an elided line presumably initializes r, and another declares correct).
267 for(int f = 0; f < nb_features; f++)
268 if(fe_get_bit(ns, ds.x_va[f])) r += weights[f]; else r -= weights[f];
270 if(fe_get_bit(ns, ds.y_va)) correct = 1.0; else correct = -1.0;
// On a misclassification, nudge every weight toward the correct sign.
271 if((r < 0 && correct >= 0) || (r >= 0 && correct < 0)) {
272 for(int f = 0; f < nb_features; f++)
273 if(fe_get_bit(ns, ds.x_va[f])) weights[f] += correct; else weights[f] += -correct;
277 compute_bias(ds.nb_samples, ds.y_va, ds.x_va, balanced_error);
280 void FeatureSelector::cmim(const DataSet &ds) {
281 fe_selection_cmim(ds.nb_samples, ds.nb_features, ds.x_va, ds.y_va, nb_selected_features, selected_index);
284 void FeatureSelector::mim(const DataSet &ds) {
285 fe_selection_mim(ds.nb_samples, ds.nb_features, ds.x_va, ds.y_va, nb_selected_features, selected_index);
// Picks nb_selected_features feature indices uniformly at random without
// replacement, via rejection sampling against the `used` mask.
288 void FeatureSelector::random(const DataSet &ds) {
// NOTE(review): VLA (GCC extension). Also `f`'s declaration and the
// `used[f] = true;` update that makes the picks distinct are on elided
// lines — confirm they exist in the full source.
289 bool used[ds.nb_features];
290 for(int i = 0; i < ds.nb_features; i++) used[i] = false;
292 for(int nf = 0; nf < nb_selected_features; nf++) {
293 do { f = int(drand48() * ds.nb_features); } while(used[f]);
295 selected_index[nf] = f;
299 FeatureSelector::FeatureSelector(istream &is) {
300 is >> nb_selected_features;
301 weights = new float[nb_selected_features];
302 selected_index = new int[nb_selected_features];
303 for(int i = 0; i < nb_selected_features; i++) is >> selected_index[i];
306 void FeatureSelector::save(ostream &os) {
307 os << nb_selected_features << "\n";
308 for(int i = 0; i < nb_selected_features; i++) os << selected_index[i] << ((i < nb_selected_features-1) ? " " : "\n");
// Reads the model back in the format written by save(): the feature count
// (read on an elided line) then the weights — and presumably the bias on
// another elided line; NOTE(review): confirm symmetry with save().
311 void LinearClassifier::inner_load(istream &is) {
// NOTE(review): the previous weights array is not visibly delete[]d
// before this reallocation — check the elided lines for a leak.
314 weights = new float[nb_features];
315 for(int i = 0; i < nb_features; i++) is >> weights[i];
// Serializes the model: type id, feature count, then the space-separated
// weights on one line (the bias presumably follows on an elided line —
// NOTE(review): confirm against inner_load()).
319 void LinearClassifier::save(ostream &os) {
320 os << ID_LINEAR << "\n";
321 os << nb_features << "\n";
322 for(int i = 0; i < nb_features; i++) os << weights[i] << ((i < nb_features-1) ? " " : "\n");
// Fills result[s] with the linear response of sample s: the sum of the
// weights of its set features (r's initialization — presumably to the
// bias — and the store into result[s] are on elided lines; confirm).
326 void LinearClassifier::predict(const DataSet &ds, float *result) {
327 for(int s = 0; s < ds.nb_samples; s++) {
329 for(int nf = 0; nf < nb_features; nf++) if(fe_get_bit(s, ds.x_va[nf])) r += weights[nf];
334 void compute_error_rates(FeatureSelector *selector, Classifier *classifier,
335 const DataSet &testing_set, int &n00, int &n01, int &n10, int &n11, float *result) {
337 DataSet reduced_test_set(testing_set, *selector);
339 classifier->predict(reduced_test_set, result);
341 n00 = 0; n01 = 0; n10 = 0; n11 = 0;
342 for(int s = 0; s < testing_set.nb_samples; s++) {
343 if(fe_get_bit(s, testing_set.y_va)) {
344 if(result[s] >= 0) n11++; else n10++;
346 if(result[s] >= 0) n01++; else n00++;