From 2455f83ba251602d5e04640067094f09f03aaa3d Mon Sep 17 00:00:00 2001 From: Francois Fleuret Date: Wed, 27 Mar 2013 12:21:52 +0100 Subject: [PATCH 1/1] Initial commit. --- Makefile | 52 ++++++ arrays.h | 122 ++++++++++++++ clueless-kmean.cc | 110 ++++++++++++ clusterer.cc | 421 ++++++++++++++++++++++++++++++++++++++++++++++ clusterer.h | 66 ++++++++ misc.cc | 93 ++++++++++ misc.h | 106 ++++++++++++ sample_set.cc | 45 +++++ sample_set.h | 40 +++++ test.sh | 54 ++++++ 10 files changed, 1109 insertions(+) create mode 100644 Makefile create mode 100644 arrays.h create mode 100644 clueless-kmean.cc create mode 100644 clusterer.cc create mode 100644 clusterer.h create mode 100644 misc.cc create mode 100644 misc.h create mode 100644 sample_set.cc create mode 100644 sample_set.h create mode 100755 test.sh diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..c8463c8 --- /dev/null +++ b/Makefile @@ -0,0 +1,52 @@ + +# clueless-kmean is a variant of k-mean which enforces balanced +# distribution of classes in every cluster +# +# Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ +# Written by Francois Fleuret +# +# This file is part of clueless-kmean. +# +# clueless-kmean is free software: you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# version 3 as published by the Free Software Foundation. +# +# clueless-kmean is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with selector. If not, see . + +ifeq ($(STATIC),yes) + LDFLAGS=-static -lm -ljpeg -lpng -lz -lglpk +else + LDFLAGS= -lm -ljpeg -lpng -lz -lglpk +endif + +ifeq ($(DEBUG),yes) + OPTIMIZE_FLAG = -ggdb3 -DDEBUG -fno-omit-frame-pointer +else + OPTIMIZE_FLAG = -ggdb3 -O3 +endif + +ifeq ($(PROFILE),yes) + PROFILE_FLAG = -pg +endif + +CXXFLAGS = -Wall $(OPTIMIZE_FLAG) $(PROFILE_FLAG) $(CXXGLPK) + +# LDFLAGS=-lglpk + +all: clueless-kmean + +clueless-kmean: \ + misc.o \ + sample_set.o \ + clusterer.o \ + clueless-kmean.o + $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) + +clean: + rm -f *.o clueless-kmean diff --git a/arrays.h b/arrays.h new file mode 100644 index 0000000..c422498 --- /dev/null +++ b/arrays.h @@ -0,0 +1,122 @@ +/* + * clueless-kmean is a variant of k-mean which enforces balanced + * distribution of classes in every cluster + * + * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ + * Written by Francois Fleuret + * + * This file is part of clueless-kmean. + * + * clueless-kmean is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * clueless-kmean is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with selector. If not, see . + * + */ + +#ifndef ARRAYS_H +#define ARRAYS_H + +template +T inline sqdist(int dim, T *x, T *y) { + T result = 0; + for(int k = 0; k < dim; k++) result += sq(x[k] - y[k]); + return result; +} + +template +T inline sum(int dim, T *x) { + T result = 0; + for(int k = 0; k < dim; k++) result += x[k]; + return result; +} + +template +T inline dotprod(int dim, T *x, T *y) { + T result = 0; + for(int k = 0; k < dim; k++) result += x[k] * y[k]; + return result; +} + +template +void fill_vector(int dim, T *vector, T v) { + for(int k = 0; k < dim; k++) { + vector[k] = v; + } +} + +template +void copy_vector(int dim, T *vector_dst, T *vector_src) { + for(int k = 0; k < dim; k++) { + vector_dst[k] = vector_src[k]; + } +} + +template +T **allocate_array(int a, int b) { + T *whole = new T[a * b]; + T **array = new T *[a]; + for(int k = 0; k < a; k++) { + array[k] = whole; + whole += b; + } + return array; +} + +template +void deallocate_array(T **array) { + if(array) { + delete[] array[0]; + delete[] array; + } +} + +template +void fill_array(int a, int b, T **array, T v) { + for(int k = 0; k < a * b; k++) { + array[0][k] = v; + } +} + +template +T ***allocate_volume(int a, int b, int c) { + T *whole = new T[a * b * c]; + T **column = new T *[a * b]; + T ***volume = new T **[a]; + + for(int k = 0; k < a; k++) { + volume[k] = column; + for(int l = 0; l < b; l++) { + column[l] = whole; + whole += c; + } + column += b; + } + + return volume; +} + +template +void deallocate_volume(T ***volume) { + if(volume) { + delete[] volume[0][0]; + delete[] volume[0]; + delete[] volume; + } +} + +template +void fill_volume(int a, int b, int c, T ***volume, T v) { + for(int k = 0; k < a * b * c; k++) { + volume[0][0][k] = v; + } +} + +#endif diff --git a/clueless-kmean.cc b/clueless-kmean.cc new file mode 100644 index 0000000..1eea96f --- /dev/null +++ b/clueless-kmean.cc @@ -0,0 +1,110 @@ +/* + * clueless-kmean is a variant of k-mean which enforces balanced + * distribution of classes in every cluster + * + * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ + * Written by Francois Fleuret + * + * This file is part of clueless-kmean. + * + * clueless-kmean is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * clueless-kmean is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with selector. If not, see . + * + */ + +#include +#include +#include +#include +#include +#include + +using namespace std; + +#include "misc.h" +#include "arrays.h" +#include "sample_set.h" +#include "clusterer.h" + +void generate_toy_problem(SampleSet *sample_set) { + int dim = 2; + int nb_points = 1000; + + sample_set->resize(dim, nb_points); + sample_set->nb_classes = 2; + + for(int n = 0; n < nb_points; n++) { + sample_set->labels[n] = int(drand48() * 2); + if(sample_set->labels[n] == 0) { + sample_set->points[n][0] = (2 * drand48() - 1) * 0.8; + sample_set->points[n][1] = - 0.6 + (2 * drand48() - 1) * 0.4; + } else { + sample_set->points[n][0] = (2 * drand48() - 1) * 0.4; + sample_set->points[n][1] = 0.6 + (2 * drand48() - 1) * 0.4; + } + } +} + +int main(int argc, char **argv) { + SampleSet sample_set; + Clusterer clusterer; + int nb_clusters = 3; + + generate_toy_problem(&sample_set); + + { + ofstream out("points.dat"); + for(int n = 0; n < sample_set.nb_points; n++) { + out << sample_set.labels[n]; + for(int d = 0; d < sample_set.dim; d++) { + out << " " << sample_set.points[n][d]; + } + out << endl; + } + } + + int *associated_clusters = new int[sample_set.nb_points]; + + glp_term_out(0); + + clusterer.train(nb_clusters, sample_set.dim, + sample_set.nb_points, sample_set.points, + sample_set.nb_classes, sample_set.labels, + associated_clusters); + + { + ofstream out("associated_clusters.dat"); + for(int n = 0; n < sample_set.nb_points; n++) { + out << associated_clusters[n]; + for(int d = 0; d < sample_set.dim; d++) { + out << " " << sample_set.points[n][d]; + } + out << endl; + } + } + + { + ofstream out("clusters.dat"); + for(int k = 0 ; k < clusterer._nb_clusters; k++) { + out << k; + for(int d = 0; d < sample_set.dim; d++) { + out << " " << clusterer._cluster_means[k][d]; + } + for(int d = 0; d < sample_set.dim; d++) { + out << " " << 2 * sqrt(clusterer._cluster_var[k][d]); + } + out << endl; + } + } + + delete[] associated_clusters; +} diff --git a/clusterer.cc b/clusterer.cc new file mode 100644 index 0000000..42af81b --- /dev/null +++ b/clusterer.cc @@ -0,0 +1,421 @@ +/* + * clueless-kmean is a variant of k-mean which enforces balanced + * distribution of classes in every cluster + * + * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ + * Written by Francois Fleuret + * + * This file is part of clueless-kmean. + * + * clueless-kmean is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * clueless-kmean is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with selector. If not, see . + * + */ + +#include "clusterer.h" +#include + +Clusterer::Clusterer() { + _cluster_means = 0; + _cluster_var = 0; +} + +Clusterer::~Clusterer() { + deallocate_array(_cluster_means); + deallocate_array(_cluster_var); +} + +scalar_t Clusterer::baseline_cluster_association(int nb_points, scalar_t **points, + int nb_classes, int *labels, + scalar_t **gamma) { + int *associated_clusters = new int[nb_points]; + scalar_t total_dist = 0; + + for(int n = 0; n < nb_points; n++) { + scalar_t lowest_dist = 0; + for(int k = 0; k < _nb_clusters; k++) { + scalar_t dist = 0; + + for(int d = 0; d < _dim; d++) { + dist += sq(_cluster_means[k][d] - points[n][d]) / (2 * _cluster_var[k][d]); + dist += 0.5 * log(_cluster_var[k][d]); + ASSERT(!isnan(dist) && !isinf(dist)); + } + + if(k == 0 || dist <= lowest_dist) { + lowest_dist = dist; + associated_clusters[n] = k; + } + } + + total_dist += lowest_dist; + } + + for(int n = 0; n < nb_points; n++) { + for(int k = 0; k < _nb_clusters; k++) { + gamma[n][k] = 0.0; + } + gamma[n][associated_clusters[n]] = 1.0; + } + + delete[] associated_clusters; + + return total_dist; +} + +scalar_t Clusterer::baseline_lp_cluster_association(int nb_points, scalar_t **points, + int nb_classes, int *labels, + scalar_t **gamma) { + glp_prob *lp; + + int *ia = new int[nb_points * _nb_clusters + 1]; + int *ja = new int[nb_points * _nb_clusters + 1]; + scalar_t *ar = new scalar_t[nb_points * _nb_clusters + 1]; + + lp = glp_create_prob(); + + glp_set_prob_name(lp, "baseline_lp_cluster_association"); + glp_set_obj_dir(lp, GLP_MIN); + + glp_add_rows(lp, nb_points); + + for(int n = 1; n <= nb_points; n++) { + glp_set_row_bnds(lp, n, GLP_FX, 1.0, 1.0); + } + + glp_add_cols(lp, nb_points * _nb_clusters); + for(int k = 1; k <= _nb_clusters; k++) { + for(int n = 1; n <= nb_points; n++) { + int i = n + nb_points * (k - 1); + + scalar_t dist = 0; + + for(int d = 0; d < _dim; d++) { + dist += sq(_cluster_means[k-1][d] - points[n-1][d]) / (2 * _cluster_var[k-1][d]); + dist += 0.5 * log(_cluster_var[k-1][d]); + } + + glp_set_obj_coef(lp, i, dist); + glp_set_col_bnds(lp, i, GLP_DB, 0.0, 1.0); + } + } + + int l = 1; + + for(int n = 1; n <= nb_points; n++) { + for(int k = 1; k <= _nb_clusters; k++) { + ia[l] = n; + ja[l] = n + nb_points * (k - 1); + ar[l] = 1.0; + l++; + } + } + + glp_load_matrix(lp, nb_points * _nb_clusters, ia, ja, ar); + + glp_simplex(lp, NULL); + + scalar_t total_dist = glp_get_obj_val(lp); + + for(int k = 1; k <= _nb_clusters; k++) { + for(int n = 1; n <= nb_points; n++) { + int i = n + nb_points * (k - 1); + gamma[n-1][k-1] = glp_get_col_prim(lp, i); + } + } + + delete[] ia; + delete[] ja; + delete[] ar; + + glp_delete_prob(lp); + + return total_dist; +} + +scalar_t Clusterer::uninformative_lp_cluster_association(int nb_points, scalar_t **points, + int nb_classes, int *labels, + scalar_t **gamma) { + glp_prob *lp; + + int nb_coeffs = nb_points * _nb_clusters + nb_points * _nb_clusters; + + int *ia = new int[nb_coeffs + 1]; + int *ja = new int[nb_coeffs + 1]; + scalar_t *ar = new scalar_t[nb_coeffs + 1]; + + scalar_t *nb_samples_per_class = new scalar_t[nb_classes]; + for(int c = 0; c < nb_classes; c++) { + nb_samples_per_class[c] = 0.0; + } + + for(int n = 0; n < nb_points; n++) { + nb_samples_per_class[labels[n]] += 1.0; + } + + lp = glp_create_prob(); + + glp_set_prob_name(lp, "uninformative_lp_cluster_association"); + glp_set_obj_dir(lp, GLP_MIN); + + glp_add_rows(lp, nb_points + _nb_clusters * nb_classes); + + for(int n = 1; n <= nb_points; n++) { + glp_set_row_bnds(lp, n, GLP_FX, 1.0, 1.0); + } + + for(int k = 1; k <= _nb_clusters; k++) { + for(int c = 1; c <= nb_classes; c++) { + int row = nb_points + (k - 1) * nb_classes + c; + scalar_t tau = nb_samples_per_class[c-1] / scalar_t(_nb_clusters); + // cout << "tau " << k << " " << c << " " << tau << endl; + glp_set_row_bnds(lp, row, GLP_FX, tau, tau); + } + } + + glp_add_cols(lp, nb_points * _nb_clusters); + + for(int k = 1; k <= _nb_clusters; k++) { + for(int n = 1; n <= nb_points; n++) { + int r = n + nb_points * (k - 1); + + scalar_t dist = 0; + + for(int d = 0; d < _dim; d++) { + dist += sq(_cluster_means[k-1][d] - points[n-1][d]) / (2 * _cluster_var[k-1][d]); + dist += 0.5 * log(_cluster_var[k-1][d]); + } + + glp_set_obj_coef(lp, r, dist); + glp_set_col_bnds(lp, r, GLP_DB, 0.0, 1.0); + } + } + + int l = 1; + + for(int n = 1; n <= nb_points; n++) { + for(int k = 1; k <= _nb_clusters; k++) { + int row = n; + ia[l] = row; + ja[l] = nb_points * (k - 1) + n; + ar[l] = 1.0; + l++; + } + } + + for(int k = 1; k <= _nb_clusters; k++) { + for(int c = 1; c <= nb_classes; c++) { + int row = nb_points + (k - 1) * nb_classes + c; + for(int n = 1; n <= nb_points; n++) { + if(labels[n-1] == c - 1) { + ia[l] = row; + ja[l] = (k-1) * nb_points + n; + ar[l] = 1.0; + l++; + } + } + } + } + + ASSERT(l == nb_coeffs + 1); + + glp_load_matrix(lp, nb_coeffs, ia, ja, ar); + + glp_simplex(lp, NULL); + + scalar_t total_dist = glp_get_obj_val(lp); + + for(int k = 1; k <= _nb_clusters; k++) { + for(int n = 1; n <= nb_points; n++) { + int i = n + nb_points * (k - 1); + gamma[n-1][k-1] = glp_get_col_prim(lp, i); + } + } + + // { // ******************************* START *************************** +// #warning Test code added on 2013 Feb 07 20:32:05 + // // for(int n = 0; n < nb_points; n++) { + // // scalar_t sum = 0; + // // for(int k = 0; k < _nb_clusters; k++) { + // // ASSERT(gamma[n][k] >= 0 && gamma[n][k] <= 1); + // // sum += gamma[n][k]; + // // } + // // cout << sum << endl; + // // } + + // scalar_t *sum_gamma = new scalar_t[nb_classes]; + + // for(int k = 0; k < _nb_clusters; k++) { + // for(int c = 0; c < nb_classes; c++) { sum_gamma[c] = 0.0; } + // for(int n = 0; n < nb_points; n++) { + // sum_gamma[labels[n]] += gamma[n][k]; + // } + // cout << "CLUSTER" << k; + // for(int c = 0; c < nb_classes; c++) { + // cout << " " << sum_gamma[c]; + // } + // cout << endl; + // } + + // delete sum_gamma; + + // } // ******************************** END **************************** + + delete[] nb_samples_per_class; + delete[] ia; + delete[] ja; + delete[] ar; + glp_delete_prob(lp); + + return total_dist; +} + +void Clusterer::baseline_update_clusters(int nb_points, scalar_t **points, scalar_t **gamma) { + for(int k = 0; k < _nb_clusters; k++) { + + for(int d = 0; d < _dim; d++) { + _cluster_means[k][d] = 0.0; + _cluster_var[k][d] = 0.0; + } + + scalar_t sum_gamma = 0; + for(int n = 0; n < nb_points; n++) { + sum_gamma += gamma[n][k]; + for(int d = 0; d < _dim; d++) { + _cluster_means[k][d] += gamma[n][k] * points[n][d]; + _cluster_var[k][d] += gamma[n][k] * sq(points[n][d]); + } + } + + ASSERT(sum_gamma >= 1); + + for(int d = 0; d < _dim; d++) { + if(sum_gamma >= 2) { + _cluster_var[k][d] = (_cluster_var[k][d] - sq(_cluster_means[k][d]) / sum_gamma) / (sum_gamma - 1); + } else { + _cluster_var[k][d] = 1; + } + _cluster_var[k][d] = max(0.01, _cluster_var[k][d]); + _cluster_means[k][d] /= sum_gamma; + } + } +} + +void Clusterer::initialize_clusters(int nb_points, scalar_t **points) { + int *used = new int[nb_points]; + for(int k = 0; k < nb_points; k++) { used[k] = 0; } + for(int k = 0; k < _nb_clusters; k++) { + int l; + do { l = int(drand48() * nb_points); } while(used[l]); + for(int d = 0; d < _dim; d++) { + _cluster_means[k][d] = points[l][d]; + _cluster_var[k][d] = 1.0; + } + used[l] = 1; + } + delete[] used; +} + +void Clusterer::train(int nb_clusters, int dim, + int nb_points, scalar_t **points, + int nb_classes, int *labels, + int *cluster_associations) { + deallocate_array(_cluster_means); + deallocate_array(_cluster_var); + _nb_clusters = nb_clusters; + _dim = dim; + _cluster_means = allocate_array(_nb_clusters, _dim); + _cluster_var = allocate_array(_nb_clusters, _dim); + + scalar_t **gammas = allocate_array(nb_points, _nb_clusters); + + if(nb_clusters > nb_points) abort(); + + initialize_clusters(nb_points, points); + + scalar_t pred_total_distance, total_distance = FLT_MAX; + int nb_rounds = 0; + + do { + pred_total_distance = total_distance; + total_distance = + // baseline_cluster_association(nb_points, points, nb_classes, labels, gammas); + // baseline_lp_cluster_association(nb_points, points, nb_classes, labels, gammas); + uninformative_lp_cluster_association(nb_points, points, nb_classes, labels, gammas); + cout << "TRAIN " << nb_rounds << " " << total_distance << endl; + baseline_update_clusters(nb_points, points, gammas); + nb_rounds++; + } while(total_distance < min_iteration_improvement * pred_total_distance && + nb_rounds < max_nb_iterations); + + { + cout << "TOTAL_NB_SAMPLES"; + for(int c = 0; c < nb_classes; c++) { + int nb_samples = 0; + for(int n = 0; n < nb_points; n++) { + if(labels[n] == c) { + nb_samples++; + } + } + cout << " " << nb_samples; + } + cout << endl; + + for(int k = 0; k < _nb_clusters; k++) { + cout << "CLUSTER_GAMMA_SUM " << k << " :"; + for(int c = 0; c < nb_classes; c++) { + scalar_t sum = 0.0; + for(int n = 0; n < nb_points; n++) { + if(labels[n] == c) { + sum += gammas[n][k]; + } + } + cout << " " << sum; + } + cout << endl; + } + } + + for(int n = 0; n < nb_points; n++) { + for(int k = 0; k < _nb_clusters; k++) { + if(k == 0 || gammas[n][k] > gammas[n][cluster_associations[n]]) { + cluster_associations[n] = k; + } + } + } + + deallocate_array(gammas); +} + +int Clusterer::cluster(scalar_t *point) { + scalar_t lowest_dist = 0; + int associated_cluster = -1; + + for(int k = 0; k < _nb_clusters; k++) { + scalar_t dist = 0; + + for(int d = 0; d < _dim; d++) { + dist += sq(_cluster_means[k][d] - point[d]) / (2 * _cluster_var[k][d]); + dist += 0.5 * log(_cluster_var[k][d]); + ASSERT(!isnan(dist) && !isinf(dist)); + } + + if(k == 0 || dist <= lowest_dist) { + lowest_dist = dist; + associated_cluster = k; + } + } + + ASSERT(associated_cluster >= 0); + + return associated_cluster; +} diff --git a/clusterer.h b/clusterer.h new file mode 100644 index 0000000..88c168a --- /dev/null +++ b/clusterer.h @@ -0,0 +1,66 @@ +/* + * clueless-kmean is a variant of k-mean which enforces balanced + * distribution of classes in every cluster + * + * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ + * Written by Francois Fleuret + * + * This file is part of clueless-kmean. + * + * clueless-kmean is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * clueless-kmean is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with selector. If not, see . + * + */ + +#ifndef CLUSTERER_H +#define CLUSTERER_H + +#include "misc.h" +#include "arrays.h" + +class Clusterer { +public: + const static int max_nb_iterations = 10; + const static scalar_t min_iteration_improvement = 0.999; + + int _nb_clusters; + int _dim; + scalar_t **_cluster_means, **_cluster_var; + + void initialize_clusters(int nb_points, scalar_t **points); + + scalar_t baseline_cluster_association(int nb_points, scalar_t **points, + int nb_classes, int *labels, + scalar_t **gamma); + + scalar_t baseline_lp_cluster_association(int nb_points, scalar_t **points, + int nb_classes, int *labels, + scalar_t **gamma); + + scalar_t uninformative_lp_cluster_association(int nb_points, scalar_t **points, + int nb_classes, int *labels, + scalar_t **gamma); + + void baseline_update_clusters(int nb_points, scalar_t **points, scalar_t **gamma); + +public: + Clusterer(); + ~Clusterer(); + void train(int nb_clusters, int dim, + int nb_points, scalar_t **points, + int nb_classes, int *labels, + int *cluster_associations); + + int cluster(scalar_t *point); +}; + +#endif diff --git a/misc.cc b/misc.cc new file mode 100644 index 0000000..e52fbc5 --- /dev/null +++ b/misc.cc @@ -0,0 +1,93 @@ +/* + * clueless-kmean is a variant of k-mean which enforces balanced + * distribution of classes in every cluster + * + * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ + * Written by Francois Fleuret + * + * This file is part of clueless-kmean. + * + * clueless-kmean is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * clueless-kmean is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with selector. If not, see . + * + */ + +#include + +using namespace std; + +#include "misc.h" + +char *next_word(char *buffer, char *r, int buffer_size) { + char *s; + s = buffer; + + if(r != 0) { + while((*r == ' ') || (*r == '\t') || (*r == ',')) r++; + if(*r == '"') { + r++; + while((*r != '"') && (*r != '\0') && + (s 0) { + s += n[k] * log(scalar_t(n[k])); + t += n[k]; + } + return (log(t) - s/scalar_t(t))/log(2.0); +} + +void random_permutation(int *val, int nb) { + for(int k = 0; k < nb; k++) val[k] = k; + int i, t; + for(int k = 0; k < nb - 1; k++) { + i = int(drand48() * (nb - k)) + k; + t = val[i]; + val[i] = val[k]; + val[k] = t; + } +} + +void tag_subset(bool *val, int nb_total, int nb_to_tag) { + ASSERT(nb_to_tag <= nb_total); + int index[nb_total]; + random_permutation(index, nb_total); + for(int n = 0; n < nb_total; n++) val[n] = false; + for(int n = 0; n < nb_to_tag; n++) val[index[n]] = true; +} + +int compare_couple(const void *a, const void *b) { + if(((Couple *) a)->value < ((Couple *) b)->value) return -1; + else if(((Couple *) a)->value > ((Couple *) b)->value) return 1; + else return 0; +} diff --git a/misc.h b/misc.h new file mode 100644 index 0000000..f80966e --- /dev/null +++ b/misc.h @@ -0,0 +1,106 @@ +/* + * clueless-kmean is a variant of k-mean which enforces balanced + * distribution of classes in every cluster + * + * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ + * Written by Francois Fleuret + * + * This file is part of clueless-kmean. + * + * clueless-kmean is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * clueless-kmean is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with selector. If not, see . + * + */ + +#ifndef MISC_H +#define MISC_H + +#include +#include +#include +#include +#include +#include + +using namespace std; + +typedef double scalar_t; +// typedef float scalar_t; + +const int buffer_size = 1024; + +using namespace std; + +#ifdef DEBUG +#define ASSERT(x) if(!(x)) { \ + std::cerr << "ASSERT FAILED IN " << __FILE__ << ":" << __LINE__ << endl; \ + abort(); \ +} +#else +#define ASSERT(x) +#endif + +template +T smooth_min(T x, T y) { + T z = exp(x - y); + return 0.5 * (x + y - (x - y)/(1 + 1/z) - (y - x)/(1 + z)); +} + +template +void write_var(ostream *os, const T *x) { os->write((char *) x, sizeof(T)); } + +template +void read_var(istream *is, T *x) { is->read((char *) x, sizeof(T)); } + +template +void grow(int *nb_max, int nb, T** current, int factor) { + ASSERT(*nb_max > 0); + if(nb == *nb_max) { + T *tmp = new T[*nb_max * factor]; + memcpy(tmp, *current, *nb_max * sizeof(T)); + delete[] *current; + *current = tmp; + *nb_max *= factor; + } +} + +template +inline T sq(T x) { + return x * x; +} + +inline scalar_t log2(scalar_t x) { + return log(x)/log(2.0); +} + +inline scalar_t xi(scalar_t x) { + if(x <= 0.0) return 0.0; + else return - x * log(x)/log(2.0); +} + +scalar_t discrete_entropy(int *n, int nb); + +char *basename(char *name); + +char *next_word(char *buffer, char *r, int buffer_size); + +void random_permutation(int *val, int nb); +void tag_subset(bool *val, int nb_total, int nb_to_tag); + +struct Couple { + int index; + scalar_t value; +}; + +int compare_couple(const void *a, const void *b); + +#endif diff --git a/sample_set.cc b/sample_set.cc new file mode 100644 index 0000000..7b94f92 --- /dev/null +++ b/sample_set.cc @@ -0,0 +1,45 @@ +/* + * clueless-kmean is a variant of k-mean which enforces balanced + * distribution of classes in every cluster + * + * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ + * Written by Francois Fleuret + * + * This file is part of clueless-kmean. + * + * clueless-kmean is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * clueless-kmean is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with selector. If not, see . + * + */ + +#include "sample_set.h" +#include "arrays.h" + +SampleSet::SampleSet() { + labels = 0; + points = 0; +} + +SampleSet::~SampleSet() { + delete[] labels; + deallocate_array(points); +} + +void SampleSet::resize(int d, int np) { + delete[] labels; + deallocate_array(points); + nb_points = np; + dim = d; + points = allocate_array(nb_points, dim); + labels = new int[nb_points]; +} + diff --git a/sample_set.h b/sample_set.h new file mode 100644 index 0000000..e75738c --- /dev/null +++ b/sample_set.h @@ -0,0 +1,40 @@ +/* + * clueless-kmean is a variant of k-mean which enforces balanced + * distribution of classes in every cluster + * + * Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ + * Written by Francois Fleuret + * + * This file is part of clueless-kmean. + * + * clueless-kmean is free software: you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 3 as published by the Free Software Foundation. + * + * clueless-kmean is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with selector. If not, see . + * + */ + +#ifndef SAMPLE_SET_H +#define SAMPLE_SET_H + +#include "misc.h" + +class SampleSet { +public: + int nb_points, dim; + int nb_classes, *labels; + scalar_t **points; + + SampleSet(); + ~SampleSet(); + void resize(int d, int np); +}; + +#endif diff --git a/test.sh b/test.sh new file mode 100755 index 0000000..b486b03 --- /dev/null +++ b/test.sh @@ -0,0 +1,54 @@ +#!/bin/bash + + +# clueless-kmean is a variant of k-mean which enforces balanced +# distribution of classes in every cluster +# +# Copyright (c) 2013 Idiap Research Institute, http://www.idiap.ch/ +# Written by Francois Fleuret +# +# This file is part of clueless-kmean. +# +# clueless-kmean is free software: you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# version 3 as published by the Free Software Foundation. +# +# clueless-kmean is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with selector. If not, see . + +set -e + +make -j -k + +./clueless-kmean + +CLUSTER1=($(grep ^0 clusters.dat)) +CLUSTER2=($(grep ^1 clusters.dat)) +CLUSTER3=($(grep ^2 clusters.dat)) + +gnuplot <