From: Francois Fleuret
Date: Wed, 27 Mar 2013 12:04:56 +0000 (+0100)
Subject: Added distance_to_centroid.
X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=commitdiff_plain;h=e83a6c8bc9daf4e36a82796cd7ca4ed7f6d686da;p=clueless-kmeans.git

Added distance_to_centroid.
---

diff --git a/clusterer.cc b/clusterer.cc
index 9c5e7cb..3c33f3c 100644
--- a/clusterer.cc
+++ b/clusterer.cc
@@ -34,6 +34,31 @@ Clusterer::~Clusterer() {
   deallocate_array(_cluster_var);
 }
 
+scalar_t Clusterer::distance_to_centroid(scalar_t *x, int k) {
+  scalar_t dist = 0;
+  for(int d = 0; d < _dim; d++) {
+    dist += sq(_cluster_means[k][d] - x[d]) / (2 * _cluster_var[k][d]);
+    dist += 0.5 * log(_cluster_var[k][d]);
+    ASSERT(!isnan(dist) && !isinf(dist));
+  }
+  return dist;
+}
+
+void Clusterer::initialize_clusters(int nb_points, scalar_t **points) {
+  int *used = new int[nb_points];
+  for(int k = 0; k < nb_points; k++) { used[k] = 0; }
+  for(int k = 0; k < _nb_clusters; k++) {
+    int l;
+    do { l = int(drand48() * nb_points); } while(used[l]);
+    for(int d = 0; d < _dim; d++) {
+      _cluster_means[k][d] = points[l][d];
+      _cluster_var[k][d] = 1.0;
+    }
+    used[l] = 1;
+  }
+  delete[] used;
+}
+
 scalar_t Clusterer::baseline_cluster_association(int nb_points, scalar_t **points,
                                                  int nb_classes, int *labels,
                                                  scalar_t **gamma) {
@@ -43,14 +68,7 @@ scalar_t Clusterer::baseline_cluster_association(int nb_points, scalar_t **point
   for(int n = 0; n < nb_points; n++) {
     scalar_t lowest_dist = 0;
     for(int k = 0; k < _nb_clusters; k++) {
-      scalar_t dist = 0;
-
-      for(int d = 0; d < _dim; d++) {
-        dist += sq(_cluster_means[k][d] - points[n][d]) / (2 * _cluster_var[k][d]);
-        dist += 0.5 * log(_cluster_var[k][d]);
-        ASSERT(!isnan(dist) && !isinf(dist));
-      }
-
+      scalar_t dist = distance_to_centroid(points[n], k);
       if(k == 0 || dist <= lowest_dist) {
         lowest_dist = dist;
         associated_clusters[n] = k;
@@ -96,15 +114,7 @@ scalar_t Clusterer::baseline_lp_cluster_association(int nb_points, scalar_t **po
   for(int k = 1; k <= _nb_clusters; k++) {
     for(int n = 1; n <= nb_points; n++) {
       int i = n + nb_points * (k - 1);
-
-      scalar_t dist = 0;
-
-      for(int d = 0; d < _dim; d++) {
-        dist += sq(_cluster_means[k-1][d] - points[n-1][d]) / (2 * _cluster_var[k-1][d]);
-        dist += 0.5 * log(_cluster_var[k-1][d]);
-      }
-
-      glp_set_obj_coef(lp, i, dist);
+      glp_set_obj_coef(lp, i, distance_to_centroid(points[n-1], k-1));
       glp_set_col_bnds(lp, i, GLP_DB, 0.0, 1.0);
     }
   }
@@ -167,18 +177,18 @@ scalar_t Clusterer::uninformative_lp_cluster_association(int nb_points, scalar_t
   glp_set_prob_name(lp, "uninformative_lp_cluster_association");
   glp_set_obj_dir(lp, GLP_MIN);
 
-  // We have one constraint per points and one per cluster/class
+  // We have one constraint per point and one per cluster/class
 
   glp_add_rows(lp, nb_points + _nb_clusters * nb_classes);
 
   // (A) For each point, the constraint is that the sum of its
-  // association coefficients will be equal to 1.0
+  // association coefficients is equal to 1.0
 
   for(int n = 1; n <= nb_points; n++) {
     glp_set_row_bnds(lp, n, GLP_FX, 1.0, 1.0);
   }
 
-  // (B) For each cluster and each class, the sum of the association
+  // (B) For each pair cluster/class, the sum of the association
   // coefficient to this cluster for this class is equal to the number
   // of sample of that class, divided by the number of clusters
 
@@ -199,17 +209,18 @@ scalar_t Clusterer::uninformative_lp_cluster_association(int nb_points, scalar_t
     for(int n = 1; n <= nb_points; n++) {
       int r = n + nb_points * (k - 1);
 
-      scalar_t dist = 0;
+      // scalar_t dist = 0;
 
-      for(int d = 0; d < _dim; d++) {
-        dist += sq(_cluster_means[k-1][d] - points[n-1][d]) / (2 * _cluster_var[k-1][d]);
-        dist += 0.5 * log(_cluster_var[k-1][d]);
-      }
+      // for(int d = 0; d < _dim; d++) {
+      //   dist += sq(_cluster_means[k-1][d] - points[n-1][d]) / (2 * _cluster_var[k-1][d]);
+      //   dist += 0.5 * log(_cluster_var[k-1][d]);
+      // }
 
-      // The LP weight on this association coefficient is the distance
-      // (normalized) of that sample to the centroid of that cluster
+      // The LP weight on this association coefficient for the global
+      // loss is the normalized distance of that sample to the
+      // centroid of that cluster
 
-      glp_set_obj_coef(lp, r, dist);
+      glp_set_obj_coef(lp, r, distance_to_centroid(points[n-1], k-1));
 
       // And this association coefficient is in [0,1]
 
@@ -310,21 +321,6 @@ void Clusterer::update_clusters(int nb_points, scalar_t **points, scalar_t **gam
   }
 }
 
-void Clusterer::initialize_clusters(int nb_points, scalar_t **points) {
-  int *used = new int[nb_points];
-  for(int k = 0; k < nb_points; k++) { used[k] = 0; }
-  for(int k = 0; k < _nb_clusters; k++) {
-    int l;
-    do { l = int(drand48() * nb_points); } while(used[l]);
-    for(int d = 0; d < _dim; d++) {
-      _cluster_means[k][d] = points[l][d];
-      _cluster_var[k][d] = 1.0;
-    }
-    used[l] = 1;
-  }
-  delete[] used;
-}
-
 void Clusterer::train(int mode,
                       int nb_clusters, int dim,
                       int nb_points, scalar_t **points,
diff --git a/clusterer.h b/clusterer.h
index b45f8c3..065dd3f 100644
--- a/clusterer.h
+++ b/clusterer.h
@@ -39,6 +39,8 @@ public:
   int _dim;
   scalar_t **_cluster_means, **_cluster_var;
 
+  scalar_t distance_to_centroid(scalar_t *x, int k);
+
   void initialize_clusters(int nb_points, scalar_t **points);
 
   // Does the standard hard k-mean association