From e83a6c8bc9daf4e36a82796cd7ca4ed7f6d686da Mon Sep 17 00:00:00 2001
From: Francois Fleuret <francois@fleuret.org>
Date: Wed, 27 Mar 2013 13:04:56 +0100
Subject: [PATCH] Added distance_to_centroid.

---
 clusterer.cc | 82 +++++++++++++++++++++++++---------------------------
 clusterer.h  |  2 ++
 2 files changed, 41 insertions(+), 43 deletions(-)

diff --git a/clusterer.cc b/clusterer.cc
index 9c5e7cb..3c33f3c 100644
--- a/clusterer.cc
+++ b/clusterer.cc
@@ -34,6 +34,31 @@ Clusterer::~Clusterer() {
   deallocate_array<scalar_t>(_cluster_var);
 }
 
+scalar_t Clusterer::distance_to_centroid(scalar_t *x, int k) { // cost of associating point x with cluster k
+  scalar_t dist = 0; // accumulated over the _dim coordinates of x
+  for(int d = 0; d < _dim; d++) {
+    dist += sq(_cluster_means[k][d] - x[d]) / (2 * _cluster_var[k][d]); // deviation term (sq presumably squares) over twice the variance
+    dist += 0.5 * log(_cluster_var[k][d]); // half log-variance term; negative when the variance is below 1
+    ASSERT(!isnan(dist) && !isinf(dist)); // partial sum is checked for NaN/infinity after every dimension
+  }
+  return dist; // sum of both terms over all dimensions; not guaranteed to be non-negative
+}
+
+void Clusterer::initialize_clusters(int nb_points, scalar_t **points) { // seeds every cluster from a distinct, randomly drawn point
+  int *used = new int[nb_points]; // used[l] is non-zero once point l has seeded a cluster
+  for(int k = 0; k < nb_points; k++) { used[k] = 0; }
+  for(int k = 0; k < _nb_clusters; k++) { // NOTE(review): needs _nb_clusters <= nb_points, else the redraw below never ends
+    int l;
+    do { l = int(drand48() * nb_points); } while(used[l]); // redraw until an unused point index comes up
+    for(int d = 0; d < _dim; d++) {
+      _cluster_means[k][d] = points[l][d]; // the centroid starts exactly on the chosen point
+      _cluster_var[k][d] = 1.0; // every dimension starts with variance 1
+    }
+    used[l] = 1; // keep this point from seeding a second cluster
+  }
+  delete[] used;
+}
+
 scalar_t Clusterer::baseline_cluster_association(int nb_points, scalar_t **points,
                                                  int nb_classes, int *labels,
                                                  scalar_t **gamma)  {
@@ -43,14 +68,7 @@ scalar_t Clusterer::baseline_cluster_association(int nb_points, scalar_t **point
   for(int n = 0; n < nb_points; n++) {
     scalar_t lowest_dist = 0;
     for(int k = 0; k < _nb_clusters; k++) {
-      scalar_t dist = 0;
-
-      for(int d = 0; d < _dim; d++) {
-        dist += sq(_cluster_means[k][d] - points[n][d]) / (2 * _cluster_var[k][d]);
-        dist += 0.5 * log(_cluster_var[k][d]);
-        ASSERT(!isnan(dist) && !isinf(dist));
-      }
-
+      scalar_t dist = distance_to_centroid(points[n], k);
       if(k == 0 || dist <= lowest_dist) {
         lowest_dist = dist;
         associated_clusters[n] = k;
@@ -96,15 +114,7 @@ scalar_t Clusterer::baseline_lp_cluster_association(int nb_points, scalar_t **po
   for(int k = 1; k <= _nb_clusters; k++) {
     for(int n = 1; n <= nb_points; n++) {
       int i = n + nb_points * (k - 1);
-
-      scalar_t dist = 0;
-
-      for(int d = 0; d < _dim; d++) {
-        dist += sq(_cluster_means[k-1][d] - points[n-1][d]) / (2 * _cluster_var[k-1][d]);
-        dist += 0.5 * log(_cluster_var[k-1][d]);
-      }
-
-      glp_set_obj_coef(lp, i, dist);
+      glp_set_obj_coef(lp, i, distance_to_centroid(points[n-1], k-1));
       glp_set_col_bnds(lp, i, GLP_DB, 0.0, 1.0);
     }
   }
@@ -167,18 +177,18 @@ scalar_t Clusterer::uninformative_lp_cluster_association(int nb_points, scalar_t
   glp_set_prob_name(lp, "uninformative_lp_cluster_association");
   glp_set_obj_dir(lp, GLP_MIN);
 
-  // We have one constraint per points and one per cluster/class
+  // We have one constraint per point and one per cluster/class
 
   glp_add_rows(lp, nb_points + _nb_clusters * nb_classes);
 
   // (A) For each point, the constraint is that the sum of its
-  // association coefficients will be equal to 1.0
+  // association coefficients is equal to 1.0
 
   for(int n = 1; n <= nb_points; n++) {
     glp_set_row_bnds(lp, n, GLP_FX, 1.0, 1.0);
   }
 
-  // (B) For each cluster and each class, the sum of the association
+  // (B) For each cluster/class pair, the sum of the association
   // coefficient to this cluster for this class is equal to the number
   // of sample of that class, divided by the number of clusters
 
@@ -199,17 +209,18 @@ scalar_t Clusterer::uninformative_lp_cluster_association(int nb_points, scalar_t
     for(int n = 1; n <= nb_points; n++) {
       int r = n + nb_points * (k - 1);
 
-      scalar_t dist = 0;
+      // scalar_t dist = 0;
 
-      for(int d = 0; d < _dim; d++) {
-        dist += sq(_cluster_means[k-1][d] - points[n-1][d]) / (2 * _cluster_var[k-1][d]);
-        dist += 0.5 * log(_cluster_var[k-1][d]);
-      }
+      // for(int d = 0; d < _dim; d++) {
+        // dist += sq(_cluster_means[k-1][d] - points[n-1][d]) / (2 * _cluster_var[k-1][d]);
+        // dist += 0.5 * log(_cluster_var[k-1][d]);
+      // }
 
-      // The LP weight on this association coefficient is the distance
-      // (normalized) of that sample to the centroid of that cluster
+      // The LP weight on this association coefficient for the global
+      // loss is the normalized distance of that sample to the
+      // centroid of that cluster
 
-      glp_set_obj_coef(lp, r, dist);
+      glp_set_obj_coef(lp, r, distance_to_centroid(points[n-1], k-1));
 
       // And this association coefficient is in [0,1]
 
@@ -310,21 +321,6 @@ void Clusterer::update_clusters(int nb_points, scalar_t **points, scalar_t **gam
   }
 }
 
-void Clusterer::initialize_clusters(int nb_points, scalar_t **points) {
-  int *used = new int[nb_points];
-  for(int k = 0; k < nb_points; k++) { used[k] = 0; }
-  for(int k = 0; k < _nb_clusters; k++) {
-    int l;
-    do { l = int(drand48() * nb_points); } while(used[l]);
-    for(int d = 0; d < _dim; d++) {
-      _cluster_means[k][d] = points[l][d];
-      _cluster_var[k][d] = 1.0;
-    }
-    used[l] = 1;
-  }
-  delete[] used;
-}
-
 void Clusterer::train(int mode,
                       int nb_clusters, int dim,
                       int nb_points, scalar_t **points,
diff --git a/clusterer.h b/clusterer.h
index b45f8c3..065dd3f 100644
--- a/clusterer.h
+++ b/clusterer.h
@@ -39,6 +39,8 @@ public:
   int _dim;
   scalar_t **_cluster_means, **_cluster_var;
 
+  scalar_t distance_to_centroid(scalar_t *x, int k); // cost of point x w.r.t. cluster k, from _cluster_means[k] and _cluster_var[k]
+
   void initialize_clusters(int nb_points, scalar_t **points);
 
   // Does the standard hard k-mean association
-- 
2.20.1