X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?p=clueless-kmeans.git;a=blobdiff_plain;f=clusterer.cc;fp=clusterer.cc;h=04a9af4eb35f243effc4192897ce5e9a5373eedb;hp=3c33f3cf0fe33c798e678c3384f1ffa6407393e7;hb=04d2b44ba34a811e1fab0b90d38ebd06cd918c52;hpb=ca6d045155d4c948063f49b6de8c35c0e3246e7a diff --git a/clusterer.cc b/clusterer.cc index 3c33f3c..04a9af4 100644 --- a/clusterer.cc +++ b/clusterer.cc @@ -35,6 +35,8 @@ Clusterer::~Clusterer() { } scalar_t Clusterer::distance_to_centroid(scalar_t *x, int k) { + // We take the variance into account + the normalization term. This + // is between k-mean and EM with a diagonal covariance scalar_t dist = 0; for(int d = 0; d < _dim; d++) { dist += sq(_cluster_means[k][d] - x[d]) / (2 * _cluster_var[k][d]); @@ -185,7 +187,8 @@ scalar_t Clusterer::uninformative_lp_cluster_association(int nb_points, scalar_t // association coefficients is equal to 1.0 for(int n = 1; n <= nb_points; n++) { - glp_set_row_bnds(lp, n, GLP_FX, 1.0, 1.0); + int row = n; + glp_set_row_bnds(lp, row, GLP_FX, 1.0, 1.0); } // (B) For each pair cluster/class, the sum of the association @@ -207,24 +210,17 @@ scalar_t Clusterer::uninformative_lp_cluster_association(int nb_points, scalar_t for(int k = 1; k <= _nb_clusters; k++) { for(int n = 1; n <= nb_points; n++) { - int r = n + nb_points * (k - 1); - - // scalar_t dist = 0; - - // for(int d = 0; d < _dim; d++) { - // dist += sq(_cluster_means[k-1][d] - points[n-1][d]) / (2 * _cluster_var[k-1][d]); - // dist += 0.5 * log(_cluster_var[k-1][d]); - // } + int col = n + nb_points * (k - 1); // The LP weight on this association coefficient for the global // loss is the normalized distance of that sample to the // centroid of that cluster - glp_set_obj_coef(lp, r, distance_to_centroid(points[n-1], k-1)); + glp_set_obj_coef(lp, col, distance_to_centroid(points[n-1], k-1)); - // And this association coefficient is in [0,1] + // And all the association coefficient is in [0,1] - glp_set_col_bnds(lp, r, GLP_DB, 0.0, 1.0); + glp_set_col_bnds(lp, col, GLP_DB, 0.0, 1.0); } } @@ -311,11 +307,13 @@ void Clusterer::update_clusters(int nb_points, scalar_t **points, scalar_t **gam for(int d = 0; d < _dim; d++) { if(sum_gamma >= 2) { - _cluster_var[k][d] = (_cluster_var[k][d] - sq(_cluster_means[k][d]) / sum_gamma) / (sum_gamma - 1); + _cluster_var[k][d] = + (_cluster_var[k][d] - sq(_cluster_means[k][d]) / sum_gamma) / (sum_gamma - 1); + _cluster_var[k][d] = max(scalar_t(min_cluster_variance), _cluster_var[k][d]); } else { _cluster_var[k][d] = 1; } - _cluster_var[k][d] = max(0.01, _cluster_var[k][d]); + _cluster_means[k][d] /= sum_gamma; } }