scalar_t Clusterer::uninformative_lp_cluster_association(int nb_points, scalar_t **points,
int nb_classes, int *labels,
- scalar_t **gamma) {
+ scalar_t **gamma,
+ int absolute_proportion) {
// N points
// K clusters
// dist(n, k) distance of sample n to cluster k
// under
//
// (A) \forall n, k, \gamma(n, k) >= 0
- // (B) \forall n, \sum_k \gamma(n,k) = 1
- // (C) \forall k, \sum_n \gamma(n,k) = N/K
+ // (B) \forall n, \sum_k \gamma(n, k) = 1
+ // (C) \forall k, \sum_n \gamma(n, k) = N/K
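+ //
+ // The per-class rows built below depend on absolute_proportion. When
+ // it is non-zero, (C) is enforced class by class:
+ //
+ //   (C') \forall k, c, \sum_{n : labels(n) = c} \gamma(n, k) = N_c / K
+ //
+ // where N_c is the number of samples of class c. Otherwise the rows
+ // only force the class proportions inside each cluster to match the
+ // global ones:
+ //
+ //   (C'') \forall k, c, \sum_n \gamma(n, k) (1{labels(n) = c} - N_c / N) = 0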
glp_prob *lp;
// ** GLPK USES INDEXES STARTING AT 1, NOT 0. **
- int nb_coeffs = nb_points * _nb_clusters + nb_points * _nb_clusters;
+ int nb_coeffs;
+
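+ // One coefficient per (point, cluster) pair for the normalization
+ // rows (B), plus the coefficients of the per-class rows: with
+ // absolute proportions each point only enters the row of its own
+ // class, otherwise it enters every class row of every cluster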
+ if(absolute_proportion) {
+ nb_coeffs = nb_points * _nb_clusters + nb_points * _nb_clusters;
+ } else {
+ nb_coeffs = nb_points * _nb_clusters + nb_points * nb_classes * _nb_clusters;
+ }
int *coeff_row = new int[nb_coeffs + 1];
int *coeff_col = new int[nb_coeffs + 1];
// gammas for this cluster and this class is equal to the number of
// samples of that class, divided by the number of clusters
- for(int k = 1; k <= _nb_clusters; k++) {
- for(int c = 1; c <= nb_classes; c++) {
- int row = nb_points + (k - 1) * nb_classes + c;
- scalar_t tau = nb_samples_per_class[c-1] / scalar_t(_nb_clusters);
- glp_set_row_bnds(lp, row, GLP_FX, tau, tau);
- for(int n = 1; n <= nb_points; n++) {
- if(labels[n-1] == c - 1) {
+ if(absolute_proportion) {
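+ // Absolute proportions: each (cluster, class) row is fixed at
+ // N_c / K, and only the samples of class c enter it with weight 1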
+ for(int k = 1; k <= _nb_clusters; k++) {
+ for(int c = 1; c <= nb_classes; c++) {
+ int row = nb_points + (k - 1) * nb_classes + c;
+ scalar_t tau = nb_samples_per_class[c-1] / scalar_t(_nb_clusters);
+ glp_set_row_bnds(lp, row, GLP_FX, tau, tau);
+ for(int n = 1; n <= nb_points; n++) {
+ if(labels[n-1] == c - 1) {
+ coeff_row[n_coeff] = row;
+ coeff_col[n_coeff] = (k-1) * nb_points + n;
+ coeff_wgt[n_coeff] = 1.0;
+ n_coeff++;
+ }
+ }
+ }
+ }
+ } else {
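+ // Relative proportions: each (cluster, class) row is fixed at zero,
+ // with weights 1{labels(n) = c} - N_c / N, so the class mix inside
+ // every cluster matches the overall class mix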
+ for(int k = 1; k <= _nb_clusters; k++) {
+ for(int c = 1; c <= nb_classes; c++) {
+ int row = nb_points + (k - 1) * nb_classes + c;
+ glp_set_row_bnds(lp, row, GLP_FX, 0.0, 0.0);
+ for(int n = 1; n <= nb_points; n++) {
coeff_row[n_coeff] = row;
coeff_col[n_coeff] = (k-1) * nb_points + n;
- coeff_wgt[n_coeff] = 1.0;
+ coeff_wgt[n_coeff] =
+ (labels[n-1] == c - 1 ? 1.0 : 0.0)
+ - scalar_t(nb_samples_per_class[c-1]) / scalar_t(nb_points);
n_coeff++;
}
}
switch(mode) {
case STANDARD_ASSOCIATION:
- total_distance =
- baseline_cluster_association(nb_points, points, nb_classes, labels, gammas);
+ total_distance =
+ baseline_cluster_association(nb_points, points, nb_classes, labels, gammas);
break;
case STANDARD_LP_ASSOCIATION:
- total_distance =
- baseline_lp_cluster_association(nb_points, points, nb_classes, labels, gammas);
+ total_distance =
+ baseline_lp_cluster_association(nb_points, points, nb_classes, labels, gammas);
break;
case UNINFORMATIVE_LP_ASSOCIATION:
- total_distance =
- uninformative_lp_cluster_association(nb_points, points, nb_classes, labels, gammas);
+ total_distance =
+ uninformative_lp_cluster_association(nb_points, points, nb_classes, labels, gammas, 0);
+ break;
+
+ case UNINFORMATIVE_LP_ASSOCIATION_ABSOLUTE:
+ total_distance =
+ uninformative_lp_cluster_association(nb_points, points, nb_classes, labels, gammas, 1);
break;
default:
public:
enum {
+ // Standard k-means
STANDARD_ASSOCIATION,
+ // Same, implemented as an LP problem, as a sanity check
STANDARD_LP_ASSOCIATION,
- UNINFORMATIVE_LP_ASSOCIATION
+ // Criterion forcing all clusters to have the same distribution of
+ // classes
+ UNINFORMATIVE_LP_ASSOCIATION,
+ // Criterion forcing all clusters to have the same number of samples
+ // of each class
+ UNINFORMATIVE_LP_ASSOCIATION_ABSOLUTE
};
const static int max_nb_iterations = 10;
scalar_t uninformative_lp_cluster_association(int nb_points, scalar_t **points,
int nb_classes, int *labels,
- scalar_t **gamma);
+ scalar_t **gamma,
+ int absolute_proportion);
void update_clusters(int nb_points, scalar_t **points, scalar_t **gamma);