--- /dev/null
+
+/*
+ * data-tool is a command line tool to do simple statistical
+ * processing on numerical data.
+ *
+ * Copyright (c) 2009 Francois Fleuret
+ * Written by Francois Fleuret <francois@fleuret.org>
+ *
+ * This file is part of data-tool.
+ *
+ * data-tool is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 3 as
+ * published by the Free Software Foundation.
+ *
+ * data-tool is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with data-tool. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <iostream>
+#include <cmath>
+#include <stdlib.h>
+#include <string.h>
+
+using namespace std;
+
+struct Couple {
+ int index;
+ double value;
+};
+
+int compare_couple(const void *a, const void *b) {
+ if(((Couple *) a)->value < ((Couple *) b)->value) return -1;
+ else if(((Couple *) a)->value > ((Couple *) b)->value) return 1;
+ else return 0;
+}
+
+double *inflate_array(double *x, int current_size, int new_size) {
+ double *xx = new double[new_size];
+ for(int n = 0; n < current_size; n++) xx[n] = x[n];
+ delete[] x;
+ return xx;
+}
+
+char *next_word(char *buffer, char *r, int buffer_size) {
+ char *s;
+ s = buffer;
+ if(r != NULL)
+ {
+ if(*r == '"') {
+ r++;
+ while((*r != '"') && (*r != '\0') &&
+ (s<buffer+buffer_size-1))
+ *s++ = *r++;
+ if(*r == '"') r++;
+ } else {
+ while((*r != '\r') && (*r != '\n') && (*r != '\0') &&
+ (*r != '\t') && (*r != ' ') && (*r != ',') &&
+ (s<buffer+buffer_size-1))
+ *s++ = *r++;
+ }
+
+ while((*r == ' ') || (*r == '\t') || (*r == ',')) r++;
+ if((*r == '\0') || (*r=='\r') || (*r=='\n')) r = NULL;
+ }
+ *s = '\0';
+ return r;
+}
+
+void check_opt(int argc, char **argv, int n_opt, int n, const char *help) {
+ if(n_opt + n >= argc) {
+ cerr << "ERROR: Missing argument for " << argv[n_opt] << ". Expecting " << help << "." << endl;
+ exit(1);
+ }
+}
+
+void print_help_and_exit(int e) {
+ cout << "Simple data processing tool. Written by Francois Fleuret." << endl
+ << endl
+ << "This application takes data from the standard input and prints" << endl
+ << "the result on the standard output. It expects either a list of" << endl
+ << "float values (to produce histograms, cumulative distribution functions" << endl
+ << "or the mean, variance, etc.) or a list of couples of values of the form" << endl
+ << "x y on each line (where the sign of x tells the class and y the parameter" << endl
+ << "value) to compute the ROC curve or the ROC curve surface.\n" << endl
+ << "The options are:" << endl
+ << " --help" << endl
+ << " --roc" << endl
+ << " --roc-surface" << endl
+ << " --normalize" << endl
+ << " --histo" << endl
+ << " --cumul" << endl
+ << " --misc" << endl
+ << " --auto-extrema" << endl
+ << " --xbounds <float: xmin> <float: xmax>" << endl
+ << " --ybounds <float: ymin> <float: ymax>" << endl
+ << " --nb-bins <int: number of bins>" << endl;
+ exit(e);
+}
+
+void check_single_processing(bool unknown_processing) {
+ if(!unknown_processing) {
+ cerr << "ERROR: You can't do two different processings." << endl;
+ exit(1);
+ }
+}
+
+int main(int argc, char **argv) {
+ double xmin = 0, xmax = 1, ymin = 0, ymax = 1;
+ int nb_bins = 10;
+ const int buffer_size = 1024;
+
+ char line[buffer_size], token[buffer_size];
+ bool auto_extrema = false;
+ bool normalize = false;
+
+ int i = 1;
+
+ enum { UNKNOWN, ROC, ROC_SURFACE, HISTO, CUMUL, MISC } processing = UNKNOWN;
+
+ // Parsing the command line arguments ////////////////////////////////
+
+ while(i < argc) {
+
+ if(argc == 1 || strcmp(argv[i], "--help") == 0) print_help_and_exit(0);
+
+ else if(strcmp(argv[i], "--roc") == 0) {
+ check_single_processing(processing == UNKNOWN);
+ processing = ROC;
+ i++;
+ }
+
+ else if(strcmp(argv[i], "--roc-surface") == 0) {
+ check_single_processing(processing == UNKNOWN);
+ processing = ROC_SURFACE;
+ i++;
+ }
+
+ else if(strcmp(argv[i], "--cumul") == 0) {
+ check_single_processing(processing == UNKNOWN);
+ processing = CUMUL;
+ i++;
+ }
+
+ else if(strcmp(argv[i], "--normalize") == 0) {
+ normalize = true;
+ i++;
+ }
+
+ else if(strcmp(argv[i], "--histo") == 0) {
+ check_single_processing(processing == UNKNOWN);
+ processing = HISTO;
+ i++;
+ }
+
+ else if(strcmp(argv[i], "--misc") == 0) {
+ check_single_processing(processing == UNKNOWN);
+ processing = MISC;
+ i++;
+ }
+
+ else if(strcmp(argv[i], "--auto-extrema") == 0) {
+ auto_extrema = true;
+ i++;
+ }
+
+ else if(strcmp(argv[i], "--xbounds") == 0) {
+ check_opt(argc, argv, i, 2, "<float: xmin> <float: xmax>");
+ xmin = atof(argv[i+1]);
+ xmax = atof(argv[i+2]);
+ if(xmin >= xmax) {
+ cerr << "ERROR: Incorrect bounds." << endl;
+ exit(1);
+ }
+ i += 3;
+ }
+
+ else if(strcmp(argv[i], "--ybounds") == 0) {
+ check_opt(argc, argv, i, 2, "<float: ymin> <float: ymax>");
+ ymin = atof(argv[i+1]);
+ ymax = atof(argv[i+2]);
+ if(ymin >= ymax) {
+ cerr << "ERROR: Incorrect bounds." << endl;
+ exit(1);
+ }
+ i += 3;
+ }
+
+ else if(strcmp(argv[i], "--nb-bins") == 0) {
+ check_opt(argc, argv, i, 1, "<int: number of bins>");
+ nb_bins = atoi(argv[i+1]);
+ if(nb_bins < 1) {
+ cerr << "ERROR: Incorrect number of bins." << endl;
+ exit(1);
+ }
+ i += 2;
+ }
+
+ else {
+ cerr << "ERROR: Unknown option " << argv[i] << endl;
+ print_help_and_exit(1);
+ }
+ }
+
+ // Processing the data ///////////////////////////////////////////////
+
+ switch(processing) {
+
+ case CUMUL:
+
+ {
+ int nb_samples = 0, nb_samples_max = 50000;
+ double *x = new double[nb_samples_max];
+
+ while(!cin.eof()) {
+ if(nb_samples == nb_samples_max) {
+ x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
+ nb_samples_max = 2 * nb_samples_max;
+ }
+
+ cin.getline(line, buffer_size);
+
+ if(line[0]) {
+ char *s = line;
+ s = next_word(token, s, buffer_size);
+ x[nb_samples] = atof(token);
+ nb_samples++;
+ }
+ }
+
+ Couple tmp[nb_samples];
+ for(int n = 0; n < nb_samples; n++) {
+ tmp[n].index = n;
+ tmp[n].value = x[n];
+ }
+
+ qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
+
+ for(int n = 0; n < nb_samples; n++)
+ cout << tmp[n].value << " " << double(n)/double(nb_samples) << endl;
+
+ delete[] x;
+
+ }
+
+ break;
+
+ case ROC:
+ case ROC_SURFACE:
+
+ {
+ int nb_samples = 0, nb_samples_max = 1000;
+ double *x = new double[nb_samples_max], *y = new double[nb_samples_max];
+
+ while(!cin.eof()) {
+ if(nb_samples == nb_samples_max) {
+ x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
+ y = inflate_array(y, nb_samples_max, 2 * nb_samples_max);
+ nb_samples_max = 2 * nb_samples_max;
+ }
+
+ cin.getline(line, buffer_size);
+
+ if(line[0]) {
+ char *s = line;
+ s = next_word(token, s, buffer_size);
+ x[nb_samples] = atof(token);
+ s = next_word(token, s, buffer_size);
+ y[nb_samples] = atof(token);
+ nb_samples++;
+ }
+ }
+
+ Couple tmp[nb_samples];
+ int nb_rn = 0, nb_rp = 0, nb_fp = 0, nb_fn = 0;
+
+ bool binary = true;
+ for(int n = 0; binary && n < nb_samples; n++) binary &= (x[n] == 0 || x[n] == 1);
+ if(binary) {
+ cerr << "WARNING: your classes are binary, I process them accordingly." << endl;
+ for(int n = 0; n < nb_samples; n++) x[n] = 2 * x[n] - 1;
+ }
+
+ for(int n = 0; n < nb_samples; n++) {
+ tmp[n].index = n;
+ tmp[n].value = y[n];
+ if(x[n] >= 0) nb_rp++;
+ else { nb_rn++; nb_fp++; }
+ }
+
+ if(nb_rp == 0) cerr << "WARNING: No true positive." << endl;
+ if(nb_rn == 0) cerr << "WARNING: No true negative." << endl;
+
+ qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
+
+ if(processing == ROC) {
+ for(int n = 0; n < nb_samples - 1; n++) {
+ if(x[tmp[n].index] >= 0) nb_fn++;
+ else nb_fp--;
+ if(tmp[n].value < tmp[n+1].value) {
+ cout << double(nb_fp)/double(nb_rn) << " "
+ << 1 - double(nb_fn) / double(nb_rp) << " "
+ << (tmp[n].value + tmp[n+1].value) << " "
+ << endl;
+ }
+ }
+ } else {
+ double surface = 0;
+ double cx = double(nb_fp)/double(nb_rn), cy = 1 - double(nb_fn) / double(nb_rp);
+ for(int n = 0; n < nb_samples - 1; n++) {
+ if(x[tmp[n].index] >= 0) nb_fn++;
+ else nb_fp--;
+ if(tmp[n].value < tmp[n+1].value) {
+ double ncx = double(nb_fp)/double(nb_rn), ncy = 1 - double(nb_fn) / double(nb_rp);
+ surface += (cx - ncx) * cy;
+ cx = ncx; cy = ncy;
+ }
+ }
+ cout << surface << endl;
+ }
+
+ delete[] x; delete[] y;
+
+ }
+
+ break;
+
+ case HISTO:
+
+ {
+ int nb_samples = 0, nb_samples_max = 1000;
+ double *x = new double[nb_samples_max];
+
+ while(!cin.eof()) {
+ if(nb_samples == nb_samples_max) {
+ x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
+ nb_samples_max = 2 * nb_samples_max;
+ }
+
+ cin.getline(line, buffer_size);
+
+ if(line[0]) {
+ char *s = line;
+ s = next_word(token, s, buffer_size);
+ x[nb_samples] = atof(token);
+ if(auto_extrema) {
+ if(nb_samples == 0 || x[nb_samples] > xmax) xmax = x[nb_samples];
+ if(nb_samples == 0 || x[nb_samples] < xmin) xmin = x[nb_samples];
+ }
+ nb_samples++;
+ }
+ }
+
+ int nb[nb_bins];
+ for(int n = 0; n < nb_bins; n++) nb[n] = 0;
+
+ int nb_total = 0;
+ for(int s = 0; s < nb_samples; s++) {
+ int n = int((x[s] - xmin)/(xmax - xmin) * nb_bins);
+ if(n >= 0 && n < nb_bins) nb[n]++;
+ else {
+ cerr << "WARNING: value " << x[s] << " is out of histogram." << endl;
+ }
+ nb_total++;
+ }
+
+ if(normalize) {
+ for(int n = 0; n < nb_bins; n++)
+ cout << xmin + ((xmax - xmin) * n) / double(nb_bins) << " "
+ << (nb[n] / double(nb_total))/((xmax - xmin) / double(nb_bins)) << endl;
+ } else {
+ for(int n = 0; n < nb_bins; n++)
+ cout << xmin + ((xmax - xmin) * n) / double(nb_bins) << " "
+ << nb[n] / double(nb_total) << endl;
+ }
+ }
+
+ break;
+
+ case MISC:
+
+ {
+ int nb_samples = 0, nb_samples_max = 1000;
+ double *x = new double[nb_samples_max];
+ int nb = 0;
+ double min = 0, max = 0;
+ double sum = 0, sumsq = 0;
+
+ while(!cin.eof()) {
+ if(nb_samples == nb_samples_max) {
+ x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
+ nb_samples_max = 2 * nb_samples_max;
+ }
+
+ cin.getline(line, buffer_size);
+ char *s = line;
+ if(line[0]) {
+ s = next_word(token, s, buffer_size);
+ x[nb_samples] = atof(token);
+ nb_samples++;
+ double x = atof(token);
+ if(nb == 0 || x > max) max = x;
+ if(nb == 0 || x < min) min = x;
+ sum += x;
+ sumsq += x*x;
+ nb++;
+ }
+ }
+
+ Couple tmp[nb_samples];
+ for(int n = 0; n < nb_samples; n++) {
+ tmp[n].index = n;
+ tmp[n].value = x[n];
+ }
+
+ qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
+
+ delete[] x;
+
+ double mu = sum / double(nb);
+ double sigma = (sumsq - sum * mu) / double(nb - 1);
+ double stdd = sqrt(sigma);
+
+ cout << "MIN " << min
+ << " MAX " << max
+ << " MU " << mu
+ << " SIGMA " << sigma
+ << " STDD " << stdd
+ << " SUM " << sum
+ << " MEDIAN " << tmp[nb_samples/2].value
+ << " QUANTILE0.1 " << tmp[int(nb_samples * 0.1)].value
+ << " QUANTILE0.9 " << tmp[int(nb_samples * 0.9)].value
+ << endl;
+
+ }
+
+ break;
+
+ default:
+ cerr << "ERROR: You must choose a processing type." << endl;
+ exit(1);
+ }
+
+}