From: Francois Fleuret <francois@fleuret.org>
Date: Mon, 30 Mar 2009 09:11:57 +0000 (+0200)
Subject: Initial commit.
X-Git-Url: https://fleuret.org/cgi-bin/gitweb/gitweb.cgi?a=commitdiff_plain;h=3201af50a66f14e327f41549c183871e837556c8;p=data-tool.git

Initial commit.
---

3201af50a66f14e327f41549c183871e837556c8
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..cdd5ce6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,50 @@
+
+#
+#  data-tool is a command line tool to do simple statistical
+#  processing on numerical data.
+#
+#  Copyright (c) 2009 Francois Fleuret
+#  Written by Francois Fleuret <francois.fleuret@idiap.ch>
+#
+#  This file is part of data-tool.
+#
+#  data-tool is free software: you can redistribute it and/or modify
+#  it under the terms of the GNU General Public License version 3 as
+#  published by the Free Software Foundation.
+#
+#  data-tool is distributed in the hope that it will be useful, but
+#  WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  General Public License for more details.
+#
+#  You should have received a copy of the GNU General Public License
+#  along with data-tool.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+ifeq ($(DEBUG),yes)
+  OPTIMIZE_FLAG = -ggdb3 -DDEBUG
+else
+  OPTIMIZE_FLAG = -ggdb3 -O3
+endif
+
+ifeq ($(PROFILE),yes)
+  PROFILE_FLAG = -pg
+endif
+
+CXXFLAGS = -Wall $(OPTIMIZE_FLAG) $(PROFILE_FLAG)
+
+all: data-tool
+
+TAGS: *.cc *.h
+	etags --members -l c++ *.cc *.h
+
+data-tool: data-tool.o
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)
+
+Makefile.depend: *.h *.cc Makefile
+	$(CC) -M *.cc > Makefile.depend
+
+clean:
+	\rm -f data-tool *.o Makefile.depend TAGS
+
+-include Makefile.depend
diff --git a/data-tool.cc b/data-tool.cc
new file mode 100644
index 0000000..bcf19b3
--- /dev/null
+++ b/data-tool.cc
@@ -0,0 +1,449 @@
+
+/*
+ *  data-tool is a command line tool to do simple statistical
+ *  processing on numerical data.
+ *
+ *  Copyright (c) 2009 Francois Fleuret
+ *  Written by Francois Fleuret <francois@fleuret.org>
+ *
+ *  This file is part of data-tool.
+ *
+ *  data-tool is free software: you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 3 as
+ *  published by the Free Software Foundation.
+ *
+ *  data-tool is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with data-tool.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include <iostream>
+#include <cmath>
+#include <stdlib.h>
+#include <string.h>
+
+using namespace std;
+
+struct Couple {
+  int index;
+  double value;
+};
+
+int compare_couple(const void *a, const void *b) {
+  if(((Couple *) a)->value < ((Couple *) b)->value) return -1;
+  else if(((Couple *) a)->value > ((Couple *) b)->value) return 1;
+  else return 0;
+}
+
+double *inflate_array(double *x, int current_size, int new_size) {
+  double *xx = new double[new_size];
+  for(int n = 0; n < current_size; n++) xx[n] = x[n];
+  delete[] x;
+  return xx;
+}
+
+char *next_word(char *buffer, char *r, int buffer_size) {
+  char *s;
+  s = buffer;
+  if(r != NULL)
+    {
+      if(*r == '"') {
+        r++;
+        while((*r != '"') && (*r != '\0') &&
+              (s<buffer+buffer_size-1))
+          *s++ = *r++;
+        if(*r == '"') r++;
+      } else {
+        while((*r != '\r') && (*r != '\n') && (*r != '\0') &&
+              (*r != '\t') && (*r != ' ') && (*r != ',') &&
+              (s<buffer+buffer_size-1))
+          *s++ = *r++;
+      }
+
+      while((*r == ' ') || (*r == '\t') || (*r == ',')) r++;
+      if((*r == '\0') || (*r=='\r') || (*r=='\n')) r = NULL;
+    }
+  *s = '\0';
+  return r;
+}
+
+void check_opt(int argc, char **argv, int n_opt, int n, const char *help) {
+  if(n_opt + n >= argc) {
+    cerr << "ERROR: Missing argument for " << argv[n_opt] << ". Expecting " << help << "." << endl;
+    exit(1);
+  }
+}
+
+void print_help_and_exit(int e) {
+  cout << "Simple data processing tool. Written by Francois Fleuret." << endl
+       << endl
+       << "This application takes data from the standard input and prints" << endl
+       << "the result on the standard output. It expects either a list of" << endl
+       << "float values (to produce histograms, cumulative distribution functions" << endl
+       << "or the mean, variance, etc.) or a list of couples of values of the form" << endl
+       << "x y on each line (where the sign of x tells the class and y the parameter" << endl
+       << "value) to compute the ROC curve or the ROC curve surface.\n" << endl
+       << "The options are:" << endl
+       << "  --help" << endl
+       << "  --roc" << endl
+       << "  --roc-surface" << endl
+       << "  --normalize" << endl
+       << "  --histo" << endl
+       << "  --cumul" << endl
+       << "  --misc" << endl
+       << "  --auto-extrema" << endl
+       << "  --xbounds <float: xmin> <float: xmax>" << endl
+       << "  --ybounds <float: ymin> <float: ymax>" << endl
+       << "  --nb-bins <int: number of bins>" << endl;
+  exit(e);
+}
+
+void check_single_processing(bool unknown_processing) {
+  if(!unknown_processing) {
+    cerr << "ERROR: You can't do two different processings." << endl;
+    exit(1);
+  }
+}
+
+int main(int argc, char **argv) {
+  double xmin = 0, xmax = 1, ymin = 0, ymax = 1;
+  int nb_bins = 10;
+  const int buffer_size = 1024;
+
+  char line[buffer_size], token[buffer_size];
+  bool auto_extrema = false;
+  bool normalize = false;
+
+  int i = 1;
+
+  enum { UNKNOWN, ROC, ROC_SURFACE, HISTO, CUMUL, MISC } processing = UNKNOWN;
+
+  // Parsing the command line arguments ////////////////////////////////
+
+  while(i < argc) {
+
+    if(argc == 1 || strcmp(argv[i], "--help") == 0) print_help_and_exit(0);
+
+    else if(strcmp(argv[i], "--roc") == 0) {
+      check_single_processing(processing == UNKNOWN);
+      processing = ROC;
+      i++;
+    }
+
+    else if(strcmp(argv[i], "--roc-surface") == 0) {
+      check_single_processing(processing == UNKNOWN);
+      processing = ROC_SURFACE;
+      i++;
+    }
+
+    else if(strcmp(argv[i], "--cumul") == 0) {
+      check_single_processing(processing == UNKNOWN);
+      processing = CUMUL;
+      i++;
+    }
+
+    else if(strcmp(argv[i], "--normalize") == 0) {
+      normalize = true;
+      i++;
+    }
+
+    else if(strcmp(argv[i], "--histo") == 0) {
+      check_single_processing(processing == UNKNOWN);
+      processing = HISTO;
+      i++;
+    }
+
+    else if(strcmp(argv[i], "--misc") == 0) {
+      check_single_processing(processing == UNKNOWN);
+      processing = MISC;
+      i++;
+    }
+
+    else if(strcmp(argv[i], "--auto-extrema") == 0) {
+      auto_extrema = true;
+      i++;
+    }
+
+    else if(strcmp(argv[i], "--xbounds") == 0) {
+      check_opt(argc, argv, i, 2, "<float: xmin> <float: xmax>");
+      xmin = atof(argv[i+1]);
+      xmax = atof(argv[i+2]);
+      if(xmin >= xmax) {
+        cerr << "ERROR: Incorrect bounds." << endl;
+        exit(1);
+      }
+      i += 3;
+    }
+
+    else if(strcmp(argv[i], "--ybounds") == 0) {
+      check_opt(argc, argv, i, 2, "<float: ymin> <float: ymax>");
+      ymin = atof(argv[i+1]);
+      ymax = atof(argv[i+2]);
+      if(ymin >= ymax) {
+        cerr << "ERROR: Incorrect bounds." << endl;
+        exit(1);
+      }
+      i += 3;
+    }
+
+    else if(strcmp(argv[i], "--nb-bins") == 0) {
+      check_opt(argc, argv, i, 1, "<int: number of bins>");
+      nb_bins = atoi(argv[i+1]);
+      if(nb_bins < 1) {
+        cerr << "ERROR: Incorrect number of bins." << endl;
+        exit(1);
+      }
+      i += 2;
+    }
+
+    else {
+      cerr << "ERROR: Unknown option " << argv[i]  << endl;
+      print_help_and_exit(1);
+    }
+  }
+
+  // Processing the data ///////////////////////////////////////////////
+
+  switch(processing) {
+
+  case CUMUL:
+
+    {
+      int nb_samples = 0, nb_samples_max = 50000;
+      double *x = new double[nb_samples_max];
+
+      while(!cin.eof()) {
+        if(nb_samples == nb_samples_max) {
+          x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
+          nb_samples_max = 2 * nb_samples_max;
+        }
+
+        cin.getline(line, buffer_size);
+
+        if(line[0]) {
+          char *s = line;
+          s = next_word(token, s, buffer_size);
+          x[nb_samples] = atof(token);
+          nb_samples++;
+        }
+      }
+
+      Couple tmp[nb_samples];
+      for(int n = 0; n < nb_samples; n++) {
+        tmp[n].index = n;
+        tmp[n].value = x[n];
+      }
+
+      qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
+
+      for(int n = 0; n < nb_samples; n++)
+        cout << tmp[n].value << " " << double(n)/double(nb_samples)  << endl;
+
+      delete[] x;
+
+    }
+
+    break;
+
+  case ROC:
+  case ROC_SURFACE:
+
+    {
+      int nb_samples = 0, nb_samples_max = 1000;
+      double *x = new double[nb_samples_max], *y = new double[nb_samples_max];
+
+      while(!cin.eof()) {
+        if(nb_samples == nb_samples_max) {
+          x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
+          y = inflate_array(y, nb_samples_max, 2 * nb_samples_max);
+          nb_samples_max = 2 * nb_samples_max;
+        }
+
+        cin.getline(line, buffer_size);
+
+        if(line[0]) {
+          char *s = line;
+          s = next_word(token, s, buffer_size);
+          x[nb_samples] = atof(token);
+          s = next_word(token, s, buffer_size);
+          y[nb_samples] = atof(token);
+          nb_samples++;
+        }
+      }
+
+      Couple tmp[nb_samples];
+      int nb_rn = 0, nb_rp = 0, nb_fp = 0, nb_fn = 0;
+
+      bool binary = true;
+      for(int n = 0; binary && n < nb_samples; n++) binary &= (x[n] == 0 || x[n] == 1);
+      if(binary) {
+        cerr << "WARNING: your classes are binary, I process them accordingly." << endl;
+        for(int n = 0; n < nb_samples; n++) x[n] = 2 * x[n] - 1;
+      }
+
+      for(int n = 0; n < nb_samples; n++) {
+        tmp[n].index = n;
+        tmp[n].value = y[n];
+        if(x[n] >= 0) nb_rp++;
+        else { nb_rn++; nb_fp++; }
+      }
+
+      if(nb_rp == 0) cerr << "WARNING: No true positive." << endl;
+      if(nb_rn == 0) cerr << "WARNING: No true negative." << endl;
+
+      qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
+
+      if(processing == ROC) {
+        for(int n = 0; n < nb_samples - 1; n++) {
+          if(x[tmp[n].index] >= 0) nb_fn++;
+          else                     nb_fp--;
+          if(tmp[n].value < tmp[n+1].value) {
+            cout << double(nb_fp)/double(nb_rn) << " "
+                 << 1 - double(nb_fn) / double(nb_rp) << " "
+                 << (tmp[n].value + tmp[n+1].value) << " "
+                 << endl;
+          }
+        }
+      } else {
+        double surface = 0;
+        double cx = double(nb_fp)/double(nb_rn), cy = 1 - double(nb_fn) / double(nb_rp);
+        for(int n = 0; n < nb_samples - 1; n++) {
+          if(x[tmp[n].index] >= 0) nb_fn++;
+          else                     nb_fp--;
+          if(tmp[n].value < tmp[n+1].value) {
+            double ncx = double(nb_fp)/double(nb_rn), ncy = 1 - double(nb_fn) / double(nb_rp);
+            surface += (cx - ncx) * cy;
+            cx = ncx; cy = ncy;
+          }
+        }
+        cout << surface  << endl;
+      }
+
+      delete[] x; delete[] y;
+
+    }
+
+    break;
+
+  case HISTO:
+
+    {
+      int nb_samples = 0, nb_samples_max = 1000;
+      double *x = new double[nb_samples_max];
+
+      while(!cin.eof()) {
+        if(nb_samples == nb_samples_max) {
+          x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
+          nb_samples_max = 2 * nb_samples_max;
+        }
+
+        cin.getline(line, buffer_size);
+
+        if(line[0]) {
+          char *s = line;
+          s = next_word(token, s, buffer_size);
+          x[nb_samples] = atof(token);
+          if(auto_extrema) {
+            if(nb_samples == 0 || x[nb_samples] > xmax) xmax = x[nb_samples];
+            if(nb_samples == 0 || x[nb_samples] < xmin) xmin = x[nb_samples];
+          }
+          nb_samples++;
+        }
+      }
+
+      int nb[nb_bins];
+      for(int n = 0; n < nb_bins; n++) nb[n] = 0;
+
+      int nb_total = 0;
+      for(int s = 0; s < nb_samples; s++) {
+        int n = int((x[s] - xmin)/(xmax - xmin) * nb_bins);
+        if(n >= 0 && n < nb_bins) nb[n]++;
+        else {
+          cerr << "WARNING: value " << x[s] << " is out of histogram." << endl;
+        }
+        nb_total++;
+      }
+
+      if(normalize) {
+        for(int n = 0; n < nb_bins; n++)
+          cout << xmin + ((xmax - xmin) * n) / double(nb_bins) << " "
+               << (nb[n] / double(nb_total))/((xmax - xmin) / double(nb_bins))  << endl;
+      } else {
+        for(int n = 0; n < nb_bins; n++)
+          cout << xmin + ((xmax - xmin) * n) / double(nb_bins) << " "
+               << nb[n] / double(nb_total)  << endl;
+      }
+    }
+
+    break;
+
+  case MISC:
+
+    {
+      int nb_samples = 0, nb_samples_max = 1000;
+      double *x = new double[nb_samples_max];
+      int nb = 0;
+      double min = 0, max = 0;
+      double sum = 0, sumsq = 0;
+
+      while(!cin.eof()) {
+        if(nb_samples == nb_samples_max) {
+          x = inflate_array(x, nb_samples_max, 2 * nb_samples_max);
+          nb_samples_max = 2 * nb_samples_max;
+        }
+
+        cin.getline(line, buffer_size);
+        char *s = line;
+        if(line[0]) {
+          s = next_word(token, s, buffer_size);
+          x[nb_samples] = atof(token);
+          nb_samples++;
+          double x = atof(token);
+          if(nb == 0 || x > max) max = x;
+          if(nb == 0 || x < min) min = x;
+          sum += x;
+          sumsq += x*x;
+          nb++;
+        }
+      }
+
+      Couple tmp[nb_samples];
+      for(int n = 0; n < nb_samples; n++) {
+        tmp[n].index = n;
+        tmp[n].value = x[n];
+      }
+
+      qsort(tmp, nb_samples, sizeof(Couple), compare_couple);
+
+      delete[] x;
+
+      double mu = sum / double(nb);
+      double sigma = (sumsq - sum * mu) / double(nb - 1);
+      double stdd = sqrt(sigma);
+
+      cout << "MIN " << min
+           << " MAX " << max
+           << " MU " << mu
+           << " SIGMA " << sigma
+           << " STDD " << stdd
+           << " SUM " << sum
+           << " MEDIAN " << tmp[nb_samples/2].value
+           << " QUANTILE0.1 " << tmp[int(nb_samples * 0.1)].value
+           << " QUANTILE0.9 " << tmp[int(nb_samples * 0.9)].value
+           << endl;
+
+    }
+
+    break;
+
+  default:
+    cerr << "ERROR: You must choose a processing type." << endl;
+    exit(1);
+  }
+
+}