Starting to convert the ID from strings to integers to speed things up.
[mymail.git] / mymail.c
index 2617d02..dcb44bf 100644 (file)
--- a/mymail.c
+++ b/mymail.c
 #include <dirent.h>
 #include <regex.h>
 
+#define MYMAIL_DB_MAGIC_TOKEN "mymail_index_file"
 #define VERSION "0.1"
 
+#define MAX_NB_SEARCH_PATTERNS 10
+
 #define BUFFER_SIZE 65536
 
+enum {
+  ID_MAIL,
+  ID_FROM,
+  ID_DEST,
+  ID_SUBJECT,
+  ID_FROMDEST,
+  MAX_ID
+};
+
+static char *field_names[] = {
+  "mail",
+  "from",
+  "dest",
+  "subj",
+  "fromdest"
+};
+
 struct parsable_field {
-  char *name;
+  int id;
   char *regexp_string;
   regex_t regexp;
 };
 
 char *db_filename;
-char *search_pattern;
+char *db_root_path;
 
 int paranoid;
 int action_index;
@@ -80,7 +100,7 @@ void *safe_malloc(size_t n) {
   void *p = malloc(n);
   if(!p && n != 0) {
     fprintf(stderr,
-            "mymail: can not allocate memory: %s\n", strerror(errno));
+            "mymail: cannot allocate memory: %s\n", strerror(errno));
     exit(EXIT_FAILURE);
   }
   return p;
@@ -96,7 +116,7 @@ void print_usage(FILE *out) {
   print_version(out);
   fprintf(out, "Written by Francois Fleuret <francois@fleuret.org>.\n");
   fprintf(out, "\n");
-  fprintf(out, "Usage: mymail [options] [<filename1> [<filename2> ...]]\n");
+  fprintf(out, "Usage: mymail [options] [<mbox dir1> [<mbox dir2> ...]]\n");
   fprintf(out, "\n");
   fprintf(out, " -h, --help\n");
   fprintf(out, "         show this help\n");
@@ -104,102 +124,215 @@ void print_usage(FILE *out) {
   fprintf(out, "         print the version number\n");
   fprintf(out, " -i, --index\n");
   fprintf(out, "         index mails\n");
-  fprintf(out, " -d <db filename>, --db-file <db filename>\n");
-  fprintf(out, "         set the data-base file\n");
   fprintf(out, " -s <search pattern>, --search <search pattern>\n");
   fprintf(out, "         search for matching mails in the data-base file\n");
+  fprintf(out, " -d <db filename>, --db-file <db filename>\n");
+  fprintf(out, "         set the data-base file\n");
+  fprintf(out, " -r <db root path>, --db-root <db root path>\n");
+  fprintf(out, "         set the data-base root path for recursive search\n");
 }
 
 /*********************************************************************/
 
-void search_in_db(const char *search_name, const char *search_regexp_string,
+int ignore_entry(const char *name) {
+  return
+    /* strcmp(name, ".") == 0 || */
+    /* strcmp(name, "..") == 0 || */
+    (name[0] == '.' && name[1] != '/');
+}
+
+int mbox_line_match_search(int search_id, regex_t *search_regexp,
+                           int mbox_id, char *mbox_value) {
+  return search_id == mbox_id && regexec(search_regexp, mbox_value, 0, 0, 0) == 0;
+}
+
+void search_in_db(int nb_search_patterns,
+                  int *search_ids, char **search_regexp_strings,
                   FILE *db_file) {
-  char raw_line[BUFFER_SIZE];
-  char current_mail_filename[BUFFER_SIZE];
+  int hits[MAX_NB_SEARCH_PATTERNS];
+  char raw_db_line[BUFFER_SIZE];
+  char raw_mbox_line[BUFFER_SIZE];
+  char current_mail_filename[PATH_MAX + 1];
   unsigned long int current_position_in_mail;
-  char *name, *value;
-  regex_t regexp;
-  int already_written;
-
-  if(regcomp(&regexp,
-             search_regexp_string,
-             REG_ICASE)) {
-    fprintf(stderr,
-            "mymail: Syntax error in regexp \"%s\" for field \"%s\".\n",
-            search_regexp_string,
-            search_name);
-    exit(EXIT_FAILURE);
+  char *mbox_name, *mbox_value;
+  int mbox_id;
+  regex_t search_regexps[MAX_NB_SEARCH_PATTERNS];
+  int already_written, m, n;
+
+  for(n = 0; n < nb_search_patterns; n++) {
+    if(regcomp(&search_regexps[n],
+               search_regexp_strings[n],
+               REG_ICASE)) {
+      fprintf(stderr,
+              "mymail: Syntax error in regexp \"%s\" for field \"%s\".\n",
+              search_regexp_strings[n],
+              field_names[search_ids[n]]);
+      exit(EXIT_FAILURE);
+    }
   }
 
   current_position_in_mail = 0;
   already_written = 0;
 
-  while(fgets(raw_line, BUFFER_SIZE, db_file)) {
-    name = raw_line;
-    value = segment_next_field(raw_line);
+  for(n = 0; n < nb_search_patterns; n++) { hits[n] = 0; }
 
-    if(strcmp("mail", name) == 0) {
-      char *position_in_file_string = value;
-      char *mail_filename = segment_next_field(value);
-      current_position_in_mail = atol(position_in_file_string);
-      strcpy(current_mail_filename, mail_filename);
-      remove_eof(current_mail_filename);
-      already_written = 0;
-    }
+  while(fgets(raw_db_line, BUFFER_SIZE, db_file)) {
+    mbox_name = raw_db_line;
+    mbox_value = segment_next_field(raw_db_line);
 
-    else if(!already_written) {
-      if(strcmp(search_name, name) == 0 && regexec(&regexp, value, 0, 0, 0) == 0) {
+    if(strcmp("mail", mbox_name) == 0) {
+      char *position_in_file_string;
+      char *mail_filename;
+
+      for(n = 0; n < nb_search_patterns && hits[n]; n++);
+
+      if(n == nb_search_patterns) {
         FILE *mail_file;
+
         mail_file = fopen(current_mail_filename, "r");
+
         if(!mail_file) {
-          fprintf(stderr, "mymail: Can not open '%s'.\n", current_mail_filename);
+          fprintf(stderr, "mymail: Cannot open mbox '%s'.\n", current_mail_filename);
           exit(EXIT_FAILURE);
         }
+
         fseek(mail_file, current_position_in_mail, SEEK_SET);
-        if(fgets(raw_line, BUFFER_SIZE, mail_file)) {
-          printf("%s", raw_line);
-          while(fgets(raw_line, BUFFER_SIZE, mail_file) &&
-                strncmp(raw_line, "From ", 5)) {
-            printf("%s", raw_line);
+
+        if(fgets(raw_mbox_line, BUFFER_SIZE, mail_file)) {
+          printf("%s", raw_mbox_line);
+          while(fgets(raw_mbox_line, BUFFER_SIZE, mail_file) &&
+                strncmp(raw_mbox_line, "From ", 5)) {
+            printf("%s", raw_mbox_line);
           }
         }
+
         fclose(mail_file);
-        already_written = 1;
+      }
+
+      for(n = 0; n < nb_search_patterns; n++) { hits[n] = 0; }
+
+      position_in_file_string = mbox_value;
+      mail_filename = segment_next_field(mbox_value);
+      current_position_in_mail = atol(position_in_file_string);
+      strcpy(current_mail_filename, mail_filename);
+
+      remove_eof(current_mail_filename);
+      already_written = 0;
+    }
+
+    else {
+      mbox_id = -1;
+      for(m = 0; (m < MAX_ID) && mbox_id == -1; m++) {
+        if(strncmp(field_names[m], mbox_name, strlen(mbox_name)) == 0) {
+          mbox_id = m;
+        }
+      }
+      for(n = 0; n < nb_search_patterns; n++) {
+        hits[n] |= mbox_line_match_search(search_ids[n], &search_regexps[n],
+                                          mbox_id, mbox_value);
       }
     }
   }
 
-  regfree(&regexp);
+  for(n = 0; n < nb_search_patterns; n++) {
+    regfree(&search_regexps[n]);
+  }
+}
+
+void recursive_search_in_db(const char *entry_name,
+                            int nb_search_patterns,
+                            int *search_ids, char **search_regexp_strings) {
+  DIR *dir;
+  struct dirent *dir_e;
+  struct stat sb;
+  char raw_db_line[BUFFER_SIZE];
+  char subname[PATH_MAX + 1];
+
+  if(lstat(entry_name, &sb) != 0) {
+    fprintf(stderr,
+            "mymail: Cannot stat \"%s\": %s\n",
+            entry_name,
+            strerror(errno));
+    exit(EXIT_FAILURE);
+  }
+
+  dir = opendir(entry_name);
+
+  if(dir) {
+    while((dir_e = readdir(dir))) {
+      if(!ignore_entry(dir_e->d_name)) {
+        snprintf(subname, PATH_MAX, "%s/%s", entry_name, dir_e->d_name);
+        recursive_search_in_db(subname,
+                               nb_search_patterns,
+                               search_ids, search_regexp_strings);
+      }
+    }
+    closedir(dir);
+  } else {
+    const char *s = entry_name, *filename = entry_name;
+    while(*s) { if(*s == '/') { filename = s+1; } s++; }
+
+    if(strcmp(filename, db_filename) == 0) {
+      FILE *db_file = fopen(entry_name, "r");
+
+      if(!db_file) {
+        fprintf(stderr,
+                "mymail: Cannot open \"%s\" for reading: %s\n",
+                db_filename,
+                strerror(errno));
+        exit(EXIT_FAILURE);
+      }
+
+      if(fgets(raw_db_line, BUFFER_SIZE, db_file)) {
+        if(strncmp(raw_db_line, MYMAIL_DB_MAGIC_TOKEN, strlen(MYMAIL_DB_MAGIC_TOKEN))) {
+          fprintf(stderr,
+                  "mymail: Header line in '%s' does not match the mymail db format.\n",
+                  entry_name);
+          exit(EXIT_FAILURE);
+        }
+      } else {
+        fprintf(stderr,
+                "mymail: Cannot read the header line in '%s'.\n",
+                entry_name);
+        exit(EXIT_FAILURE);
+      }
+
+      search_in_db(nb_search_patterns, search_ids, search_regexp_strings,
+                   db_file);
+
+      fclose(db_file);
+    }
+  }
 }
 
 /*********************************************************************/
 
 void index_one_mbox_line(int nb_fields_to_parse, struct parsable_field *fields_to_parse,
-                         char *raw_line, FILE *db_file) {
+                         char *raw_mbox_line, FILE *db_file) {
   regmatch_t matches;
   int f;
   for(f = 0; f < nb_fields_to_parse; f++) {
-    if(regexec(&fields_to_parse[f].regexp, raw_line, 1, &matches, 0) == 0) {
+    if(regexec(&fields_to_parse[f].regexp, raw_mbox_line, 1, &matches, 0) == 0) {
       fprintf(db_file, "%s %s\n",
-              fields_to_parse[f].name,
-              raw_line + matches.rm_eo);
+              field_names[fields_to_parse[f].id],
+              raw_mbox_line + matches.rm_eo);
     }
   }
 }
 
-void index_mbox(const char *input_filename,
+void index_mbox(const char *mbox_filename,
                 int nb_fields_to_parse, struct parsable_field *fields_to_parse,
                 FILE *db_file) {
-  char raw_line[BUFFER_SIZE], full_line[BUFFER_SIZE];
+  char raw_mbox_line[BUFFER_SIZE], full_line[BUFFER_SIZE];
   char *end_of_full_line;
   FILE *file;
   int in_header, new_header;
   unsigned long int position_in_file;
 
-  file = fopen(input_filename, "r");
+  file = fopen(mbox_filename, "r");
 
   if(!file) {
-    fprintf(stderr, "mymail: Can not open '%s'.\n", input_filename);
+    fprintf(stderr, "mymail: Cannot open '%s'.\n", mbox_filename);
     if(paranoid) { exit(EXIT_FAILURE); }
     return;
   }
@@ -211,31 +344,29 @@ void index_mbox(const char *input_filename,
   end_of_full_line = 0;
   full_line[0] = '\0';
 
-  while(fgets(raw_line, BUFFER_SIZE, file)) {
-    if(strncmp(raw_line, "From ", 5) == 0) {
+  while(fgets(raw_mbox_line, BUFFER_SIZE, file)) {
+    if(strncmp(raw_mbox_line, "From ", 5) == 0) {
       if(in_header) {
         fprintf(stderr,
                 "Got a ^\"From \" in the header in %s:%lu.\n",
-                input_filename, position_in_file);
-        fprintf(stderr, "%s", raw_line);
+                mbox_filename, position_in_file);
+        fprintf(stderr, "%s", raw_mbox_line);
         if(paranoid) { exit(EXIT_FAILURE); }
       }
       in_header = 1;
       new_header = 1;
-    } else if(strncmp(raw_line, "\n", 1) == 0) {
+    } else if(strncmp(raw_mbox_line, "\n", 1) == 0) {
       if(in_header) { in_header = 0; }
     }
 
-    /* printf("PARSE %d %s", in_header, raw_line); */
-
     if(in_header) {
       if(new_header) {
-        fprintf(db_file, "mail %lu %s\n", position_in_file, input_filename);
+        fprintf(db_file, "mail %lu %s\n", position_in_file, mbox_filename);
         new_header = 0;
       }
 
-      if(raw_line[0] == ' ' || raw_line[0] == '\t') {
-        char *start = raw_line;
+      if(raw_mbox_line[0] == ' ' || raw_mbox_line[0] == '\t') {
+        char *start = raw_mbox_line;
         while(*start == ' ' || *start == '\t') start++;
         *(end_of_full_line++) = ' ';
         strcpy(end_of_full_line, start);
@@ -246,20 +377,22 @@ void index_mbox(const char *input_filename,
       }
 
       else {
-        /* if(!((raw_line[0] >= 'a' && raw_line[0] <= 'z') || */
-             /* (raw_line[0] >= 'A' && raw_line[0] <= 'Z'))) { */
-          /* fprintf(stderr, */
-                  /* "Header line syntax error %s:%lu.\n", */
-                  /* input_filename, position_in_file); */
-          /* fprintf(stderr, "%s", raw_line); */
-        /* } */
+        /*
+          if(!((raw_mbox_line[0] >= 'a' && raw_mbox_line[0] <= 'z') ||
+          (raw_mbox_line[0] >= 'A' && raw_mbox_line[0] <= 'Z'))) {
+          fprintf(stderr,
+          "Header line syntax error %s:%lu.\n",
+          mbox_filename, position_in_file);
+          fprintf(stderr, "%s", raw_mbox_line);
+          }
+        */
 
         if(full_line[0]) {
           index_one_mbox_line(nb_fields_to_parse, fields_to_parse, full_line, db_file);
         }
 
         end_of_full_line = full_line;
-        strcpy(end_of_full_line, raw_line);
+        strcpy(end_of_full_line, raw_mbox_line);
         while(*end_of_full_line && *end_of_full_line != '\n') {
           end_of_full_line++;
         }
@@ -268,48 +401,40 @@ void index_mbox(const char *input_filename,
 
     }
 
-    position_in_file += strlen(raw_line);
+    position_in_file += strlen(raw_mbox_line);
   }
 
   fclose(file);
 }
 
-int ignore_entry(const char *name) {
-  return
-    /* strcmp(name, ".") == 0 || */
-    /* strcmp(name, "..") == 0 || */
-    (name[0] == '.' && name[1] != '/');
-}
-
-void process_entry(const char *dir_name,
-                   int nb_fields_to_parse, struct parsable_field *fields_to_parse,
-                   FILE *db_file) {
+void recursive_index_mbox(FILE *db_file,
+                          const char *entry_name,
+                          int nb_fields_to_parse, struct parsable_field *fields_to_parse) {
   DIR *dir;
   struct dirent *dir_e;
   struct stat sb;
   char subname[PATH_MAX + 1];
 
-  if(lstat(dir_name, &sb) != 0) {
+  if(lstat(entry_name, &sb) != 0) {
     fprintf(stderr,
-            "mymail: Can not stat \"%s\": %s\n",
-            dir_name,
+            "mymail: Cannot stat \"%s\": %s\n",
+            entry_name,
             strerror(errno));
     exit(EXIT_FAILURE);
   }
 
-  dir = opendir(dir_name);
+  dir = opendir(entry_name);
 
   if(dir) {
-    printf("Processing directory '%s'.\n", dir_name);
     while((dir_e = readdir(dir))) {
       if(!ignore_entry(dir_e->d_name)) {
-        snprintf(subname, PATH_MAX, "%s/%s", dir_name, dir_e->d_name);
-        process_entry(subname, nb_fields_to_parse, fields_to_parse, db_file);
+        snprintf(subname, PATH_MAX, "%s/%s", entry_name, dir_e->d_name);
+        recursive_index_mbox(db_file, subname, nb_fields_to_parse, fields_to_parse);
       }
     }
     closedir(dir);
   } else {
-    index_mbox(dir_name, nb_fields_to_parse, fields_to_parse, db_file);
+    index_mbox(entry_name, nb_fields_to_parse, fields_to_parse, db_file);
   }
 }
 
@@ -325,38 +450,57 @@ static struct option long_options[] = {
   { "help", no_argument, 0, 'h' },
   { "version", no_argument, 0, 'v' },
   { "db-file", 1, 0, 'd' },
-  { "search-pattern", 1, 0, 's' },
+  { "db-root", 1, 0, 'r' },
+  { "search", 1, 0, 's' },
   { "index", 0, 0, 'i' },
   { 0, 0, 0, 0 }
 };
 
 static struct parsable_field fields_to_parse[] = {
   {
-    "from",
+    ID_FROM,
     "^\\([Ff][Rr][Oo][Mm]:\\|From\\) *",
     { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
   },
 
   {
-    "dest",
+    ID_DEST,
     "^\\([Tt][Oo]\\|[Cc][Cc]\\|[Bb][Cc][Cc]\\): *",
     { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
   },
+
+  {
+    ID_SUBJECT,
+    "^[Ss][Uu][Bb][Jj][Ee][Cc][Tt]: *",
+    { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
+  },
+
 };
 
+/*********************************************************************/
+
 int main(int argc, char **argv) {
   int error = 0, show_help = 0;
   const int nb_fields_to_parse = sizeof(fields_to_parse) / sizeof(struct parsable_field);
   char c;
   int f;
+  int nb_search_patterns;
+  char *search_pattern[MAX_NB_SEARCH_PATTERNS];
+
+  /* for(f = 0; f < argc; f++) { */
+    /* printf("arg %d \"%s\"\n", f, argv[f]); */
+  /* } */
 
   paranoid = 0;
   action_index = 0;
-  search_pattern = 0;
+  db_filename = 0;
+  db_root_path = 0;
 
   setlocale(LC_ALL, "");
 
-  while ((c = getopt_long(argc, argv, "hvip:s:",
+  nb_search_patterns = 0;
+
+  while ((c = getopt_long(argc, argv, "hvip:s:d:r:",
                           long_options, NULL)) != -1) {
 
     switch(c) {
@@ -373,17 +517,20 @@ int main(int argc, char **argv) {
       action_index = 1;
       break;
 
-    case 'p':
+    case 'd':
       db_filename = strdup(optarg);
-      /* printf("db_filename=\"%s\"\n", db_filename); */
+      break;
+
+    case 'r':
+      db_root_path = strdup(optarg);
       break;
 
     case 's':
-      if(search_pattern) {
-        fprintf(stderr, "mymail: Search pattern already defined.\n");
+      if(nb_search_patterns == MAX_NB_SEARCH_PATTERNS) {
+        fprintf(stderr, "mymail: Too many search patterns.\n");
         exit(EXIT_FAILURE);
       }
-      search_pattern = strdup(optarg);
+      search_pattern[nb_search_patterns++] = strdup(optarg);
       break;
 
     default:
@@ -394,10 +541,29 @@ int main(int argc, char **argv) {
 
   if(!db_filename) {
     char *default_db_filename = getenv("MYMAIL_DB_FILE");
-    if(!default_db_filename) { default_db_filename = "/tmp/mymail.db"; }
+
+    if(!default_db_filename) {
+      default_db_filename = "mymail.db";
+    }
+
     db_filename = strdup(default_db_filename);
   }
 
+  if(!db_root_path) {
+    char *default_db_root_path = getenv("MYMAIL_DB_ROOT");
+
+    if(default_db_root_path) {
+      db_root_path = strdup(default_db_root_path);
+    }
+  }
+
+  if(!db_root_path) {
+    fprintf(stderr,
+            "mymail: db root path is not set\n");
+    exit(EXIT_FAILURE);
+  }
+
+
   if(error) {
     print_usage(stderr);
     exit(EXIT_FAILURE);
@@ -409,10 +575,13 @@ int main(int argc, char **argv) {
   }
 
   if(action_index) {
-    FILE *db_file = fopen(db_filename, "w");
+    FILE *db_file;
+
+    db_file = fopen(db_filename, "w");
+
     if(!db_file) {
       fprintf(stderr,
-              "mymail: Can not open \"%s\" for writing: %s\n",
+              "mymail: Cannot open \"%s\" for writing: %s\n",
               db_filename,
               strerror(errno));
       exit(EXIT_FAILURE);
@@ -425,14 +594,17 @@ int main(int argc, char **argv) {
         fprintf(stderr,
                 "mymail: Syntax error in regexp \"%s\" for field \"%s\".\n",
                 fields_to_parse[f].regexp_string,
-                fields_to_parse[f].name);
+                field_names[fields_to_parse[f].id]);
         exit(EXIT_FAILURE);
       }
     }
 
+    fprintf(db_file, "%s version_%s raw version\n", MYMAIL_DB_MAGIC_TOKEN, VERSION);
+
     while(optind < argc) {
-      process_entry(argv[optind],
-                    nb_fields_to_parse, fields_to_parse, db_file);
+      recursive_index_mbox(db_file,
+                           argv[optind],
+                           nb_fields_to_parse, fields_to_parse);
       optind++;
     }
 
@@ -444,36 +616,39 @@ int main(int argc, char **argv) {
   }
 
   else {
-    if(search_pattern) {
-      FILE *db_file;
-      char *search_name;
-      char *search_regexp_string;
-      search_name = search_pattern;
-      search_regexp_string = segment_next_field(search_pattern);
-      if(!*search_regexp_string) {
-        fprintf(stderr,
-                "Syntax error in the search pattern.\n");
-        exit(EXIT_FAILURE);
-      }
 
-      db_file = fopen(db_filename, "r");
+    if(nb_search_patterns > 0) {
+      int search_ids[MAX_NB_SEARCH_PATTERNS];
+      char *search_regexp_strings[MAX_NB_SEARCH_PATTERNS];
+      int m, n;
+
+      for(n = 0; n < nb_search_patterns; n++) {
+        search_regexp_strings[n] = segment_next_field(search_pattern[n]);
+        search_ids[n] = -1;
+        for(m = 0; (m < MAX_ID) && search_ids[n] == -1; m++) {
+          if(strncmp(field_names[m], search_pattern[n], strlen(search_pattern[n])) == 0) {
+            search_ids[n] = m;
+          }
+        }
+      }
 
-      if(!db_file) {
+      if(!*search_regexp_strings) {
         fprintf(stderr,
-                "mymail: Can not open \"%s\" for reading: %s\n",
-                db_filename,
-                strerror(errno));
+                "Syntax error in the search pattern.\n");
         exit(EXIT_FAILURE);
       }
 
-      search_in_db(search_name, search_regexp_string, db_file);
+      recursive_search_in_db(db_root_path,
+                             nb_search_patterns, search_ids, search_regexp_strings);
 
-      fclose(db_file);
-      free(search_pattern);
+      for(n = 0; n < nb_search_patterns; n++) {
+        free(search_pattern[n]);
+      }
     }
   }
 
   free(db_filename);
+  free(db_root_path);
 
   exit(EXIT_SUCCESS);
 }