3 * Copyright (c) 2013 Francois Fleuret
4 * Written by Francois Fleuret <francois@fleuret.org>
6 * This file is part of mymail.
8 * mymail is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 3 as
10 * published by the Free Software Foundation.
12 * mymail is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with mymail. If not, see <http://www.gnu.org/licenses/>.
24 This command is a dumb mail indexer. It can either (1) scan
25 directories containing mbox files, and create a db file containing
26 for each mail a list of fields computed from the header, or (2)
27 read such a db file and get all the mails matching regexp-defined
28 conditions on the fields.
30 It is low-tech, simple, light and fast.
/* First token of the header line of a db file: written when indexing
   and checked before a db file is searched. */
48 #define MYMAIL_DB_MAGIC_TOKEN "mymail_index_file"
/* Program version, printed by print_version() and written into the
   db header line. */
49 #define VERSION "0.9.1"
/* Capacity of the per-search arrays (hits[], search_conditions[],
   search_condition_strings[]). */
51 #define MAX_NB_SEARCH_CONDITIONS 10
/* Size of the line buffers passed to fgets() for db and mbox lines. */
53 #define BUFFER_SIZE 65536
/* Source text of the regexp selecting db files during the recursive
   search. */
56 char *db_filename_regexp_string;
/* List of db files to search; main() splits it on ';'. */
58 char *db_filename_list;
/* Name of the result file; main() writes to stdout when it is empty. */
59 char output_filename[PATH_MAX + 1];
67 /********************************************************************/
/* Field names indexed by field id. They are written as the first word
   of the db lines and compared with the field names given in search
   conditions. The entries themselves are not part of this listing. */
82 static char *field_names[] = {
94 /********************************************************************/
/* One search criterion. The code in this file also uses the members
   field_id, negation and regexp, whose declarations are not part of
   this listing. */
96 struct search_condition {
/* Time bounds used by ID_INTERVAL conditions; an interval_stop of 0
   means there is no upper bound. */
100 time_t interval_start, interval_stop;
103 /********************************************************************/
/* Describes one header field extracted while indexing. The code in
   this file uses its members id, regexp_string and regexp; their
   declarations are not part of this listing. */
105 struct parsable_field {
/* Header fields recognised by the indexer. Each visible entry holds a
   regexp anchored at the start of the line, with both letter cases
   spelled out, followed by an all-zero initializer -- presumably the
   compiled regexp slot filled by regcomp() in main(); confirm against
   the struct declaration. Note that the From/Reply-To/Sender pattern
   does not include a trailing space, unlike the three others. */
111 static struct parsable_field fields_to_parse[] = {
115 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
120 "^\\([Ff][Rr][Oo][Mm]:\\|[Rr][Ee][Pp][Ll][Yy]-[Tt][Oo]:\\|[Ss][Ee][Nn][Dd][Ee][Rr]:\\)",
121 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
126 "^\\([Tt][Oo]\\|[Cc][Cc]\\|[Bb][Cc][Cc]\\): ",
127 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
132 "^[Ss][Uu][Bb][Jj][Ee][Cc][Tt]: ",
133 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
138 "^[Dd][Aa][Tt][Ee]: ",
139 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
144 /********************************************************************/
/* Logical exclusive or. Any non-zero argument counts as true; the
   result is 1 when exactly one of a and b is true, 0 otherwise. */
int xor(int a, int b) {
  /* Normalise both operands to 0/1 before comparing them. */
  return !a != !b;
}
/* Splits a space-separated string in place.

   The first word of current is terminated by overwriting the space
   that follows it with '\0'. The returned pointer designates the first
   non-space character after that word.

   If the string contains no space, nothing is written and the returned
   pointer designates the terminating '\0' (an empty string), so the
   scan never steps past the end of the buffer. */
char *segment_next_field(char *current) {
  while(*current && *current != ' ') current++;
  if(*current) {
    *current = '\0'; current++;
    while(*current == ' ') current++;
  }
  return current;
}
/* Truncates the string c in place at its first line terminator
   ('\n' or '\r'). A string without terminator is left unchanged. */
void remove_eof(char *c) {
  while(*c && *c != '\n' && *c != '\r') c++;
  /* Either the first terminator or the existing final '\0'. */
  *c = '\0';
}
162 /********************************************************************/
/* malloc with error checking.

   Returns a block of n bytes. If the allocation fails, a diagnostic
   is printed on stderr and the program exits with EXIT_FAILURE, so
   callers never have to test the result. A NULL result for a
   zero-byte request is not an error, since malloc(0) may legitimately
   return NULL. */
void *safe_malloc(size_t n) {
  void *p = malloc(n);
  if(!p && n != 0) {
    fprintf(stderr,
            "mymail: cannot allocate memory: %s\n", strerror(errno));
    exit(EXIT_FAILURE);
  }
  return p;
}
176 /*********************************************************************/
/* Prints the program name, VERSION and the UNAME string on out.
   NOTE(review): UNAME is not defined in the visible part of this
   file -- presumably supplied by the build; confirm. */
178 void print_version(FILE *out) {
179 fprintf(out, "mymail version %s (%s)\n", VERSION, UNAME);
/* Writes the help text on out: the author line, the usage synopsis
   and, for every option, its short and long spellings followed by a
   one-line description. */
182 void print_usage(FILE *out) {
184 fprintf(out, "Written by Francois Fleuret <francois@fleuret.org>.\n");
186 fprintf(out, "Usage: mymail [options] [<mbox dir1> [<mbox dir2> ...]|<db file1> [<db file2> ...]]\n");
188 fprintf(out, " -h, --help\n");
189 fprintf(out, " show this help\n");
190 fprintf(out, " -v, --version\n");
191 fprintf(out, " print the version number\n");
192 fprintf(out, " -q, --quiet\n");
193 fprintf(out, " do not print information during search\n");
194 fprintf(out, " -p <db filename pattern>, --db-pattern <db filename pattern>\n");
195 fprintf(out, " set the db filename pattern for recursive search\n");
196 fprintf(out, " -r <db root path>, --db-root <db root path>\n");
197 fprintf(out, " set the db root path for recursive search\n");
198 fprintf(out, " -l <db filename list>, --db-list <db filename list>\n");
199 fprintf(out, " set the semicolon-separated list of db files for search\n");
200 fprintf(out, " -s <search pattern>, --search <search pattern>\n");
201 fprintf(out, " search for matching mails in the db file\n");
202 fprintf(out, " -d <db filename>, --db-file <db filename>\n");
203 fprintf(out, " set the db filename for indexing\n");
204 fprintf(out, " -i, --index\n");
205 fprintf(out, " index mails\n");
206 fprintf(out, " -o <output filename>, --output <output filename>\n");
207 fprintf(out, " set the result file, use stdout if unset\n");
210 /*********************************************************************/
/* Tells whether a directory entry has to be skipped by the recursive
   scans.

   Returns non-zero for every name starting with '.', which covers
   ".", ".." and hidden files, unless the dot is immediately followed
   by '/' (a relative path such as "./mail"). Returns 0 otherwise,
   including for the empty string. */
int ignore_entry(const char *name) {
  /* name[1] is only read when name[0] is '.', hence never past the
     terminating '\0'. */
  return name[0] == '.' && name[1] != '/';
}
/* Tests one (field id, value) pair read from a db file against a
   single search condition.

   For an ID_INTERVAL condition, the visible code handles
   ID_LEADING_LINE values: it skips the first word, parses the rest as
   a date with strptime() and compares the time t to the condition's
   interval, an interval_stop of 0 meaning no upper bound. The
   computation of t is on a line absent from this listing.

   For the other conditions, the visible code compares the field ids
   (ID_PARTICIPANT and ID_FROM conditions also accept other ids, such
   as ID_LEADING_LINE) and runs the condition's regexp on the value.

   NOTE(review): the operators joining these tests and the return
   paths are not part of this listing; confirm before relying on the
   exact combination. */
219 int mbox_line_match_search(struct search_condition *condition,
220 int mbox_id, char *mbox_value) {
222 if(condition->field_id == ID_INTERVAL) {
223 if(mbox_id == ID_LEADING_LINE) {
/* Skip the first word and the spaces after it to reach the date. */
229 while(*c && *c != ' ') c++; while(*c && *c == ' ') c++;
/* NOTE(review): the result of strptime() is not checked on this line. */
230 strptime(c, "%a %b %e %k:%M:%S %Y", &tm);
233 return (t >= condition->interval_start &&
234 (condition->interval_stop == 0 ||
235 t <= condition->interval_stop));
243 (condition->field_id == mbox_id)
247 (condition->field_id == ID_PARTICIPANT && (mbox_id == ID_LEADING_LINE ||
248 mbox_id == ID_FROM ||
252 (condition->field_id == ID_FROM && mbox_id == ID_LEADING_LINE)
256 regexec(&condition->regexp, mbox_value, 0, 0, 0) == 0;
/* Scans an open db file line by line and copies the matching mails to
   output_file.

   Every db line is split into a name (its first word) and a value.
   A "mail" line holds the position of a mail in its mbox file followed
   by the mbox file name. Reaching one first evaluates the previous
   mail, if any: hits[] is compared with the negation flag of every
   non-body condition; body conditions are then checked by re-reading
   the mail from the mbox file; a mail fulfilling everything is copied
   from the mbox file to output_file, up to the end of the file or the
   next line starting with "From " that follows an empty line. hits[]
   is then reset and the new position and file name are stored.
   Any other line is mapped to a field id through field_names and its
   match results are OR-ed into hits[].

   NOTE(review): many lines of this function are absent from this
   listing (declarations, else branches, closing braces), so the
   handling of the last mail of the db in particular cannot be
   confirmed here.

   NOTE(review): the mbox file name is copied with strcpy() from a
   BUFFER_SIZE line into a PATH_MAX + 1 buffer; no length check is
   visible -- verify that db files are trusted or add a bound. */
260 void search_in_db(FILE *db_file,
261 int nb_search_conditions,
262 struct search_condition *search_conditions,
265 int hits[MAX_NB_SEARCH_CONDITIONS];
266 char raw_db_line[BUFFER_SIZE];
267 char raw_mbox_line[BUFFER_SIZE];
268 char current_mail_filename[PATH_MAX + 1];
269 unsigned long int current_position_in_mail;
270 char *mbox_name, *mbox_value;
272 int already_written, m, n;
273 int last_mbox_line_was_empty;
274 int nb_body_conditions, nb_fulfilled_body_conditions;
276 current_position_in_mail = 0;
279 for(n = 0; n < nb_search_conditions; n++) { hits[n] = 0; }
/* Body conditions are counted apart: they require opening the mbox
   file itself. */
281 nb_body_conditions = 0;
282 for(n = 0; n < nb_search_conditions; n++) {
283 if(search_conditions[n].field_id == ID_BODY) {
284 nb_body_conditions++;
/* An empty file name means that no mail has been read yet. */
288 strcpy(current_mail_filename, "");
290 while(fgets(raw_db_line, BUFFER_SIZE, db_file)) {
291 mbox_name = raw_db_line;
292 mbox_value = segment_next_field(raw_db_line);
294 if(strcmp("mail", mbox_name) == 0) {
295 char *position_in_file_string;
298 if(current_mail_filename[0]) {
300 /* We first check all conditions but the body ones */
/* The loop runs to nb_search_conditions only if every non-body
   condition has a hit state different from its negation flag. */
302 for(n = 0; n < nb_search_conditions &&
303 ((search_conditions[n].field_id == ID_BODY) ||
304 xor(hits[n], search_conditions[n].negation)); n++);
306 if(n == nb_search_conditions) {
308 /* all conditions but the body ones are fine, check the body
311 nb_fulfilled_body_conditions = 0;
313 if(nb_body_conditions > 0) {
318 mail_file = fopen(current_mail_filename, "r");
322 "mymail: Cannot open mbox '%s' for body scan.\n",
323 current_mail_filename);
327 fseek(mail_file, current_position_in_mail, SEEK_SET);
329 if(fgets(raw_mbox_line, BUFFER_SIZE, mail_file)) {
330 while(nb_fulfilled_body_conditions < nb_body_conditions) {
331 last_mbox_line_was_empty = (raw_mbox_line[0] == '\n');
333 if(last_mbox_line_was_empty) { header = 0; }
336 for(n = 0; n < nb_search_conditions; n++) {
337 if(search_conditions[n].field_id == ID_BODY && !hits[n]) {
339 (regexec(&search_conditions[n].regexp, raw_mbox_line, 0, 0, 0) == 0);
341 nb_fulfilled_body_conditions++;
347 if(!fgets(raw_mbox_line, BUFFER_SIZE, mail_file) ||
348 (last_mbox_line_was_empty && strncmp(raw_mbox_line, "From ", 5) == 0))
356 if(nb_body_conditions == nb_fulfilled_body_conditions) {
359 mail_file = fopen(current_mail_filename, "r");
363 "mymail: Cannot open mbox '%s' for mail extraction.\n",
364 current_mail_filename);
368 fseek(mail_file, current_position_in_mail, SEEK_SET);
370 if(fgets(raw_mbox_line, BUFFER_SIZE, mail_file)) {
371 last_mbox_line_was_empty = 1;
372 fprintf(output_file, "%s", raw_mbox_line);
374 if(!fgets(raw_mbox_line, BUFFER_SIZE, mail_file) ||
375 (last_mbox_line_was_empty && strncmp(raw_mbox_line, "From ", 5) == 0))
377 last_mbox_line_was_empty = (raw_mbox_line[0] == '\n');
378 fprintf(output_file, "%s", raw_mbox_line);
387 for(n = 0; n < nb_search_conditions; n++) { hits[n] = 0; }
389 position_in_file_string = mbox_value;
390 mail_filename = segment_next_field(mbox_value);
391 current_position_in_mail = atol(position_in_file_string);
392 strcpy(current_mail_filename, mail_filename);
394 remove_eof(current_mail_filename);
400 for(m = 0; (m < MAX_ID) && mbox_id == -1; m++) {
401 if(strncmp(field_names[m], mbox_name, strlen(mbox_name)) == 0) {
405 for(n = 0; n < nb_search_conditions; n++) {
406 hits[n] |= mbox_line_match_search(&search_conditions[n],
407 mbox_id, mbox_value);
/* Searches the db files found at or below entry_name.

   After lstat(), one branch opens entry_name with opendir() and
   recurses into every entry not rejected by ignore_entry(). The other
   branch matches the last path component of entry_name against
   db_filename_regexp; on a match the file is opened, its first line
   must start with MYMAIL_DB_MAGIC_TOKEN, and it is handed to
   search_in_db().

   NOTE(review): the test choosing between the two branches, the error
   exits and the closing of the directory and file are on lines absent
   from this listing. */
413 void recursive_search_in_db(const char *entry_name, regex_t *db_filename_regexp,
414 int nb_search_conditions,
415 struct search_condition *search_conditions,
418 struct dirent *dir_e;
420 char raw_db_line[BUFFER_SIZE];
421 char subname[PATH_MAX + 1];
423 if(lstat(entry_name, &sb) != 0) {
425 "mymail: Cannot stat \"%s\": %s\n",
431 dir = opendir(entry_name);
434 while((dir_e = readdir(dir))) {
435 if(!ignore_entry(dir_e->d_name)) {
/* snprintf() truncates paths that do not fit in PATH_MAX bytes. */
436 snprintf(subname, PATH_MAX, "%s/%s", entry_name, dir_e->d_name);
437 recursive_search_in_db(subname, db_filename_regexp,
438 nb_search_conditions, search_conditions,
/* filename ends up designating the text after the last '/'. */
446 const char *s = entry_name, *filename = entry_name;
447 while(*s) { if(*s == '/') { filename = s+1; } s++; }
449 if(regexec(db_filename_regexp, filename, 0, 0, 0) == 0) {
450 FILE *db_file = fopen(entry_name, "r");
453 printf("Searching in '%s' ... ", entry_name);
459 "mymail: Cannot open \"%s\" for reading: %s\n",
/* The first line of a db file has to start with the magic token. */
465 if(fgets(raw_db_line, BUFFER_SIZE, db_file)) {
466 if(strncmp(raw_db_line, MYMAIL_DB_MAGIC_TOKEN, strlen(MYMAIL_DB_MAGIC_TOKEN))) {
468 "mymail: Header line in '%s' does not match the mymail db format.\n",
474 "mymail: Cannot read the header line in '%s'.\n",
479 search_in_db(db_file, nb_search_conditions, search_conditions, output_file);
491 /*********************************************************************/
/* Indexes one header line. The regexp of each of the
   nb_fields_to_parse fields is tried on raw_mbox_line; for every
   match, a line made of the field name, a space and the text
   following the matched prefix is written to db_file. */
493 void index_one_mbox_line(int nb_fields_to_parse, struct parsable_field *fields_to_parse,
494 char *raw_mbox_line, FILE *db_file) {
497 for(f = 0; f < nb_fields_to_parse; f++) {
498 if(regexec(&fields_to_parse[f].regexp, raw_mbox_line, 1, &matches, 0) == 0) {
499 fprintf(db_file, "%s %s\n",
500 field_names[fields_to_parse[f].id],
/* rm_eo is the offset just after the matched field prefix. */
501 raw_mbox_line + matches.rm_eo);
/* Indexes one mbox file into db_file.

   The file is read line by line while position_in_file accumulates
   the length of the lines read. From the visible code: a line starting
   with "From " after an empty line is the start of a mail (it is
   reported on stderr when met inside a header), an empty line ends
   the header, and a "mail <position> <mbox filename>" line is written
   to the db. Header lines starting with a space or a tab are
   continuations: they are appended to full_line after a single space.
   Any other header line must start with a letter; the pending
   full_line is passed to index_one_mbox_line() and a new one is
   started from the current line without its end of line.

   NOTE(review): the conditions guarding these steps are on lines
   absent from this listing. No bound check is visible where
   continuation lines are appended to full_line, and end_of_full_line
   starts as a null pointer; confirm both against the full source. */
506 void index_mbox(const char *mbox_filename,
507 int nb_fields_to_parse, struct parsable_field *fields_to_parse,
509 char raw_mbox_line[BUFFER_SIZE], full_line[BUFFER_SIZE];
510 char *end_of_full_line;
512 int in_header, new_header, last_mbox_line_was_empty;
513 unsigned long int position_in_file;
515 file = fopen(mbox_filename, "r");
518 fprintf(stderr, "mymail: Cannot open '%s'.\n", mbox_filename);
519 if(paranoid) { exit(EXIT_FAILURE); }
526 position_in_file = 0;
527 end_of_full_line = 0;
/* Starting as if an empty line had been read lets a "From " on the
   very first line be recognised. */
529 last_mbox_line_was_empty = 1;
531 while(fgets(raw_mbox_line, BUFFER_SIZE, file)) {
532 if(last_mbox_line_was_empty && strncmp(raw_mbox_line, "From ", 5) == 0) {
535 "Got a ^\"From \" in the header in %s:%lu.\n",
536 mbox_filename, position_in_file);
537 fprintf(stderr, "%s", raw_mbox_line);
538 if(paranoid) { exit(EXIT_FAILURE); }
542 } else if(raw_mbox_line[0] == '\n') {
543 if(in_header) { in_header = 0; }
546 last_mbox_line_was_empty = (raw_mbox_line[0] == '\n');
550 fprintf(db_file, "mail %lu %s\n", position_in_file, mbox_filename);
/* Continuation line: drop its leading blanks and append it to the
   pending header line. */
554 if(raw_mbox_line[0] == ' ' || raw_mbox_line[0] == '\t') {
555 char *start = raw_mbox_line;
556 while(*start == ' ' || *start == '\t') start++;
557 *(end_of_full_line++) = ' ';
558 strcpy(end_of_full_line, start);
559 while(*end_of_full_line && *end_of_full_line != '\n') {
562 *end_of_full_line = '\0';
567 if(!((raw_mbox_line[0] >= 'a' && raw_mbox_line[0] <= 'z') ||
568 (raw_mbox_line[0] >= 'A' && raw_mbox_line[0] <= 'Z'))) {
570 "Header line syntax error %s:%lu.\n",
571 mbox_filename, position_in_file);
572 fprintf(stderr, "%s", raw_mbox_line);
577 index_one_mbox_line(nb_fields_to_parse, fields_to_parse, full_line, db_file);
580 end_of_full_line = full_line;
581 strcpy(end_of_full_line, raw_mbox_line);
582 while(*end_of_full_line && *end_of_full_line != '\n') {
585 *end_of_full_line = '\0';
590 position_in_file += strlen(raw_mbox_line);
/* Indexes into db_file the mbox files found at or below entry_name.

   After lstat(), one branch opens entry_name with opendir() and
   recurses into every entry not rejected by ignore_entry(); the other
   branch passes entry_name to index_mbox().

   NOTE(review): the test choosing between the two branches and the
   closing of the directory are on lines absent from this listing. */
596 void recursive_index_mbox(FILE *db_file,
597 const char *entry_name,
598 int nb_fields_to_parse, struct parsable_field *fields_to_parse) {
600 struct dirent *dir_e;
602 char subname[PATH_MAX + 1];
604 if(lstat(entry_name, &sb) != 0) {
606 "mymail: Cannot stat \"%s\": %s\n",
612 dir = opendir(entry_name);
615 while((dir_e = readdir(dir))) {
616 if(!ignore_entry(dir_e->d_name)) {
/* snprintf() truncates paths that do not fit in PATH_MAX bytes. */
617 snprintf(subname, PATH_MAX, "%s/%s", entry_name, dir_e->d_name);
618 recursive_index_mbox(db_file, subname, nb_fields_to_parse, fields_to_parse);
623 index_mbox(entry_name, nb_fields_to_parse, fields_to_parse, db_file);
627 /*********************************************************************/
629 /* For long options that have no equivalent short option, use a
630 non-character as a pseudo short option, starting with CHAR_MAX + 1. */
/* NOTE(review): OPT_BASH_MODE is not referenced by any line of this
   listing; check whether it is still needed. */
632 OPT_BASH_MODE = CHAR_MAX + 1
/* Long option table passed to getopt_long() by main(). Each entry maps
   a long name to the short option character returned for it. The
   second member tells whether the option takes an argument; the
   entries written with the literals 1 and 0 correspond to
   required_argument and no_argument in <getopt.h>. */
635 static struct option long_options[] = {
636 { "help", no_argument, 0, 'h' },
637 { "version", no_argument, 0, 'v' },
638 { "quiet", no_argument, 0, 'q' },
639 { "db-file", 1, 0, 'd' },
640 { "db-pattern", 1, 0, 'p' },
641 { "db-root", 1, 0, 'r' },
642 { "db-list", 1, 0, 'l' },
643 { "search", 1, 0, 's' },
644 { "index", 0, 0, 'i' },
645 { "output", 1, 0, 'o' },
649 /*********************************************************************/
/* Entry point. From the lines visible in this listing:

   1. the command line is parsed with getopt_long();
   2. unset settings are taken from the environment variables
      MYMAIL_DB_FILE, MYMAIL_DB_PATTERN, MYMAIL_DB_ROOT and
      MYMAIL_DB_LIST, with built-in defaults for the db file name and
      the db file name pattern;
   3. the result stream is opened (stdout when no output file is set);
   4. for indexing, the db file is created, the field regexps are
      compiled, the header line is written and every remaining
      argument is indexed with recursive_index_mbox();
   5. for searching, each -s string is turned into a search condition,
      then the db files found under db_root_path, listed in
      db_filename_list and given as arguments are searched in turn;
   6. the compiled regexps and duplicated strings are released.

   NOTE(review): the option switch and several tests are absent from
   this listing, so the exact conditions selecting each step cannot be
   confirmed here. */
651 int main(int argc, char **argv) {
652 int error = 0, show_help = 0;
653 const int nb_fields_to_parse = sizeof(fields_to_parse) / sizeof(struct parsable_field);
656 int nb_search_conditions;
657 char *search_condition_strings[MAX_NB_SEARCH_CONDITIONS];
664 db_filename_list = 0;
667 setlocale(LC_ALL, "");
669 nb_search_conditions = 0;
671 while ((c = getopt_long(argc, argv, "hvqip:s:d:r:l:o:",
672 long_options, NULL)) != -1) {
681 print_version(stdout);
693 db_filename = strdup(optarg);
697 db_filename_regexp_string = strdup(optarg);
/* output_filename holds PATH_MAX + 1 bytes, so copying at most
   PATH_MAX bytes never writes its last byte. */
701 strncpy(output_filename, optarg, PATH_MAX);
705 db_root_path = strdup(optarg);
709 db_filename_list = strdup(optarg);
/* search_condition_strings[] has MAX_NB_SEARCH_CONDITIONS slots. */
713 if(nb_search_conditions == MAX_NB_SEARCH_CONDITIONS) {
714 fprintf(stderr, "mymail: Too many search patterns.\n");
717 search_condition_strings[nb_search_conditions++] = strdup(optarg);
/* Fallbacks: the environment variable first, then a built-in default
   where there is one. */
727 char *default_db_filename = getenv("MYMAIL_DB_FILE");
729 if(!default_db_filename) {
730 default_db_filename = "mymail.db";
733 db_filename = strdup(default_db_filename);
736 if(!db_filename_regexp_string) {
737 char *default_db_filename_regexp_string = getenv("MYMAIL_DB_PATTERN");
739 if(!default_db_filename_regexp_string) {
740 default_db_filename_regexp_string = "^mymail.db$";
743 db_filename_regexp_string = strdup(default_db_filename_regexp_string);
747 char *default_db_root_path = getenv("MYMAIL_DB_ROOT");
749 if(default_db_root_path) {
750 db_root_path = strdup(default_db_root_path);
754 if(!db_filename_list) {
755 char *default_db_filename_list = getenv("MYMAIL_DB_LIST");
757 if(default_db_filename_list) {
758 db_filename_list = strdup(default_db_filename_list);
762 if(output_filename[0]) {
763 output_file = fopen(output_filename, "w");
767 "mymail: Cannot open result file \"%s\" for writing: %s\n",
773 output_file = stdout;
790 db_file = fopen(db_filename, "w");
794 "mymail: Cannot open \"%s\" for writing: %s\n",
800 for(f = 0; f < nb_fields_to_parse; f++) {
801 if(regcomp(&fields_to_parse[f].regexp,
802 fields_to_parse[f].regexp_string,
805 "mymail: Syntax error in regexp \"%s\" for field \"%s\".\n",
806 fields_to_parse[f].regexp_string,
807 field_names[fields_to_parse[f].id]);
/* Header line: its first token is the one checked before a db file is
   searched. */
812 fprintf(db_file, "%s version_%s raw\n", MYMAIL_DB_MAGIC_TOKEN, VERSION);
814 while(optind < argc) {
815 recursive_index_mbox(db_file,
817 nb_fields_to_parse, fields_to_parse);
823 for(f = 0; f < nb_fields_to_parse; f++) {
824 regfree(&fields_to_parse[f].regexp);
830 if(nb_search_conditions > 0) {
831 struct search_condition search_conditions[MAX_NB_SEARCH_CONDITIONS];
832 char *search_field, *search_regexp_string;
835 for(n = 0; n < nb_search_conditions; n++) {
836 search_field = search_condition_strings[n];
/* A leading '!' negates the condition. */
838 if(search_field[0] == '!') {
840 search_conditions[n].negation = 1;
842 search_conditions[n].negation = 0;
/* "today" selects the last 24 hours with no upper bound, not the
   current calendar day. */
845 if(strcmp(search_field, "today") == 0) {
846 search_conditions[n].field_id = ID_INTERVAL;
847 search_conditions[n].interval_start = time(0) - 3600 * 24;
848 search_conditions[n].interval_stop = 0;
/* "yesterday" selects the span between 48 and 24 hours ago. */
851 else if(strcmp(search_field, "yesterday") == 0) {
852 search_conditions[n].field_id = ID_INTERVAL;
853 search_conditions[n].interval_start = time(0) - 2 * 3600 * 24;
854 search_conditions[n].interval_stop = time(0) - 3600 * 24;
/* Any other condition has the form "<field name> <regexp>". */
858 search_regexp_string = segment_next_field(search_condition_strings[n]);
860 search_conditions[n].field_id = -1;
/* Only strlen(search_field) characters are compared, so a prefix of a
   field name selects that field. */
862 for(m = 0; (m < MAX_ID) && search_conditions[n].field_id == -1; m++) {
863 if(strncmp(field_names[m], search_field, strlen(search_field)) == 0) {
864 search_conditions[n].field_id = m;
868 if(search_conditions[n].field_id == -1) {
870 "mymail: Syntax error in field name \"%s\".\n",
875 if(regcomp(&search_conditions[n].regexp,
876 search_regexp_string,
879 "mymail: Syntax error in regexp \"%s\" for field \"%s\".\n",
880 search_regexp_string,
881 field_names[search_conditions[n].field_id]);
887 /* Recursive search if db_root_path is set */
890 regex_t db_filename_regexp;
891 if(regcomp(&db_filename_regexp,
892 db_filename_regexp_string,
895 "mymail: Syntax error in regexp \"%s\".\n",
896 db_filename_regexp_string);
900 recursive_search_in_db(db_root_path, &db_filename_regexp,
901 nb_search_conditions, search_conditions,
904 regfree(&db_filename_regexp);
907 /* Search in all db files listed in db_filename_list */
909 if(db_filename_list) {
910 char db_filename[PATH_MAX + 1];
914 s = db_filename_list;
/* Skip the separators, then copy one name up to the next ';'.
   NOTE(review): t is set on a line absent from this listing,
   presumably to db_filename, and no bound check is visible. */
918 while(*s == ';') { s++; }
919 while(*s && *s != ';') { *t++ = *s++; }
923 db_file = fopen(db_filename, "r");
927 "mymail: Cannot open \"%s\" for reading: %s\n",
933 search_in_db(db_file, nb_search_conditions, search_conditions, output_file);
940 /* Search in all db files listed in the command arguments */
942 while(optind < argc) {
943 FILE *db_file = fopen(argv[optind], "r");
947 "mymail: Cannot open \"%s\" for reading: %s\n",
953 search_in_db(db_file, nb_search_conditions, search_conditions, output_file);
959 for(n = 0; n < nb_search_conditions; n++) {
960 regfree(&search_conditions[n].regexp);
961 free(search_condition_strings[n]);
966 if(output_file != stdout) {
971 free(db_filename_regexp_string);
973 free(db_filename_list);