3 * Copyright (c) 2013 Francois Fleuret
4 * Written by Francois Fleuret <francois@fleuret.org>
6 * This file is part of mymail.
8 * mymail is free software: you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 3 as
10 * published by the Free Software Foundation.
12 * mymail is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with mymail. If not, see <http://www.gnu.org/licenses/>.
24 This command is a dumb mail indexer. It can either (1) scan
25 directories containing mbox files, and create a db file containing
26 for each mail a list of fields computed from the header, or (2)
27 read such a db file and get all the mails matching regexp-defined
28 conditions on the fields.
30 It is low-tech, simple, light and fast.
/* First token of the header line of a db file: written when a db is
   created and compared against when a db is opened for a recursive
   search. */
47 #define MYMAIL_DB_MAGIC_TOKEN "mymail_index_file"
/* Program version, printed by print_version() and stored in the db
   header line. */
48 #define VERSION "0.9.1"
/* Maximum number of search conditions accepted on the command line;
   also the size of the per-mail hit table in search_in_db(). */
50 #define MAX_NB_SEARCH_CONDITIONS 10
/* Size in bytes of the buffers used to read db and mbox lines. */
52 #define BUFFER_SIZE 65536
/* Regexp that a file name must match to be searched as a db file
   during a recursive search. */
55 char *db_filename_regexp_string;
/* List of db files to search; entries are separated by ';'. */
57 char *db_filename_list;
/* Name of the result file; when it is empty, main() writes to
   stdout. */
58 char output_filename[PATH_MAX + 1];
64 /********************************************************************/
/* Names of the db fields, indexed by field id. index_one_mbox_line()
   writes them as the first token of each db line, and the search
   code compares db keys and condition names against them. */
77 static char *field_names[] = {
87 /********************************************************************/
/* One search criterion. The code in this file uses three members:
   field_id (the field the criterion applies to), negation (whether
   the test is inverted) and regexp (the compiled pattern). */
89 struct search_condition {
95 /********************************************************************/
/* One header field extracted by the indexer. The code in this file
   uses three members: id (index into field_names), regexp_string
   (the pattern source) and regexp (its compiled form). */
97 struct parsable_field {
103 static struct parsable_field fields_to_parse[] = {
106 "^\\(From \\|[Ff][Rr][Oo][Mm]:\\|[R][r][E][e][P][p][L][l][Y][y]-[T][t][O][o]:\\)",
107 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
112 "^\\([Tt][Oo]\\|[Cc][Cc]\\|[Bb][Cc][Cc]\\): ",
113 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
118 "^[Ss][Uu][Bb][Jj][Ee][Cc][Tt]: ",
119 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
124 "^[Dd][Aa][Tt][Ee]: ",
125 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
130 /********************************************************************/
/* Logical exclusive or: returns 1 when exactly one of a and b is
   non-zero, and 0 otherwise. */

int xor(int a, int b) {
  return !a != !b;
}
/* Splits a space-separated string in place. The first run of
   non-space characters of current is terminated with '\0', and the
   function returns a pointer to the next field, i.e. the first
   character after the run of spaces that follows. When current
   holds a single field, the returned pointer designates its
   terminating '\0': an empty string inside the same buffer. */

char *segment_next_field(char *current) {
  while(*current && *current != ' ') current++;

  /* Only step forward when we stopped on a separator. Stepping over
     the terminating '\0' would return a pointer past the end of the
     string. */
  if(*current) {
    *current++ = '\0';
    while(*current == ' ') current++;
  }

  return current;
}
/* Truncates the string c at its first '\n' or '\r', if there is
   one. */

void remove_eof(char *c) {
  char *end;

  for(end = c; *end != '\0'; end++) {
    if(*end == '\n' || *end == '\r') { break; }
  }

  *end = '\0';
}
148 /********************************************************************/
/* malloc with error checking. Returns a non-null pointer to a block
   of at least n bytes, or prints an error message on stderr and
   terminates the process with EXIT_FAILURE. The caller owns the
   block and releases it with free(). */

void *safe_malloc(size_t n) {
  /* malloc(0) is allowed to return a null pointer on success; ask
     for one byte so that a null result always means failure. */
  void *r = malloc(n > 0 ? n : 1);

  if(!r) {
    fprintf(stderr,
            "mymail: cannot allocate memory: %s\n", strerror(errno));
    exit(EXIT_FAILURE);
  }

  return r;
}
162 /*********************************************************************/
164 void print_version(FILE *out) {
165 fprintf(out, "mymail version %s (%s)\n", VERSION, UNAME);
/* Prints the command-line help on out: the author line, the usage
   synopsis, then two lines per option (its spellings, followed by
   a short description). */
168 void print_usage(FILE *out) {
170 fprintf(out, "Written by Francois Fleuret <francois@fleuret.org>.\n");
/* Positional arguments are mbox directories when indexing and db
   files when searching. */
172 fprintf(out, "Usage: mymail [options] [<mbox dir1> [<mbox dir2> ...]|<db file1> [<db file2> ...]]\n");
174 fprintf(out, " -h, --help\n");
175 fprintf(out, " show this help\n");
176 fprintf(out, " -v, --version\n");
177 fprintf(out, " print the version number\n");
178 fprintf(out, " -q, --quiet\n");
179 fprintf(out, " do not print information during search\n");
180 fprintf(out, " -p <db filename pattern>, --db-pattern <db filename pattern>\n");
181 fprintf(out, " set the db filename pattern for recursive search\n");
182 fprintf(out, " -r <db root path>, --db-root <db root path>\n");
183 fprintf(out, " set the db root path for recursive search\n");
184 fprintf(out, " -l <db filename list>, --db-list <db filename list>\n");
185 fprintf(out, " set the semicolon-separated list of db files for search\n");
186 fprintf(out, " -s <search pattern>, --search <search pattern>\n");
187 fprintf(out, " search for matching mails in the db file\n");
188 fprintf(out, " -d <db filename>, --db-file <db filename>\n");
189 fprintf(out, " set the db filename for indexing\n");
190 fprintf(out, " -i, --index\n");
191 fprintf(out, " index mails\n");
192 fprintf(out, " -o <output filename>, --output <output filename>\n");
193 fprintf(out, " set the result file, use stdout if unset\n");
196 /*********************************************************************/
/* Tells whether a directory entry has to be skipped during a
   recursive traversal. Returns 1 for any name that starts with '.'
   and whose second character is not '/' -- this covers ".", ".."
   and hidden entries -- and 0 otherwise. */

int ignore_entry(const char *name) {
  if(name[0] != '.') { return 0; }

  return name[1] != '/';
}
205 int mbox_line_match_search(struct search_condition *condition,
206 int mbox_id, char *mbox_value) {
208 (condition->field_id == mbox_id ||
209 (condition->field_id == ID_PARTICIPANT && (mbox_id == ID_FROM || mbox_id == ID_TO)))
211 regexec(&condition->regexp, mbox_value, 0, 0, 0) == 0;
214 void search_in_db(FILE *db_file,
215 int nb_search_conditions,
216 struct search_condition *search_conditions,
219 int hits[MAX_NB_SEARCH_CONDITIONS];
220 char raw_db_line[BUFFER_SIZE];
221 char raw_mbox_line[BUFFER_SIZE];
222 char current_mail_filename[PATH_MAX + 1];
223 unsigned long int current_position_in_mail;
224 char *mbox_name, *mbox_value;
226 int already_written, m, n;
227 int last_mbox_line_was_empty;
228 int nb_body_conditions, nb_fulfilled_body_conditions;
230 current_position_in_mail = 0;
233 for(n = 0; n < nb_search_conditions; n++) { hits[n] = 0; }
235 nb_body_conditions = 0;
236 for(n = 0; n < nb_search_conditions; n++) {
237 if(search_conditions[n].field_id == ID_BODY) {
238 nb_body_conditions++;
242 strcpy(current_mail_filename, "");
244 while(fgets(raw_db_line, BUFFER_SIZE, db_file)) {
245 mbox_name = raw_db_line;
246 mbox_value = segment_next_field(raw_db_line);
248 if(strcmp("mail", mbox_name) == 0) {
249 char *position_in_file_string;
252 if(current_mail_filename[0]) {
254 /* We first check all conditions but the body ones */
256 for(n = 0; n < nb_search_conditions &&
257 ((search_conditions[n].field_id == ID_BODY) ||
258 xor(hits[n], search_conditions[n].negation)); n++);
260 if(n == nb_search_conditions) {
262 /* all conditions but the body ones are fine, check the body
265 nb_fulfilled_body_conditions = 0;
267 if(nb_body_conditions > 0) {
272 mail_file = fopen(current_mail_filename, "r");
276 "mymail: Cannot open mbox '%s' for body scan.\n",
277 current_mail_filename);
281 fseek(mail_file, current_position_in_mail, SEEK_SET);
283 if(fgets(raw_mbox_line, BUFFER_SIZE, mail_file)) {
284 while(nb_fulfilled_body_conditions < nb_body_conditions) {
285 last_mbox_line_was_empty = (raw_mbox_line[0] == '\n');
287 if(last_mbox_line_was_empty) { header = 0; }
290 for(n = 0; n < nb_search_conditions; n++) {
291 if(search_conditions[n].field_id == ID_BODY && !hits[n]) {
293 (regexec(&search_conditions[n].regexp, raw_mbox_line, 0, 0, 0) == 0);
295 nb_fulfilled_body_conditions++;
301 if(!fgets(raw_mbox_line, BUFFER_SIZE, mail_file) ||
302 (last_mbox_line_was_empty && strncmp(raw_mbox_line, "From ", 5) == 0))
310 if(nb_body_conditions == nb_fulfilled_body_conditions) {
313 mail_file = fopen(current_mail_filename, "r");
317 "mymail: Cannot open mbox '%s' for mail extraction.\n",
318 current_mail_filename);
322 fseek(mail_file, current_position_in_mail, SEEK_SET);
324 if(fgets(raw_mbox_line, BUFFER_SIZE, mail_file)) {
325 last_mbox_line_was_empty = 1;
326 fprintf(output_file, "%s", raw_mbox_line);
328 if(!fgets(raw_mbox_line, BUFFER_SIZE, mail_file) ||
329 (last_mbox_line_was_empty && strncmp(raw_mbox_line, "From ", 5) == 0))
331 last_mbox_line_was_empty = (raw_mbox_line[0] == '\n');
332 fprintf(output_file, "%s", raw_mbox_line);
341 for(n = 0; n < nb_search_conditions; n++) { hits[n] = 0; }
343 position_in_file_string = mbox_value;
344 mail_filename = segment_next_field(mbox_value);
345 current_position_in_mail = atol(position_in_file_string);
346 strcpy(current_mail_filename, mail_filename);
348 remove_eof(current_mail_filename);
354 for(m = 0; (m < MAX_ID) && mbox_id == -1; m++) {
355 if(strncmp(field_names[m], mbox_name, strlen(mbox_name)) == 0) {
359 for(n = 0; n < nb_search_conditions; n++) {
360 hits[n] |= mbox_line_match_search(&search_conditions[n],
361 mbox_id, mbox_value);
/* Walks entry_name recursively and runs search_in_db() on the db
   files found. If entry_name can be opened as a directory, each of
   its entries not rejected by ignore_entry() is visited. Otherwise
   entry_name is searched when the part of its name after the last
   '/' matches db_filename_regexp and its first line starts with
   MYMAIL_DB_MAGIC_TOKEN. */
367 void recursive_search_in_db(const char *entry_name, regex_t *db_filename_regexp,
368 int nb_search_conditions,
369 struct search_condition *search_conditions,
372 struct dirent *dir_e;
374 char raw_db_line[BUFFER_SIZE];
375 char subname[PATH_MAX + 1];
377 if(lstat(entry_name, &sb) != 0) {
379 "mymail: Cannot stat \"%s\": %s\n",
/* opendir() is attempted on every entry that could be stat'ed. */
385 dir = opendir(entry_name);
388 while((dir_e = readdir(dir))) {
389 if(!ignore_entry(dir_e->d_name)) {
/* NOTE(review): the result of snprintf() is not used on this line,
   so a path longer than the buffer would be truncated -- confirm
   whether this has to be detected. */
390 snprintf(subname, PATH_MAX, "%s/%s", entry_name, dir_e->d_name);
391 recursive_search_in_db(subname, db_filename_regexp,
392 nb_search_conditions, search_conditions,
/* Make filename point after the last '/' of entry_name, so that the
   regexp is applied to the file name without its directories. */
400 const char *s = entry_name, *filename = entry_name;
401 while(*s) { if(*s == '/') { filename = s+1; } s++; }
403 if(regexec(db_filename_regexp, filename, 0, 0, 0) == 0) {
404 FILE *db_file = fopen(entry_name, "r");
407 printf("Searching in '%s' ... ", entry_name);
412 "mymail: Cannot open \"%s\" for reading: %s\n",
/* The first line of a db file has to start with the magic token. */
418 if(fgets(raw_db_line, BUFFER_SIZE, db_file)) {
419 if(strncmp(raw_db_line, MYMAIL_DB_MAGIC_TOKEN, strlen(MYMAIL_DB_MAGIC_TOKEN))) {
421 "mymail: Header line in '%s' does not match the mymail db format.\n",
427 "mymail: Cannot read the header line in '%s'.\n",
/* The header line has been consumed, the stream is now positioned
   on the second line of the db. */
432 search_in_db(db_file, nb_search_conditions, search_conditions, output_file);
443 /*********************************************************************/
445 void index_one_mbox_line(int nb_fields_to_parse, struct parsable_field *fields_to_parse,
446 char *raw_mbox_line, FILE *db_file) {
449 for(f = 0; f < nb_fields_to_parse; f++) {
450 if(regexec(&fields_to_parse[f].regexp, raw_mbox_line, 1, &matches, 0) == 0) {
451 fprintf(db_file, "%s %s\n",
452 field_names[fields_to_parse[f].id],
453 raw_mbox_line + matches.rm_eo);
/* Indexes the mbox file mbox_filename into db_file. The file is
   read line by line: a "From " line which follows an empty line (or
   starts the file) begins a new mail, for which a
   "mail <position> <filename>" line is written to the db, and the
   header lines of each mail are passed to index_one_mbox_line()
   once their continuation lines have been merged into full_line.
   When paranoid is set, the errors reported here terminate the
   process. */
458 void index_mbox(const char *mbox_filename,
459 int nb_fields_to_parse, struct parsable_field *fields_to_parse,
/* raw_mbox_line is the physical line just read, full_line is the
   header line being assembled from it and its continuations. */
461 char raw_mbox_line[BUFFER_SIZE], full_line[BUFFER_SIZE];
/* Points to the terminating '\0' of full_line, or is null when no
   header line has been stored yet. */
462 char *end_of_full_line;
464 int in_header, new_header, last_mbox_line_was_empty;
/* Byte offset of the current line in the mbox file. */
465 unsigned long int position_in_file;
467 file = fopen(mbox_filename, "r");
470 fprintf(stderr, "mymail: Cannot open '%s'.\n", mbox_filename);
471 if(paranoid) { exit(EXIT_FAILURE); }
478 position_in_file = 0;
479 end_of_full_line = 0;
/* The beginning of the file counts as following an empty line, so
   that a "From " on the first line starts a mail. */
481 last_mbox_line_was_empty = 1;
483 while(fgets(raw_mbox_line, BUFFER_SIZE, file)) {
484 if(last_mbox_line_was_empty && strncmp(raw_mbox_line, "From ", 5) == 0) {
487 "Got a ^\"From \" in the header in %s:%lu.\n",
488 mbox_filename, position_in_file);
489 fprintf(stderr, "%s", raw_mbox_line);
490 if(paranoid) { exit(EXIT_FAILURE); }
/* An empty line ends the header of the current mail. */
494 } else if(raw_mbox_line[0] == '\n') {
495 if(in_header) { in_header = 0; }
498 last_mbox_line_was_empty = (raw_mbox_line[0] == '\n');
/* Start of the db record of the mail. */
502 fprintf(db_file, "mail %lu %s\n", position_in_file, mbox_filename);
/* A line starting with a space or a tab continues the previous
   header line: it is appended to full_line after a single space,
   without its end of line.
   NOTE(review): no bound check on full_line is visible here, and
   end_of_full_line is dereferenced without a null test -- confirm
   that a long folded header or a leading continuation line cannot
   reach this point. */
506 if(raw_mbox_line[0] == ' ' || raw_mbox_line[0] == '\t') {
507 char *start = raw_mbox_line;
508 while(*start == ' ' || *start == '\t') start++;
509 *(end_of_full_line++) = ' ';
510 strcpy(end_of_full_line, start);
511 while(*end_of_full_line && *end_of_full_line != '\n') {
514 *end_of_full_line = '\0';
/* Any other header line has to start with an ASCII letter. */
519 if(!((raw_mbox_line[0] >= 'a' && raw_mbox_line[0] <= 'z') ||
520 (raw_mbox_line[0] >= 'A' && raw_mbox_line[0] <= 'Z'))) {
522 "Header line syntax error %s:%lu.\n",
523 mbox_filename, position_in_file);
524 fprintf(stderr, "%s", raw_mbox_line);
/* The header line stored in full_line is complete: index it, then
   start a new one from the line just read, without its end of
   line. */
529 index_one_mbox_line(nb_fields_to_parse, fields_to_parse, full_line, db_file);
532 end_of_full_line = full_line;
533 strcpy(end_of_full_line, raw_mbox_line);
534 while(*end_of_full_line && *end_of_full_line != '\n') {
537 *end_of_full_line = '\0';
542 position_in_file += strlen(raw_mbox_line);
548 void recursive_index_mbox(FILE *db_file,
549 const char *entry_name,
550 int nb_fields_to_parse, struct parsable_field *fields_to_parse) {
552 struct dirent *dir_e;
554 char subname[PATH_MAX + 1];
556 if(lstat(entry_name, &sb) != 0) {
558 "mymail: Cannot stat \"%s\": %s\n",
564 dir = opendir(entry_name);
567 while((dir_e = readdir(dir))) {
568 if(!ignore_entry(dir_e->d_name)) {
569 snprintf(subname, PATH_MAX, "%s/%s", entry_name, dir_e->d_name);
570 recursive_index_mbox(db_file, subname, nb_fields_to_parse, fields_to_parse);
575 index_mbox(entry_name, nb_fields_to_parse, fields_to_parse, db_file);
579 /*********************************************************************/
581 /* For long options that have no equivalent short option, use a
582 non-character as a pseudo short option, starting with CHAR_MAX + 1.
    Such a value cannot collide with the character code of a short
    option.
    NOTE(review): no visible long_options entry uses OPT_BASH_MODE
    -- confirm it is still needed. */
584 OPT_BASH_MODE = CHAR_MAX + 1
/* Long command-line options for getopt_long(). Each entry returns
   the character of the short option with the same meaning, so both
   spellings are handled by the same code. The table ends with an
   all-zero entry. */

static struct option long_options[] = {
  { .name = "help",       .has_arg = no_argument,       .flag = NULL, .val = 'h' },
  { .name = "version",    .has_arg = no_argument,       .flag = NULL, .val = 'v' },
  { .name = "quiet",      .has_arg = no_argument,       .flag = NULL, .val = 'q' },
  { .name = "db-file",    .has_arg = required_argument, .flag = NULL, .val = 'd' },
  { .name = "db-pattern", .has_arg = required_argument, .flag = NULL, .val = 'p' },
  { .name = "db-root",    .has_arg = required_argument, .flag = NULL, .val = 'r' },
  { .name = "db-list",    .has_arg = required_argument, .flag = NULL, .val = 'l' },
  { .name = "search",     .has_arg = required_argument, .flag = NULL, .val = 's' },
  { .name = "index",      .has_arg = no_argument,       .flag = NULL, .val = 'i' },
  { .name = "output",     .has_arg = required_argument, .flag = NULL, .val = 'o' },
  { .name = NULL,         .has_arg = 0,                 .flag = NULL, .val = 0 }
};
601 /*********************************************************************/
/* Program entry point. Parses the command line with getopt_long(),
   completes the settings from the environment variables
   MYMAIL_DB_FILE, MYMAIL_DB_PATTERN, MYMAIL_DB_ROOT and
   MYMAIL_DB_LIST, opens the result file (or uses stdout), then
   indexes the mbox arguments into the db file and/or runs the search
   conditions over the db files. */
603 int main(int argc, char **argv) {
604 int error = 0, show_help = 0;
/* Number of entries of the fields_to_parse table. */
605 const int nb_fields_to_parse = sizeof(fields_to_parse) / sizeof(struct parsable_field);
608 int nb_search_conditions;
/* Copies of the search strings given on the command line; they are
   split into a field name and a regexp once all the options have
   been read. */
609 char *search_condition_strings[MAX_NB_SEARCH_CONDITIONS];
616 db_filename_list = 0;
619 setlocale(LC_ALL, "");
621 nb_search_conditions = 0;
/* Option parsing. Options followed by ':' take an argument. */
623 while ((c = getopt_long(argc, argv, "hvqip:s:d:r:l:o:",
624 long_options, NULL)) != -1) {
633 print_version(stdout);
645 db_filename = strdup(optarg);
649 db_filename_regexp_string = strdup(optarg);
/* output_filename is a file-scope array of PATH_MAX + 1 bytes, hence
   zero-initialized: copying at most PATH_MAX bytes leaves it
   terminated as long as nothing else writes to its last byte. */
653 strncpy(output_filename, optarg, PATH_MAX);
657 db_root_path = strdup(optarg);
661 db_filename_list = strdup(optarg);
/* Refuse to store more conditions than the table can hold. */
665 if(nb_search_conditions == MAX_NB_SEARCH_CONDITIONS) {
666 fprintf(stderr, "mymail: Too many search patterns.\n");
669 search_condition_strings[nb_search_conditions++] = strdup(optarg);
/* Default db file name: $MYMAIL_DB_FILE, or "mymail.db" when the
   variable is not set. */
679 char *default_db_filename = getenv("MYMAIL_DB_FILE");
681 if(!default_db_filename) {
682 default_db_filename = "mymail.db";
685 db_filename = strdup(default_db_filename);
/* Default db filename pattern: $MYMAIL_DB_PATTERN, or "^mymail.db$"
   when the variable is not set. */
688 if(!db_filename_regexp_string) {
689 char *default_db_filename_regexp_string = getenv("MYMAIL_DB_PATTERN");
691 if(!default_db_filename_regexp_string) {
692 default_db_filename_regexp_string = "^mymail.db$";
695 db_filename_regexp_string = strdup(default_db_filename_regexp_string);
/* The db root path and the db list have no built-in default, they
   are only taken from the environment when it provides them. */
699 char *default_db_root_path = getenv("MYMAIL_DB_ROOT");
701 if(default_db_root_path) {
702 db_root_path = strdup(default_db_root_path);
706 if(!db_filename_list) {
707 char *default_db_filename_list = getenv("MYMAIL_DB_LIST");
709 if(default_db_filename_list) {
710 db_filename_list = strdup(default_db_filename_list);
/* Results go to the named file if there is one, to stdout
   otherwise. */
714 if(output_filename[0]) {
715 output_file = fopen(output_filename, "w");
719 "mymail: Cannot open result file \"%s\" for writing: %s\n",
725 output_file = stdout;
/* Indexing: the db file is created, the field regexps are compiled,
   the header line is written, and every remaining argument is
   indexed recursively. */
742 db_file = fopen(db_filename, "w");
746 "mymail: Cannot open \"%s\" for writing: %s\n",
752 for(f = 0; f < nb_fields_to_parse; f++) {
753 if(regcomp(&fields_to_parse[f].regexp,
754 fields_to_parse[f].regexp_string,
757 "mymail: Syntax error in regexp \"%s\" for field \"%s\".\n",
758 fields_to_parse[f].regexp_string,
759 field_names[fields_to_parse[f].id]);
/* First line of the db: magic token, version and format tag. */
764 fprintf(db_file, "%s version_%s raw\n", MYMAIL_DB_MAGIC_TOKEN, VERSION);
766 while(optind < argc) {
767 recursive_index_mbox(db_file,
769 nb_fields_to_parse, fields_to_parse);
775 for(f = 0; f < nb_fields_to_parse; f++) {
776 regfree(&fields_to_parse[f].regexp);
/* Searching. */
782 if(nb_search_conditions > 0) {
783 struct search_condition search_conditions[MAX_NB_SEARCH_CONDITIONS];
784 char *search_field, *search_regexp_string;
/* Each search string is "<field name> <regexp>": it is split in
   place at the first space. */
787 for(n = 0; n < nb_search_conditions; n++) {
788 search_field = search_condition_strings[n];
789 search_regexp_string = segment_next_field(search_condition_strings[n]);
/* A leading '!' on the field name negates the condition. */
791 if(search_field[0] == '!') {
793 search_conditions[n].negation = 1;
795 search_conditions[n].negation = 0;
/* The comparison is limited to the length of search_field, so any
   prefix of a field name designates that field; the first matching
   name wins. */
798 search_conditions[n].field_id = -1;
799 for(m = 0; (m < MAX_ID) && search_conditions[n].field_id == -1; m++) {
800 if(strncmp(field_names[m], search_field, strlen(search_field)) == 0) {
801 search_conditions[n].field_id = m;
805 if(search_conditions[n].field_id == -1) {
807 "mymail: Syntax error in field name \"%s\".\n",
812 if(regcomp(&search_conditions[n].regexp,
813 search_regexp_string,
816 "mymail: Syntax error in regexp \"%s\" for field \"%s\".\n",
817 search_regexp_string,
818 field_names[search_conditions[n].field_id]);
823 /* Recursive search if db_root_path is set */
826 regex_t db_filename_regexp;
827 if(regcomp(&db_filename_regexp,
828 db_filename_regexp_string,
831 "mymail: Syntax error in regexp \"%s\".\n",
832 db_filename_regexp_string);
836 recursive_search_in_db(db_root_path, &db_filename_regexp,
837 nb_search_conditions, search_conditions,
840 regfree(&db_filename_regexp);
843 /* Search in all db files listed in db_filename_list */
845 if(db_filename_list) {
/* NOTE(review): this array has the same name as the db_filename
   assigned earlier in main, and the copy loop below shows no test
   against its size -- confirm that a list entry longer than
   PATH_MAX cannot overflow it. */
846 char db_filename[PATH_MAX + 1];
850 s = db_filename_list;
/* Skip the separators, then copy one entry up to the next ';' or to
   the end of the list. */
854 while(*s == ';') { s++; }
855 while(*s && *s != ';') { *t++ = *s++; }
859 db_file = fopen(db_filename, "r");
863 "mymail: Cannot open \"%s\" for reading: %s\n",
869 search_in_db(db_file, nb_search_conditions, search_conditions, output_file);
876 /* Search in all db files listed in the command arguments */
878 while(optind < argc) {
879 FILE *db_file = fopen(argv[optind], "r");
883 "mymail: Cannot open \"%s\" for reading: %s\n",
889 search_in_db(db_file, nb_search_conditions, search_conditions, output_file);
/* Release the compiled regexps and the copies of the search
   strings. */
895 for(n = 0; n < nb_search_conditions; n++) {
896 regfree(&search_conditions[n].regexp);
897 free(search_condition_strings[n]);
902 if(output_file != stdout) {
907 free(db_filename_regexp_string);
909 free(db_filename_list);