From ab7b6e26f35ac1dfc88d9bf1e09dd289a30ea782 Mon Sep 17 00:00:00 2001 From: Francois Fleuret Date: Wed, 7 Apr 2010 18:56:13 +0200 Subject: [PATCH] Removed the MD5 option which was useless. --- finddup.1 | 25 +++++--------------- finddup.c | 71 +------------------------------------------------------ 2 files changed, 7 insertions(+), 89 deletions(-) diff --git a/finddup.1 b/finddup.1 index faaef4d..e262386 100644 --- a/finddup.1 +++ b/finddup.1 @@ -61,30 +61,17 @@ show the real path of the files .TP \fB-i\fR, \fB--same-inodes-are-different\fR files with same inode are considered as different -.TP -\fB-m\fR, \fB--md5\fR -use MD5 hashing (if compiled with the option) .SH "BUGS" None known, probably many. Valgrind does not complain though. -The MD5 hashing is not satisfactory. It is computed for a file only if -the said file has to be read fully for a comparison (i.e. two files -match and we have to read them completely). - -Hence, in practice lot of partial MD5s are computed, which costs a lot -of cpu and is useless. This often hurts more than it helps. The only -case when it should really be useful is when you have plenty of -different files of same size, and lot of similar ones, which does not -happen often. - -Forcing the files to be read fully so that the MD5s are properly -computed is not okay neither, since it would fully read certain files, -even if we will never need their MD5s. - -Anyway, it has to be compiled in with 'make WITH_MD5=yes', and even in -that case it will be off by default +The current algorithm is dumb, that is it does not use any hashing of +the file content. I tried md5 on the whole file, which is not +satisfactory because files are often never read entirely hence the md5 +can not be properly computed. I also tried XOR of the first 4, 16 and +256 bytes with rejection as soon as one does not match. Did not help +either. .SH "WISH LIST" diff --git a/finddup.c b/finddup.c index 5167a59..8f2b6c9 100644 --- a/finddup.c +++ b/finddup.c @@ -40,9 +40,6 @@ #include #include #include -#ifdef WITH_MD5 -#include -#endif /* 1M really helps compared to 64k */ #define READ_BUFFER_SIZE (1024 * 1024) @@ -74,10 +71,6 @@ int same_inodes_are_different = 0; /* 1 means that comparison between two files with same inode will always be false */ -#ifdef WITH_MD5 -int use_md5 = 0; /* 1 means we keep an MD5 signature for each file */ -#endif - /********************************************************************/ /* malloc with error checking. */ @@ -109,10 +102,6 @@ struct file_node { ino_t inode; int group_id; /* one per identical file content */ int dir_id; /* 1 for DIR1, and 2 for DIR2 */ -#ifdef WITH_MD5 - int md5_computed; - unsigned char md5[MD5_DIGEST_LENGTH]; -#endif }; void file_list_delete(struct file_node *head) { @@ -140,25 +129,6 @@ int same_content(struct file_node *f1, struct file_node *f2, char *buffer1, char *buffer2) { int fd1, fd2, s1, s2; -#ifdef WITH_MD5 - MD5_CTX c1, c2; - - if(use_md5) { - if(f1->md5_computed && f2->md5_computed) { - if(!memcmp(f1->md5, f2->md5, MD5_DIGEST_LENGTH)) { - return 0; - } - } else { - if(!f1->md5_computed) { - MD5_Init(&c1); - } - if(!f2->md5_computed) { - MD5_Init(&c2); - } - } - } -#endif - fd1 = open(f1->name, O_RDONLY); fd2 = open(f2->name, O_RDONLY); @@ -177,35 +147,14 @@ int same_content(struct file_node *f1, struct file_node *f2, if(s1 == 0) { close(fd1); close(fd2); -#ifdef WITH_MD5 - if(use_md5) { - if(!f1->md5_computed) { - MD5_Final(f1->md5, &c1); - f1->md5_computed = 1; - } - if(!f2->md5_computed) { - MD5_Final(f2->md5, &c2); - f2->md5_computed = 1; - } - } -#endif return 1; } else { if(memcmp(buffer1, buffer2, s1)) { + /* printf("size_to_read = %d\n", size_to_read); */ close(fd1); close(fd2); return 0; } -#ifdef WITH_MD5 - if(use_md5) { - if(!f1->md5_computed) { - MD5_Update(&c1, buffer1, s1); - } - if(!f2->md5_computed) { - MD5_Update(&c2, buffer2, s2); - } - } -#endif } } else { fprintf(stderr, @@ -281,9 +230,6 @@ struct file_node *scan_directory(struct file_node *tail, const char *name) { tmp->inode = sb.st_ino; tmp->group_id = -1; tmp->dir_id = -1; -#ifdef WITH_MD5 - tmp->md5_computed = 0; -#endif tail = tmp; } } @@ -572,9 +518,6 @@ void usage(FILE *out) { fprintf(out, " -r, --real-paths show the real file paths\n"); fprintf(out, " -i, --same-inodes-are-different\n"); fprintf(out, " consider files with same inode as different\n"); -#ifdef WITH_MD5 - fprintf(out, " -m, --md5 use MD5 hashing\n"); -#endif fprintf(out, "\n"); fprintf(out, "Report bugs and comments to .\n"); } @@ -590,7 +533,6 @@ static struct option long_options[] = { { "ignore-dots", no_argument, 0, 'd' }, { "ignore-empty", no_argument, 0, '0' }, { "show-progress", no_argument, 0, 'p' }, - { "md5", no_argument, 0, 'm' }, { 0, 0, 0, 0 } }; @@ -637,17 +579,6 @@ int main(int argc, char **argv) { show_hits = 0; break; - case 'm': -#ifdef WITH_MD5 - use_md5 = 1; -#else - fprintf(stderr, - "finddup has not been compiled with MD5 hashing.\n"); - usage(stderr); - exit(EXIT_FAILURE); -#endif - break; - default: usage(stderr); exit(EXIT_FAILURE); -- 2.20.1