.TP
\fB-i\fR, \fB--same-inodes-are-different\fR
files with same inode are considered as different
+.TP
+\fB-m\fR, \fB--md5\fR
+use MD5 hashing
.SH "BUGS"
None known, probably many. Valgrind does not complain though.
+The MD5 hashing often hurts more than it helps, hence it is off by
+default. The only case where it should really be useful is when you
+have plenty of different files of the same size, which does not happen
+often.
+
.SH "WISH LIST"
The format of the output should definitely be improved. Not clear how.
-The comparison algorithm could maybe be improved with some MD5 kind of
-signature. However, most of the time is taken by comparison for
-matching files, which are required even when using a hash.
-
There could be some fancy option to link two instances of the command
running on different machines to reduce network disk accesses. Again,
this may not help much, for the reason given above.
*
*/
-#define VERSION_NUMBER "0.8"
+#define VERSION_NUMBER "0.9"
#define _BSD_SOURCE
#include <locale.h>
#include <getopt.h>
#include <fcntl.h>
+#include <openssl/md5.h>
/* 1M really helps compared to 64k */
#define READ_BUFFER_SIZE (1024 * 1024)
int tty_width = -1; /* Positive value means what width to use to show
the progress bar */
+int use_md5 = 0; /* 1 means we keep an MD5 signature for each file */
+
/********************************************************************/
/* malloc with error checking. */
ino_t inode;
int group_id; /* one per identical file content */
int dir_id; /* 1 for DIR1, and 2 for DIR2 */
+ int md5_computed;
+ unsigned char md5[MD5_DIGEST_LENGTH];
};
void file_list_delete(struct file_node *head) {
int same_content(struct file_node *f1, struct file_node *f2,
char *buffer1, char *buffer2) {
int fd1, fd2, s1, s2;
+ MD5_CTX c1, c2;
+
+ if(use_md5) {
+ if(f1->md5_computed && f2->md5_computed &&
+ !memcmp(f1->md5, f2->md5, MD5_DIGEST_LENGTH)) {
+ return 0;
+ } else {
+ if(!f1->md5_computed) {
+ MD5_Init(&c1);
+ }
+ if(!f2->md5_computed) {
+ MD5_Init(&c2);
+ }
+ }
+ }
fd1 = open(f1->name, O_RDONLY);
fd2 = open(f2->name, O_RDONLY);
if(s1 == 0) {
close(fd1);
close(fd2);
+ if(use_md5) {
+ if(!f1->md5_computed) {
+ MD5_Final(f1->md5, &c1);
+ f1->md5_computed = 1;
+ }
+ if(!f2->md5_computed) {
+ MD5_Final(f2->md5, &c2);
+ f2->md5_computed = 1;
+ }
+ }
return 1;
} else {
+ if(use_md5) {
+ if(!f1->md5_computed) {
+ MD5_Update(&c1, buffer1, s1);
+ }
+ if(!f2->md5_computed) {
+ MD5_Update(&c2, buffer2, s2);
+ }
+ }
if(memcmp(buffer1, buffer2, s1)) {
close(fd1);
close(fd2);
tmp->inode = sb.st_ino;
tmp->group_id = -1;
tmp->dir_id = -1;
+ tmp->md5_computed = 0;
tail = tmp;
}
}
fprintf(out, " -r, --real-paths show the real file paths\n");
fprintf(out, " -i, --same-inodes-are-different\n");
fprintf(out, " consider files with same inode as different\n");
+ fprintf(out, " -m, --md5 use MD5 hashing\n");
fprintf(out, "\n");
fprintf(out, "Report bugs and comments to <francois@fleuret.org>.\n");
}
{ "ignore-dots", no_argument, 0, 'd' },
{ "ignore-empty", no_argument, 0, '0' },
{ "show-progress", no_argument, 0, 'p' },
+ { "md5", no_argument, 0, 'm' },
{ 0, 0, 0, 0 }
};
setlocale (LC_ALL, "");
- while ((c = getopt_long(argc, argv, "hircgd0p",
+ while ((c = getopt_long(argc, argv, "hircgd0pm",
long_options, NULL)) != -1) {
switch (c) {
show_hits = 0;
break;
+ case 'm':
+ use_md5 = 1;
+ break;
+
default:
exit(EXIT_FAILURE);
}