/* natsume.c - Don Yang (uguu.org) Ported from natsume0.pl, go there for more comments. Doesn't work with files larger than 2GB. 12/27/05 */ /*@ -compdestroy -mustfreeonly -temptrans @*/ #include #include #include #include #include "hash.h" #include "md5.h" #include "path.h" #include "lines.h" #define HEADER_LENGTH 1024 #define READ_BLOCK_SIZE 0x10000 #ifdef _WIN32 #define FORMAT64 "%I64u" typedef unsigned __int64 uint64; #else #define FORMAT64 "%llu" typedef unsigned long long uint64; #endif /* Three level hash tables */ typedef struct { byte d[MD5_DIGEST_LENGTH]; } ContentHashKey; typedef /*@observer@*//*@notnull@*/char* ContentHashValue; typedef struct { byte d[MD5_DIGEST_LENGTH]; } HeaderHashKey; typedef struct { /*@only@*//*@null@*/HashTable *content_hash; /*@observer@*//*@notnull@*/char *file_name; } HeaderHashValue; typedef long SizeHashKey; typedef struct { /*@only@*//*@null@*/HashTable *header_hash; /*@observer@*//*@notnull@*/char *file_name; } SizeHashValue; typedef struct { /*@only@*//*@notnull@*/HashTable *size_hash; int file_count, dup_count; uint64 dup_size, read_size, total_size; } FileSet; static long GetFileSize(char *filename); static int DigestAll(/*@out@*/byte *digest, char *filename, uint64 *readsize); static int DigestHeader(/*@out@*/byte *digest, char *filename, uint64 *readsize); static /*@observer@*//*@null@*/char *FindContentCollision(HeaderHashValue *h, char *filename, uint64 *readsize); static /*@observer@*//*@null@*/char *FindHeaderCollision(SizeHashValue *h, char *filename, uint64 *readsize); static /*@observer@*//*@null@*/char *FindCollision(FileSet *fs, char *filename, long filesize); static void FreeHeaderSizeValue(void *x); static void FreeSizeHashValue(void *x); static /*@only@*/FileSet *FileSet_create(size_t size); static void FileSet_destroy(/*@only@*/FileSet *fs) /*@releases fs@*/; int main(int argc, char **argv) { StringList *s; FileSet *fs; char *file0, *file1, *relative_file; long filesize; size_t i; if( argc > 1 ) { /* Get list of files from command line arguments */ s = StringList_create(0U); for(i = 1; (int)i < argc; i++) (void)CanonicalizePath(StringList_append(s, argv[i])); } else { /* Get list of files from stdin */ s = StringList_from_file(stdin); for(i = 0; i < StringList_size(s); i++) (void)CanonicalizePath(StringList_get(s, i)); } StringList_sort(s); StringList_uniq(s); fs = FileSet_create((size_t)StringList_size(s)); for(i = 0; i < StringList_size(s); i++) { file1 = StringList_get(s, i); filesize = GetFileSize(file1); if( filesize < 0 ) continue; fs->file_count++; if( filesize == 0 ) { printf("ln -s -f /dev/null '%s'\n", file1); continue; } fs->total_size += (uint64)filesize; file0 = FindCollision(fs, file1, filesize); if( file0 != NULL ) { relative_file = RelativePath(file1, file0); printf("ln -s -f '%s' '%s'\n", relative_file, file1); free(relative_file); fs->dup_count++; fs->dup_size += (uint64)filesize; } } printf("# %d files, "FORMAT64"/"FORMAT64" bytes read\n", fs->file_count, fs->read_size, fs->total_size); if( fs->dup_count > 0 ) { printf("# "FORMAT64" bytes in %d duplicate files\n", fs->dup_size, fs->dup_count); } else { (void)puts("# No duplicates found"); } FileSet_destroy(fs); StringList_destroy(s); return 0; } /* Return size of file */ static long GetFileSize(char *filename) { FILE *infile; long size; if( (infile = fopen(filename, "rb")) == NULL ) { printf("# %s: Can not open: %d\n", filename, errno); return -1; } if( fseek(infile, 0, SEEK_END) == 0 ) { size = ftell(infile); } else { printf("# %s: Can not seek: %d\n", filename, errno); size = -1; } (void)fclose(infile); return size; } /* Compute MD5 for the entire file, returns 0 if successful. */ static int DigestAll(/*@out@*/byte *digest, char *filename, uint64 *readsize) { byte buffer[READ_BLOCK_SIZE]; MD5_CTX ctx; FILE *infile; size_t read_block_size; if( (infile = fopen(filename, "rb")) == NULL ) { printf("# %s: Can not open: %d\n", filename, errno); memset(digest, 0, MD5_DIGEST_LENGTH); return 1; } MD5Init(&ctx); do { read_block_size = fread(buffer, 1, READ_BLOCK_SIZE, infile); MD5Update(&ctx, buffer, (uint32)read_block_size); *readsize += (uint64)read_block_size; } while( read_block_size > 0 ); MD5Final(digest, &ctx); (void)fclose(infile); return 0; } /* Compute MD5 for first part of file, returns 0 if successful. */ static int DigestHeader(/*@out@*/byte *digest, char *filename, uint64 *readsize) { byte buffer[HEADER_LENGTH]; MD5_CTX ctx; FILE *infile; size_t read_block_size; if( (infile = fopen(filename, "rb")) == NULL ) { printf("# %s: Can not open: %d\n", filename, errno); memset(digest, 0, MD5_DIGEST_LENGTH); return 1; } read_block_size = fread(buffer, 1, HEADER_LENGTH, infile); (void)fclose(infile); MD5Init(&ctx); MD5Update(&ctx, buffer, (uint32)read_block_size); MD5Final(digest, &ctx); *readsize += (uint64)read_block_size; return 0; } /* Check for same content collisions. Return original file name if found, otherwise update hash table and return NULL. */ static /*@observer@*//*@null@*/char *FindContentCollision( HeaderHashValue *h, char *filename, uint64 *readsize) { ContentHashKey key; ContentHashValue *value; /* Force delayed content digest computation */ if( h->content_hash == NULL ) { if( DigestAll(key.d, h->file_name, readsize) != 0 ) return NULL; h->content_hash = HashTable_create(1); HashTable_add(h->content_hash, &key, sizeof(ContentHashKey), &(h->file_name), sizeof(ContentHashValue)); } /* Find existing key */ if( DigestAll(key.d, filename, readsize) != 0 ) return NULL; value = (ContentHashValue*)HashTable_find( h->content_hash, &key, sizeof(ContentHashKey), NULL); if( value != NULL ) return *value; /* Add new key */ HashTable_add(h->content_hash, &key, sizeof(ContentHashKey), &filename, sizeof(ContentHashValue)); return NULL; } /* Check for same header collisions. Return original file name if found, otherwise update hash table and return NULL. */ static /*@observer@*//*@null@*/char *FindHeaderCollision( SizeHashValue *h, char *filename, uint64 *readsize) { HeaderHashKey key; HeaderHashValue value0, *value; /* Force delayed header digest computation */ if( h->header_hash == NULL ) { if( DigestHeader(key.d, h->file_name, readsize) != 0 ) return NULL; h->header_hash = HashTable_create(1); value0.content_hash = NULL; value0.file_name = h->file_name; HashTable_add(h->header_hash, &key, sizeof(HeaderHashKey), &value0, sizeof(HeaderHashValue)); } /* Find existing key */ if( DigestHeader(key.d, filename, readsize) != 0 ) return NULL; value = (HeaderHashValue*)HashTable_find( h->header_hash, &key, sizeof(HeaderHashKey), NULL); if( value != NULL ) return FindContentCollision(value, filename, readsize); /* Add new key */ value0.content_hash = NULL; value0.file_name = filename; HashTable_add(h->header_hash, &key, sizeof(HeaderHashKey), &value0, sizeof(HeaderHashValue)); return NULL; } /* Check for same size collisions. Return original file name if found, otherwise update hash table and return NULL. */ static /*@observer@*//*@null@*/char *FindCollision( FileSet *fs, char *filename, long filesize) { SizeHashValue value0, *value; /* Find existing key */ value = (SizeHashValue*)HashTable_find( fs->size_hash, &filesize, sizeof(long), NULL); if( value == NULL ) { /* No file with same size, add file and return */ value0.header_hash = NULL; value0.file_name = filename; HashTable_add(fs->size_hash, &filesize, sizeof(long), &value0, sizeof(SizeHashValue)); return NULL; } return FindHeaderCollision(value, filename, &(fs->read_size)); } /* Release content hash tables */ static void FreeHeaderSizeValue(void *x) { HeaderHashValue *value = (HeaderHashValue*)x; if( value->content_hash != NULL ) HashTable_destroy(value->content_hash); free(value); } /* Release header hash tables */ static void FreeSizeHashValue(void *x) { SizeHashValue *value = (SizeHashValue*)x; if( value->header_hash != NULL ) HashTable_destroy_custom(value->header_hash, FreeHeaderSizeValue); free(value); } /* Create empty file set */ static /*@only@*/FileSet *FileSet_create(size_t size) { FileSet *fs; if( (fs = (FileSet*)calloc(1, sizeof(FileSet))) == NULL ) abort(); fs->size_hash = HashTable_create(size); return fs; } /* Release file set */ static void FileSet_destroy(/*@only@*/FileSet *fs) /*@releases fs@*/ { HashTable_destroy_custom(fs->size_hash, FreeSizeHashValue); free(fs); }