#!/usr/bin/perl
# natsume0.pl - Don Yang (uguu.org)
#
# Check for duplicate files from command line arguments:
#    ./natsume0.pl
#
# or from stdin (more useful when there are lots of files):
#    find -type f | ./natsume0.pl
#
# Algorithm is based on the following observations:
# 1. Most files are different.  If this wasn't the case then we would
#    always compute MD5s of the full file and keep them around.
# 2. Files with different sizes are always different, and a stat
#    operation is always cheaper than reading the file.  This means we
#    should always check file size first before comparing contents.
# 3. Most files have high enough entropy.  This means two different
#    files will likely differ in some part of the file, so we don't
#    have to read the entire file all the time.
#
# Thus the algorithm maintains a hierarchy of file names and MD5s,
# only computing an MD5 if there exists a collision at the same
# level.  If the deepest level has a collision, assume that the files
# are duplicates and print out a message.
#
# It might seem like a good idea to increase the number of hierarchy
# levels with respect to the file sizes, but we assume that if high
# entropy files are partially the same then they will likely be all
# the same, so 2 levels of MD5s should be sufficient (it's enough for
# RAR volumes and JPEG images, for example).
#
# Not tested with files larger than 2GB; probably doesn't work there.
#
# 12/25/05

use strict;
use Digest::MD5;

# How many header bytes to read
my $HEADER_LENGTH = 1024;

# 3 level hash of file names:
# size -> MD5 of first 1k -> MD5 of full file
my (%Hash);

# Statistics
my ($FileCount, $ReadBytes, $TotalBytes);
my ($DupCount, $DupBytes);

# Remove duplicate file names in a sorted list
sub Uniq(@)
{
   my (@list, $file);

   foreach $file (@_)
   {
      if( $#list < 0 )
      {
         @list = ($file);
      }
      else
      {
         if( $file ne $list[$#list] )
         {
            push @list, $file;
         }
      }
   }
   return @list;
}

# Remove . and .. in paths, sort list and remove duplicates.
sub Canonicalize(@)
{
   my (@list);

   foreach my $file (@_)
   {
      my @parts = split /\//, $file;
      next if $#parts < 0;   # Empty string - silent error

      my @keep_parts = ();
      for(my $i = 0; $i <= $#parts; $i++)
      {
         # Remove . components
         if( $parts[$i] eq "." )
         {
            next;
         }

         # Remove .. components
         if( $#keep_parts < 0 )
         {
            push @keep_parts, $parts[$i];
         }
         else
         {
            if( $parts[$i] ne ".." )
            {
               push @keep_parts, $parts[$i];
            }
            elsif( $keep_parts[$#keep_parts] eq "" ||
                   $keep_parts[$#keep_parts] eq ".." )
            {
               # parent eq '' catches the "/../path" case.  No error
               # message is displayed, but we are guaranteed to fail
               # later due to the illegal file name.
               push @keep_parts, $parts[$i];
            }
            else
            {
               pop @keep_parts;
            }
         }
      }
      $file = join '/', @keep_parts;
      push @list, $file unless( $file eq '' ||
                                $file eq '.' ||
                                $file eq '..' );
   }
   return Uniq(sort @list);
}

# Compute MD5 for first 1k of data
sub DigestHeader($)
{
   my ($file) = @_;
   local (*INFILE);

   unless( open INFILE, "< $file" )
   {
      print "# $file: Cannot open: $!\n";
      return undef;
   }
   my $data;
   read INFILE, $data, $HEADER_LENGTH;
   close INFILE;

   my $ctx = Digest::MD5->new;
   $ctx->add($data);
   $ReadBytes += length $data;
   return $ctx->digest;
}

# Compute MD5 for the entire file
sub DigestAll($$)
{
   my ($file, $size) = @_;
   local (*INFILE);

   unless( open INFILE, "< $file" )
   {
      print "# $file: Cannot open: $!\n";
      return undef;
   }
   my $ctx = Digest::MD5->new;
   $ctx->addfile(*INFILE);
   $ReadBytes += $size;
   close INFILE;
   return $ctx->digest;
}
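# Illustration (added commentary, not part of the detection logic): a
# hypothetical snapshot of %Hash as FindCollision below fills it in.
# The empty string key is a placeholder meaning "one file seen at this
# level, digest not computed yet"; it is promoted to a real MD5 key
# only when a second candidate with the same size (or same header)
# shows up.  File names and digest variables here are made up.
#
#    %Hash = (
#       512  => { '' => 'a/readme.txt' },        # unique size, no MD5 yet
#       4096 => {
#          $hdr_md5 => { '' => 'b/photo.jpg' },  # unique header, no full MD5
#       },
#       8192 => {
#          $hdr_md5b => {
#             $full_md5 => 'c/song.mp3',         # full digest computed
#          },
#       },
#    );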
# Find name of file that has the same digest, undef if none.
sub FindCollision($$)
{
   my ($file, $size) = @_;

   # Check file with same size first
   unless( exists $Hash{$size} )
   {
      # First time seeing file of this size, compute MD5 next time
      $Hash{$size}{''} = $file;
      return undef;
   }

   # Same size files found
   if( exists $Hash{$size}{''} )
   {
      # Only one other file of the same size found so far, compute MD5
      # for this file.
      my $file0 = $Hash{$size}{''};
      my $header0 = DigestHeader($file0);
      delete $Hash{$size}{''};
      # Drop the file silently if it became unreadable (error already
      # reported by DigestHeader).
      $Hash{$size}{$header0}{''} = $file0 if defined $header0;
   }
   my $header = DigestHeader($file);
   return undef unless defined $header;
   unless( exists $Hash{$size}{$header} )
   {
      # No other file with same header data
      $Hash{$size}{$header}{''} = $file;
      return undef;
   }

   # Files with same header found
   if( exists $Hash{$size}{$header}{''} )
   {
      # Only one other file with the same header found so far, compute
      # MD5 for the entire file.
      my $file0 = $Hash{$size}{$header}{''};
      my $digest0 = DigestAll($file0, -s $file0);
      delete $Hash{$size}{$header}{''};
      $Hash{$size}{$header}{$digest0} = $file0 if defined $digest0;
   }
   my $digest = DigestAll($file, $size);
   return undef unless defined $digest;
   unless( exists $Hash{$size}{$header}{$digest} )
   {
      # No other file with same contents
      $Hash{$size}{$header}{$digest} = $file;
      return undef;
   }
   return $Hash{$size}{$header}{$digest};
}

# Print command to resolve collision
sub PrintCollision($$)
{
   my ($orig, $new) = @_;

   # Compute relative path from new file to original file
   if( $orig !~ m{^/} && $new !~ m{^/} )
   {
      my $target = $new;
      # Strip common leading directories
      while( $orig =~ m{^([^/]+)/(.*)} )
      {
         my ($orig_root, $orig_subpath) = ($1, $2);
         last if( $target !~ m{^([^/]+)/(.*)} );
         my ($target_root, $target_subpath) = ($1, $2);
         if( $orig_root eq $target_root )
         {
            $orig = $orig_subpath;
            $target = $target_subpath;
         }
         else
         {
            last;
         }
      }
      # Add one ".." for each remaining directory in the target path
      if( index($target, '/') >= 0 )
      {
         my @parts = split /\//, $target;
         $orig = ("../" x $#parts) . $orig;
      }
   }
   print "ln -s -f '$orig' '$new'\n";
}

# Process sorted list of files and check for collisions
sub ProcessFiles(@)
{
   $FileCount = $ReadBytes = $TotalBytes = $DupCount = $DupBytes = 0;
   foreach my $file (@_)
   {
      unless( -e $file ) { print "# $file: not found\n"; next; }
      unless( -f _ )     { print "# $file: not a file\n"; next; }
      unless( -r _ )     { print "# $file: not readable\n"; next; }
      my $size = -s _;
      $FileCount++;
      $TotalBytes += $size;
      if( $size <= 0 )
      {
         # Empty file
         print "ln -s -f /dev/null '$file'\n";
         next;
      }

      my $file0 = FindCollision($file, $size);
      if( defined $file0 )
      {
         PrintCollision($file0, $file);
         $DupCount++;
         $DupBytes += $size;
      }
   }

   print "# $FileCount files, $ReadBytes/$TotalBytes bytes read\n";
   if( $DupCount > 0 )
   {
      print "# $DupBytes bytes in $DupCount duplicate files\n";
   }
   else
   {
      print "# No duplicates found\n";
   }
}

# Get list of files to process in sorted order.  Sorting the files
# leads to more deterministic behavior on which file gets replaced.
if( $#ARGV < 0 )
{
   # List of file names from stdin
   my (@list);
   while( <STDIN> )
   {
      chomp;
      push @list, $_;
   }
   ProcessFiles(Canonicalize(@list));
}
else
{
   # List of files from command line arguments
   ProcessFiles(Canonicalize(@ARGV));
}
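# Example session (hypothetical paths and numbers; the output lines
# follow the formats printed by PrintCollision and ProcessFiles
# above):
#
#    $ find photos -type f | ./natsume0.pl
#    ln -s -f '../2005/img_001.jpg' 'photos/backup/img_001.jpg'
#    # 312 files, 2138112/1073741824 bytes read
#    # 1048576 bytes in 1 duplicate files
#
# Piping the output through sh would replace each duplicate with a
# symlink to the kept copy; reviewing it first is safer, since files
# matching at the deepest MD5 level are assumed to be duplicates
# rather than compared byte by byte.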