#!/usr/bin/perl # natsume2.pl - Don Yang (uguu.org) # # 01/02/06 use strict; use Digest::MD5; my $HEADER_LENGTH = 1024; my (%Hash); my ($FileCount, $ReadBytes, $TotalBytes); my ($DupCount, $DupBytes); sub DigestPart($$) { my ($file, $size) = @_; local (*INFILE); unless( open INFILE, "< $file" ) { print "# $file: Can not open: $!\n"; return undef; } my $ctx = Digest::MD5->new; if( $size < 0 ) { my $data; read INFILE, $data, $HEADER_LENGTH; $ctx->add($data); $ReadBytes += length $data; } else { $ctx->addfile(*INFILE); $ReadBytes += $size; } close INFILE; return $ctx->digest; } sub FindCollision($$) { my ($file, $size) = @_; unless( exists $Hash{$size} ) { $Hash{$size}{''} = $file; return undef; } if( exists $Hash{$size}{''} ) { my $file0 = $Hash{$size}{''}; my $header0 = DigestPart($file0, -1); delete $Hash{$size}{''}; $Hash{$size}{$header0}{''} = $file0; } my $header = DigestPart($file, -1); unless( exists $Hash{$size}{$header} ) { $Hash{$size}{$header}{''} = $file; return undef; } if( exists $Hash{$size}{$header}{''} ) { my $file0 = $Hash{$size}{$header}{''}; my $digest0 = DigestPart($file0, -s $file0); delete $Hash{$size}{$header}{''}; $Hash{$size}{$header}{$digest0} = $file0; } my $digest = DigestPart($file, $size); unless( exists $Hash{$size}{$header}{$digest} ) { $Hash{$size}{$header}{$digest} = $file; return undef; } return $Hash{$size}{$header}{$digest}; } my (@list0); if( $#ARGV < 0 ) { while() { chomp; push @list0, $_; } } else { @list0 = @ARGV; } # Canonicalize my (@list); foreach my $file (@list0) { my @parts = split /\//, $file; next if $#parts < 0; my @keep_parts = (); for(my $i = 0; $i <= $#parts; $i++) { if( $parts[$i] eq "." ) { next; } if( $#keep_parts < 0 ) { push @keep_parts, $parts[$i]; } else { if( $parts[$i] ne ".." ) { push @keep_parts, $parts[$i]; } elsif( $keep_parts[$#keep_parts] eq "" || $keep_parts[$#keep_parts] eq ".." ) { push @keep_parts, $parts[$i]; } else { pop @keep_parts; } } } $file = join '/', @keep_parts; push @list, $file unless( $file eq '' || $file eq '.' || $file eq '..' ); } # Uniq if( $#list > 0 ) { my (@s) = sort @list; @list = ($s[0]); foreach my $file (@s) { push @list, $file if $file ne $list[$#list]; } } # ProcessFiles(@list); $FileCount = $ReadBytes = $TotalBytes = $DupCount = $DupBytes = 0; foreach my $file (@list) { unless( -e $file ) { print "# $file: not found\n"; next; } unless( -f _ ) { print "# $file: not a file\n"; next; } unless( -r _ ) { print "# $file: not readable\n"; next; } my $size = -s _; $FileCount++; $TotalBytes += $size; if( $size <= 0 ) { print "ln -s -f /dev/null '$file'\n"; next; } my $file0 = FindCollision($file, $size); if( defined $file0 ) { # PrintCollision($file0, $file); my ($orig, $new) = ($file0, $file); if( $orig !~ m{^/} && $new !~ m{^/} ) { my $target = $new; while( $orig =~ m{^([^/]+)/(.*)} ) { my ($orig_root, $orig_subpath) = ($1, $2); last if( $target !~ m{^([^/]+)/(.*)} ); my ($target_root, $target_subpath) = ($1, $2); if( $orig_root eq $target_root ) { $orig = $orig_subpath; $target = $target_subpath; } else { last; } } if( index($target, '/') >= 0 ) { my @parts = split /\//, $target; $orig = ("../" x $#parts) . $orig; } } print "ln -s -f '$orig' '$new'\n"; $DupCount++; $DupBytes += $size; } } print "# $FileCount files, $ReadBytes/$TotalBytes bytes read\n"; if( $DupCount > 0 ) { print "# $DupBytes bytes in $DupCount duplicate files\n"; } else { print "# No duplicates found\n"; }