#!/usr/bin/perl
# natsume5.pl - Don Yang (uguu.org)
#
# Find files with identical contents and print shell commands that replace
# each duplicate with a symbolic link to the first copy seen.
#
# File names are taken from the command line or, when no arguments are
# given, read from standard input (one name per line).  The script itself
# modifies nothing: its output is a list of "ln -s -f" commands, plus "#"
# comment lines carrying diagnostics and statistics.
#
# 01/02/06

use strict;
use warnings;
use Digest::MD5;

# Number of leading bytes hashed for the cheap "header" digest.
use constant HEADER_BYTES => 1024;

# Running total of bytes accounted as read by DigestPart; reported by Main.
my $ReadBytes = 0;

# DigestPart($dfile, $dsize)
#
# Return the binary MD5 digest of $dfile, or nothing (undef in scalar
# context) if the file can not be opened or read.  A diagnostic "#" line is
# printed on failure.
#
#   $dsize true  - digest the whole file; $dsize is added to the read total.
#   $dsize false - digest only the first HEADER_BYTES bytes.
sub DigestPart {
    my ($dfile, $dsize) = @_;

    # Three-argument open: the name is never interpreted as a mode or pipe.
    my $fh;
    if( !open $fh, '<', $dfile ) {
        print "# $dfile: Can not open: $!\n";
        return;
    }
    binmode $fh;

    my $ctx = Digest::MD5->new;
    if( $dsize ) {
        $ctx->addfile($fh);
        $ReadBytes += $dsize;
    }
    else {
        my $data = '';
        if( !defined read $fh, $data, HEADER_BYTES ) {
            print "# $dfile: Can not read: $!\n";
            close $fh;
            return;
        }
        $ctx->add($data);
        $ReadBytes += length $data;
    }
    close $fh;
    return $ctx->digest;
}

# NormalizePath($path)
#
# Return $path with "." components removed, "name/.." pairs collapsed and
# repeated slashes squeezed.  A leading slash is preserved; ".." is kept
# when there is nothing left to cancel it against (start of a relative
# path, directly under the root, or after another "..").  The result is the
# empty string when nothing remains.
sub NormalizePath {
    my ($path) = @_;

    my @parts = split m{/}, $path;
    my @keep;
    for my $i (0 .. $#parts) {
        my $part = $parts[$i];
        next if $part eq '.';

        # An empty component only means something as the very first one,
        # where it marks an absolute path.  Anywhere else ("a//b") it is
        # noise that would make one file look like two different names.
        next if $part eq '' && $i > 0;

        if( !@keep
            || $part ne '..'
            || $keep[-1] eq ''
            || $keep[-1] eq '..' ) {
            push @keep, $part;
        }
        else {
            pop @keep;
        }
    }
    return join '/', @keep;
}

# RelativeTarget($orig, $file)
#
# Return the link target to store in a symlink created at $file so that it
# resolves to $orig.  If either path is absolute, $orig is returned as is.
# Otherwise the leading directories common to both are stripped and one
# "../" is prepended for every directory level left in $file.
sub RelativeTarget {
    my ($orig, $file) = @_;
    return $orig if $orig =~ m{\A/} || $file =~ m{\A/};

    # /s so that a newline inside a name is not silently cut off by ".".
    my $target = $file;
    while( $orig =~ m{\A([^/]+)/(.*)}s ) {
        my ($orig_root, $orig_subpath) = ($1, $2);
        last if $target !~ m{\A([^/]+)/(.*)}s;
        last if $orig_root ne $1;
        ($orig, $target) = ($orig_subpath, $2);
    }

    if( index($target, '/') + 1 ) {
        my @parts = split m{/}, $target;
        $orig = ("../" x $#parts) . $orig;
    }
    return $orig;
}

# ShellQuote($text)
#
# Return $text wrapped in single quotes for use as one shell word.  Embedded
# single quotes are written as '\'' so the name can not terminate the
# quoting early.
sub ShellQuote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

# Main()
#
# Read the file list, normalize and de-duplicate the names, then group the
# files by size, by digest of the first HEADER_BYTES bytes and finally by
# digest of the full contents.  Digests are computed lazily: the first file
# of a group is parked under key 0 and only hashed once a second candidate
# with the same size (or same header digest) shows up.
#
# NOTE(review): names are compared as strings only.  Two different names for
# the same underlying file (for example through a symlinked directory) are
# still treated as separate files - check the output before running it.
sub Main {
    my @names = @ARGV;
    @names = <STDIN> if !@ARGV;
    chomp @names;

    # Normalize, sort and drop repeated names.
    my @normalized = map { NormalizePath($_) } @names;
    my %seen;
    my @list;
    for my $path (sort @normalized) {
        next if $path eq '' || $path eq '.' || $path eq '..';
        push @list, $path if !$seen{$path}++;
    }

    my $FileCount = 0;
    my $TotalBytes = 0;
    my $DupCount = 0;
    my $DupBytes = 0;
    $ReadBytes = 0;

    # $Hash{size}{0}                      = only file of this size so far
    # $Hash{size}{header}{0}              = only file with this header so far
    # $Hash{size}{header}{full digest}    = first file with these contents
    # Key 0 can not collide with a digest, which is a binary string.
    my %Hash;

    for my $file (@list) {
        if( !( -f $file && -r _ ) ) {
            print "# $file: not readable\n";
            next;
        }

        my $size = -s _;
        $FileCount++;
        $TotalBytes += $size;

        # All empty files are equal; link them to /dev/null.
        if( $size <= 0 ) {
            print "ln -s -f /dev/null ", ShellQuote($file), "\n";
            next;
        }

        # First file of this size: nothing to compare against yet.
        if( !exists $Hash{$size} ) {
            $Hash{$size}{0} = $file;
            next;
        }
        my $by_size = $Hash{$size};

        # A second file of this size appeared: hash the parked file's
        # header now.  If that fails the parked file is simply dropped.
        if( exists $by_size->{0} ) {
            my $file0 = delete $by_size->{0};
            my $header0 = DigestPart($file0, 0);
            $by_size->{$header0}{0} = $file0 if defined $header0;
        }

        # A file that can not be digested must never match anything, or two
        # unreadable files would be reported as duplicates of each other.
        my $header = DigestPart($file, 0);
        next if !defined $header;

        if( !exists $by_size->{$header} ) {
            $by_size->{$header}{0} = $file;
            next;
        }
        my $by_header = $by_size->{$header};

        # Same idea one level down, using the digest of the full contents.
        # Every file in this bucket has the same size, so $size applies to
        # the parked file as well.
        if( exists $by_header->{0} ) {
            my $file0 = delete $by_header->{0};
            my $digest0 = DigestPart($file0, $size);
            $by_header->{$digest0} = $file0 if defined $digest0;
        }

        my $digest = DigestPart($file, $size);
        next if !defined $digest;

        if( !exists $by_header->{$digest} ) {
            $by_header->{$digest} = $file;
            next;
        }

        # Duplicate found: link it to the first file with these contents.
        my $orig = RelativeTarget($by_header->{$digest}, $file);
        print "ln -s -f ", ShellQuote($orig), " ", ShellQuote($file), "\n";
        $DupCount++;
        $DupBytes += $size;
    }

    print "# $FileCount files, $ReadBytes/$TotalBytes bytes read\n",
          ($DupCount > 0
              ? "# $DupBytes bytes in $DupCount duplicate files\n"
              : "# No duplicates found\n");
    return;
}

# Run only when executed as a script, so the subs above can be loaded and
# exercised without reading ARGV or STDIN.
Main() unless caller;

1;