Indeed, in my own duplicate-searching script (it currently only deletes duplicates, but I plan to make it more flexible one day) I group files into clusters by size, since the size is much cheaper to obtain than a checksum, and I compute checksums only within each cluster to decide whether the files are identical or not. As is well known, a checksum match is not 100% proof that two files are identical, but it is enough for me. If I ever decide to turn it into a serious tool, I'll add an option for a full byte-by-byte comparison...
Funny: it seems that just about everybody has rolled his or her own version of this thing...
For those interested, the code follows.
Caveat: do not use it! It's quite old, and I have only just realized that it runs under the
-l switch, which is totally inadequate for a script of that length - I plan to "correct" it ASAP, I just don't have time now...
#!/usr/bin/perl -l
use strict;
use warnings;
use File::Find;
use Digest::MD5;
use Getopt::Std;
sub takemd5($);
# Command-line handling.
#   -i <file>  cache file whose size/MD5 records are loaded before scanning
#   -o <file>  cache file to which this run's size/MD5 records are appended
# Every remaining argument must be a directory; anything else is reported
# and dropped, and the script stops with the usage text if none is left.
my $usage = <<"EOD";
Usage: $0 [options] <dir> [<dirs>]
-i <file> read cached info from <file>
-o <file> write cached info to <file>
EOD

my %opt;

# getopts returns false when it meets a switch it does not know; stop right
# there rather than go on to delete files under misread options.
getopts('i:o:', \%opt)
    or die $usage;

# Keep only the arguments that are directories, complaining about the rest.
@ARGV = grep {
    my $is_dir = -d $_;
    warn "`$_': not a directory!\n" unless $is_dir;
    $is_dir;
} @ARGV;

die $usage unless @ARGV;
# %files maps a file size to what is known about files of that size:
#   - a plain string: the path of the only file of that size met so far;
#   - a hash ref { md5 => path }: one entry per distinct content met.
# Records loaded from the -i cache store the placeholder 1 instead of a path.
my %files;

# Load the -i cache, if one was named and it exists as a regular file.
# A missing cache file is skipped silently.
if ($opt{i} and -f $opt{i}) {
    # Keep the file name in its own variable: the read loop must not be
    # allowed to overwrite it, since the warning below reports it.
    my $cache = $opt{i};
    open my $in, '<', $cache
        or die "Can't open `$cache': $!\n";

    LINE:
    while (my $line = <$in>) {
        chomp $line;

        # Expected record: "<size>  <md5>  <path>", fields two blanks apart.
        if ($line !~ /(\d+)\s{2}([0-9a-z]{32})\s{2}(.*)/) {
            warn "`$cache': line $. not in the correct format\n";
            next LINE;
        }
        my ($size, $md5, $path) = ($1, $2, $3);

        # Only records whose file is no longer there are kept.
        # NOTE(review): presumably because a file that still exists is
        # expected to be met again by the directory scan - confirm.
        next LINE if -f $path;

        $files{$size}{$md5} = 1;
    }
    close $in;
}
# Walk the directories named on the command line and delete every regular,
# non-empty file whose size and MD5 match a file already recorded in %files.
# The first file met with a given content is the one that is kept.
# Checksums are computed lazily: nothing is hashed for a size until a second
# file of that size turns up.
find(
    {
        # With no_chdir, $_ holds the full path of the entry (same as
        # $File::Find::name), so it can be stored and reused as-is.
        no_chdir => 1,

        # Entries of each directory are visited in case-insensitive name order.
        preprocess => sub {
            return sort { lc $a cmp lc $b } @_;
        },

        wanted => sub {
            my $path = $_;

            # -f follows symbolic links, so a link to a regular file would
            # look like a duplicate of its own target; deleting the target
            # would leave nothing but a dangling link.  Never consider links.
            return if -l $path;
            return unless -f $path;

            # Empty files are ignored.
            my $size = -s $path;
            return unless $size;

            my $known = $files{$size};
            if (!$known) {
                # First file of this size: remember its path, hash nothing.
                $files{$size} = $path;
                return;
            }

            if (!ref $known) {
                # The same path reached again (directory arguments that
                # repeat or overlap) is not a duplicate of itself.  Only
                # identical spellings of the path are recognised here.
                return if $known eq $path;

                # Second file of this size: hash the first one now.
                $known = $files{$size} = { takemd5($known) => $known };
            }

            my $md5  = takemd5($path);
            my $kept = $known->{$md5};
            if (!$kept) {
                # New content for this size: this file becomes the kept copy.
                $known->{$md5} = $path;
                return;
            }

            # Recorded under this very path already: it IS the kept copy.
            # (The placeholder 1 used for cached records never equals a path.)
            return if $kept eq $path;

            if (unlink $path) {
                # The line terminator comes from $\ (set by -l on the
                # shebang line).
                print "Removing `$path'";
            }
            else {
                warn "Can't remove `$path': $!\n";
            }
            return;
        },
    },
    @ARGV
);
# Append what is known to the -o cache file, one record per line in the form
# "<size>  <md5>  <path>".  Only sizes for which checksums exist are written;
# for records that came from the -i cache the path field is the placeholder 1.
if ($opt{o}) {
    open my $out, '>>', $opt{o}
        or die "Can't open `$opt{o}' for updating: $!\n";

    for my $size (keys %files) {
        my $by_md5 = $files{$size};

        # A size met only once still holds a bare path: no MD5 was computed.
        next unless ref $by_md5;

        for my $md5 (keys %{$by_md5}) {
            # Exactly two blanks between fields: that is the only separator
            # the -i reader accepts.  The line terminator comes from $\
            # (set by -l on the shebang line).
            print {$out} join('  ', $size, $md5, $by_md5->{$md5});
        }
    }

    # Buffered write errors only surface when the handle is closed.
    close $out
        or die "Can't finish writing `$opt{o}': $!\n";
}
# takemd5 FILE
#
# Returns the MD5 digest of FILE's contents as a hexadecimal string.
# The file is read in raw (binary) mode.
#
# Dies with the reason reported by open if FILE cannot be opened.  Carrying
# on with an unopened handle would only make Digest::MD5 fail afterwards
# with a message that no longer says which file was at fault, or why.
#
# The ($) prototype is kept on purpose: it has to match the forward
# declaration near the top of the script, and it lets the function be called
# as a unary operator without parentheses.
sub takemd5($) {
    my $f = shift;
    open my $fh, '<:raw', $f
        or die "Couldn't open `$f': $!\n";
    my $digest = Digest::MD5->new->addfile($fh)->hexdigest;
    close $fh;
    return $digest;
}
__END__