note
smahesh
<p>
Hi,
I wrote a similar script that uses MD5 hashes to detect duplicate files. It is not cleaned up or optimized, but it does the intended job, and it is now part of my tool collection.
</p>
<code>
#!/usr/bin/perl
#
# Find duplicate files in specified directories using md5sum values to
# identify duplicates.
#
# (C) 2009 S.M.Mahesh
use strict;
use warnings;
use File::Find;
use Digest::MD5;
# One digest object is shared by every file that gets hashed.
my $md5 = Digest::MD5->new;

# Maps an MD5 hex digest to a reference to the list of file paths that
# produced that digest.
my %md5sums = ();

# Script version, interpolated into the usage banner.
my $version = 0.1;
# Print the usage banner and terminate the script with exit status 1.
#
# Takes no arguments and never returns. The banner goes to STDERR because
# this is only reached on the error path (non-zero exit), which keeps STDOUT
# clean for anything consuming the script's normal output.
#
# Declared without a prototype: an empty "()" prototype changes how calls are
# parsed and performs no argument validation.
sub Usage {
    print STDERR <<"USAGEDOC";
$0 v$version - FindDuplicate script
USAGE:
$0 <DIR1> [DIR2...DIRn]
where,
DIR1..DIRn Specifies the directories to search
EXAMPLE:
$0 /home/user/downloads /home/user/documents
USAGEDOC
    exit 1;
}
# File::Find callback: hash one directory entry and record it by digest.
#
# Called by find() once per entry. Symlinks and anything that is not a plain
# file are skipped. For every plain file the MD5 hex digest of its contents
# is computed and the file's path is appended to the list stored in
# %md5sums under that digest. A file that cannot be opened is reported on
# STDOUT and skipped.
#
# Reads:  $_ and $File::Find::name (set by File::Find), $md5
# Writes: %md5sums
# Returns nothing.
sub wanted {
    # Path as reported to the user (starting directory + relative path).
    my $path = $File::Find::name;

    # By default File::Find chdir()s into each directory before calling the
    # callback, so file tests and open() must use $_, the name relative to
    # the current directory. $File::Find::name is relative to the *starting*
    # directory and does not resolve once we have changed directory when a
    # relative DIR was given on the command line. Under no_chdir, $_ is the
    # same as $File::Find::name, so this works in both modes.
    my $entry = $_;

    return unless -f $entry;    # Return if it is not a plain file
    return if -l $entry;        # Return in case this is a symlink

    # Three-argument open with a lexical handle: the file name is never
    # interpreted as a mode or a pipe, and leading/trailing whitespace in
    # the name is preserved.
    my $fh;
    unless ( open $fh, '<', $entry ) {
        print "ERROR: Could not open '$path' for reading\n";
        return;
    }

    binmode $fh;

    # hexdigest() resets the digest object, so the shared $md5 is ready for
    # the next file afterwards.
    my $sum = $md5->addfile($fh)->hexdigest;
    close $fh;

    # Autovivifies the list the first time a digest is seen.
    push @{ $md5sums{$sum} }, $path;

    return;
}
# At least one directory argument is required.
Usage() unless @ARGV;

# Scan each directory named on the command line, echoing its name first.
# Arguments that are not directories are reported and skipped.
for my $dir (@ARGV) {
    print "$dir \n";
    if ( -d $dir ) {
        find(\&wanted, $dir);
    }
    else {
        print "ERROR: '$dir' is not a valid directory\n";
    }
}

# Report: one group per digest that was produced by more than one file.
my $rule = '-' x 25;

print "\n", $rule, "\n";
print "Printing duplicate files (if any)\n";
print $rule, "\n\n";

for my $digest (sort keys %md5sums) {
    my @files = @{ $md5sums{$digest} };
    next if @files < 2;    # a single file has no duplicate

    print "$digest :\n";
    print "\t $_\n" for @files;
    print "\n";
}

print $rule, "\n";
</code>
<p>
Mahesh
</p>
855401
855678