Category: |
Utility Scripts |
Author/Contact Info |
djw - djw@perldev.org |
Description: |
This will recursively search a directory for duplicate files larger than a specified minimum size (default: 100 MB). It logs everything and makes you dinner.
Enjoy,
djw
*update* 04.24.2002
After some suggestions by fellow monks, I have changed this to use Digest::MD5. Thanks everyone++.
djw |
#!/usr/bin/perl -w
use strict;
use File::stat;
use Digest::MD5;
use File::Find qw(finddepth);
use Time::HiRes qw(gettimeofday);
# -----------
# 1024 x 1024 = 1048576
# kbytes x 1024 = 1 MB
use constant MINFILESIZE => 104857600; # 100 MB
use constant ONEMEGABYTE => 1048576;
# Shared with search(), write() and the LOG formats below.
# ('our' replaces the obsolete 'use vars' pragma.)
our (%fileInfo, $number, $totalSpace);
# BUG FIX: the original 'my ($totalFiles, $duplicateFiles) = 0;'
# only initialized $totalFiles and left $duplicateFiles undef.
my ($totalFiles, $duplicateFiles) = (0, 0);
my $dir = shift || usage();
print STDOUT "\nRunning. This could take a few minutes....";
# -----------
# turn off buffering for errorlog et al.
$| = 1;
# -----------
# Redirecting standard error output to 'error.log' - can
# get large if there are permission issues during the
# recursive search.
open(OLDERR, ">&STDERR");
open(STDERR, '>', "error.log") || die "Can't redirect STDERR: ($!)\n";
select(STDERR);
# -----------
# I wanted to see how long it would take for this
# to search through large volumes.
#
# 89.2 minutes to search through a mounted drive
# (130 GB of data) over a 100mbit switched network.
# Found 4 duplicates that were over 100 MB in size.
#
# 812.5 MB of total duplicated space.
my $beginRun = gettimeofday;
finddepth \&search, $dir;
my $endRun = gettimeofday;
my $runTime = $endRun - $beginRun;
# -----------
# Translate seconds into an appropriate unit for display.
# BUG FIX: the original tested "> 60" before "> 3600", which made
# the hours branch unreachable; test the larger threshold first.
if ($runTime > 3600) {
    $runTime = sprintf("%.2f hours", $runTime / 3600);
} elsif ($runTime > 60) {
    $runTime = sprintf("%.2f minutes", $runTime / 60);
} else {
    $runTime = sprintf("%.2f seconds", $runTime);
}
print STDOUT "Complete.\n";
# -----------
# This writes file info to our 'duplicate.log' file.
# [filename], [size], [quantity] (greater than 1)
&write;
close(STDERR);
close(OLDERR);
sub search {
    # -----------
    # The Meat (tm).
    #
    # Called by File::Find for every entry under $dir ($_ is the
    # current file name; finddepth chdirs into each directory).
    # Plain files bigger than MINFILESIZE are MD5-digested;
    # identical content hashes to the same %fileInfo key, so a
    # repeat hit just bumps the counter.
    if (-f) {
        my $fsize = stat($_)->size;
        if ($fsize > MINFILESIZE) {
            # Three-arg open with a lexical handle.  BUG FIX: the
            # original warned on failure but then went on to call
            # binmode/addfile on the unopened handle; skip the file
            # instead.
            open(my $md5fh, '<', $_)
                || warn "Can't open file ($_): ($!)\n";
            if ($md5fh) {
                binmode($md5fh);
                my $md5hash = Digest::MD5->new->addfile($md5fh)->hexdigest;
                close($md5fh);
                if (exists($fileInfo{$md5hash})) {
                    $fileInfo{$md5hash}[2]{count} += 1;
                } else {
                    # First sighting of this content: remember the
                    # name and size, start the counter at 1.  The
                    # {name} key matches what the LOG format prints.
                    $fileInfo{$md5hash}[0]{name}  = $_;
                    $fileInfo{$md5hash}[1]{size}  = $fsize;
                    $fileInfo{$md5hash}[2]{count} = 1;
                }
            }
        }
        $totalFiles++;
    }
}
sub write {
    # -----------
    # Report the duplicates collected in %fileInfo: entries seen
    # only once are dropped, the rest are written to
    # 'duplicates.log' through the LOG/LOG_TOP formats, and a
    # summary goes to STDOUT.  (Called as &write from the main
    # flow; the write(LOG) below is CORE::write, not recursion.)
    foreach (keys %fileInfo) {
        if ($fileInfo{$_}[2]{count} < 2) {
            delete $fileInfo{$_};
        }
    }
    if (%fileInfo) {
        # Plain '>' - the original '+>' opened the log read-write
        # for no reason.  LOG stays a bareword handle because the
        # LOG/LOG_TOP formats are bound to that name.
        open(LOG, '>', 'duplicates.log')
            || die "Can't create logfile: ($!)\n";
        foreach (keys %fileInfo) {
            # (No 'next if count < 2' needed - those entries were
            # deleted above.)
            $duplicateFiles++;
            $number = sprintf("%.1f", $fileInfo{$_}[1]{size} / ONEMEGABYTE);
            # Space wasted = size x (copies beyond the first).
            my $duplicateSpace = $number * ($fileInfo{$_}[2]{count} - 1);
            $totalSpace += $duplicateSpace;
            write(LOG);    # emit one LOG format record
        }
        close(LOG);
        print STDOUT "\nFound $duplicateFiles/$totalFiles duplicate files.\n";
        print STDOUT "Runtime: $runTime.\n";
        print STDOUT "Duplicated Space: $totalSpace MB\n";
    } else {
        print STDOUT "\nNo duplicates found - 0/$totalFiles files.\n";
        print STDOUT "Runtime: $runTime.\n";
    }
}
sub usage {
    # Show how to invoke the script, then terminate.
    print "Usage: ./duplicates.pl [dirname]\n",
          "\n",
          "BAD MR. KITTY!\n\nMake sure you supply a directory to search through!\n",
          "Example: ./duplicates.pl /home/foo/\n";
    exit;
}
# Report formats for the LOG filehandle: LOG_TOP is the page
# header, LOG prints one record per duplicate file.
# BUG FIX: the posted format referenced the garbled field
# '[0](unknown)'; use the {name} key that search() populates.
format LOG_TOP =
FILENAME SIZE QTY
-----------------------------------------------------------
.
format LOG =
@>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> @>>>> MB (@>)
$fileInfo{$_}[0]{name}, $number, $fileInfo{$_}[2]{count}
.
|
Re: Duplicate file bounty hunter
by belg4mit (Prior) on Apr 24, 2002 at 01:02 UTC
|
| [reply] |
|
A previous discussion on this subject lies here: Find duplicate files. Much the same conclusion: use Digest::MD5.
print@_{sort keys %_},$/if%_=split//,'= & *a?b:e\f/h^h!j+n,o@o;r$s-t%t#u'
| [reply] |
|
MD5 - never thought of using that, thanks for the tip. I downloaded your utility but have to take a look at it a bit later. Thanks belg4mit++.
djw
| [reply] |
Re: Duplicate file bounty hunter
by rob_au (Abbot) on Apr 24, 2002 at 02:27 UTC
|
I must agree wholeheartedly with the recommendation of belg4mit to explore the usage of Digest::MD5 for the comparison of files. The following is a small script that I wrote previously based upon a node by demerphq here which may be of use for comparative purposes.
#!/usr/bin/perl -wT
use Digest::MD5;
use File::Find;
use IO::File;
use strict;
$| = 1;
# Sanitize PATH for taint mode (-T).
$ENV{'PATH'} = '/bin:/usr/bin:/usr/local/bin';
my $ctx = Digest::MD5->new;
my %digest;
my $path = $ARGV[0] || '.';
find ({
    'wanted' => sub {
        if (-f $_) {
            lstat;
            # Digest only readable, non-symlink plain files.
            if ((-r _) && (!-l _)) {
                $ctx->reset;
                # BUG FIX: skip files that fail to open instead of
                # digesting an undef handle.
                my $fh = IO::File->new($_, 'r') or return;
                # BUG FIX: addfile() wants the handle itself, not a
                # reference to the lexical holding it (\$fh).
                $ctx->addfile($fh);
                my $md5 = $ctx->hexdigest;
                if (exists $digest{$md5}) {
                    push @{$digest{$md5}->{'dupes'}}, $_;
                } else {
                    $digest{$md5} = {
                        'file' => $_,
                        'dupes' => []
                    }
                }
            }
        } else {
            print "Searching $_\n";
        }
    },
    'no_chdir' => 1
}, $path);
# BUG FIX: the posted summary used $_ outside any loop, so it
# always reported 0; sum the dupe lists across every digest.
my $dupe_count = 0;
$dupe_count += scalar @{$digest{$_}->{'dupes'}} for keys %digest;
print "There are $dupe_count duplicate files.\n";
exit 0;
| [reply] [d/l] |
|
|