So, I was running out of disk space on a partition, and I remembered that I had a Perl script that would find all duplicated files for me. I'd found it somewhere about 6 or 7 years ago, when I was first playing with Perl 4 and didn't really know how to do much.
So I dug it out, read it, and realised how horrible it was. I was tempted to rewrite it, but decided to google for "perl duplicate files" first. I found a couple of other scripts that way, but they were pretty horrible too. In particular the first one, which is basically a comparison between doing it in Perl vs. the shell, runs a checksum hash over every single file. So I decided I would indeed write my own, which turned out to be about 7 times faster than that one (which was in turn twice as fast as my original script):
#!/usr/bin/perl -w
use strict;
use File::Find;
use Digest::MD5;

my %files;
my $wasted = 0;

find(\&check_file, $ARGV[0] || ".");

local $" = ", ";    # list separator used when interpolating the duplicate lists below

foreach my $size (sort {$b <=> $a} keys %files) {
    next unless @{$files{$size}} > 1;
    my %md5;
    foreach my $file (@{$files{$size}}) {
        open(FILE, $file) or next;
        binmode(FILE);
        push @{$md5{Digest::MD5->new->addfile(*FILE)->hexdigest}}, $file;
    }
    foreach my $hash (keys %md5) {
        next unless @{$md5{$hash}} > 1;
        print "$size: @{$md5{$hash}}\n";
        $wasted += $size * (@{$md5{$hash}} - 1);
    }
}
1 while $wasted =~ s/^([-+]?\d+)(\d{3})/$1,$2/;    # insert thousands separators
print "$wasted bytes in duplicated files\n";

sub check_file {
    -f && push @{$files{(stat(_))[7]}}, $File::Find::name;
}
Tony
Re: Find duplicate files.
by mwp (Hermit) on Jan 02, 2001 at 21:31 UTC
Very nice!
If the other monks here think it's solid and all, you
should OO it and send it to the author of File::Find as
File::Find::Duplicates. =)
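A wrapper module needn't be much more than the script above with the find-and-hash logic moved into a method. Purely as a hypothetical sketch of what such an interface might look like (the class name comes from the suggestion above, but the constructor, find_dupes() and everything else here are invented for illustration, not taken from any released module):

package File::Find::Duplicates;
use strict;
use warnings;
use File::Find;
use Digest::MD5;

# Hypothetical interface: my $finder = File::Find::Duplicates->new(dirs => [...]);
#                         my %dupes  = $finder->find_dupes;   # md5 => [paths]
sub new {
    my ($class, %args) = @_;
    return bless { dirs => $args{dirs} || ['.'] }, $class;
}

sub find_dupes {
    my $self = shift;
    my (%by_size, %dupes);
    File::Find::find(sub {
        push @{ $by_size{ -s _ } }, $File::Find::name if -f;
    }, @{ $self->{dirs} });
    for my $size (keys %by_size) {
        next unless @{ $by_size{$size} } > 1;   # unique size => can't be a duplicate
        for my $file (@{ $by_size{$size} }) {
            open my $fh, '<', $file or next;
            binmode $fh;
            push @{ $dupes{ Digest::MD5->new->addfile($fh)->hexdigest } }, $file;
        }
    }
    delete $dupes{$_} for grep { @{ $dupes{$_} } < 2 } keys %dupes;
    return %dupes;
}

1;

A caller could then do something like my %dupes = File::Find::Duplicates->new(dirs => ['/home'])->find_dupes; and report on the values.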
Re: Find duplicate files.
by lemming (Priest) on Jun 02, 2001 at 20:42 UTC
Interesting. I just went through a similar problem of combining four computers' worth of archives. In some cases I had near-duplicates due to slight doc changes and the like, so I wanted a bit more information; a second program did the deletes (about 9,000 files).
I couldn't go by dates, due to bad file management.
Note that the open statement uses the three-argument version. I had some badly named files such as ' ha'. I wish I could remember the name of the monk who pointed out the documentation for me.
#!/usr/bin/perl
# allstat.pl
use warnings;
use strict;
use File::Find;
use File::Basename;
use Digest::MD5;

my %hash;
my @temp;

while (my $dir = shift @ARGV) {
    die "Give me a directory to search\n" unless (-d "$dir");
    File::Find::find(\&wanted, "$dir");
}
exit;

sub wanted {
    return unless (-f $_);
    my $md5;
    my $base = File::Basename::basename($File::Find::name, "");
    my $size = -s "$base";
    if ($size >= 10000000) { # They slowed down the check enough that I skip them
        if ($size >= 99999999) { $size = 99999999; }
        $md5 = 'a'x32; # At this point I'll just hand check, less than a dozen files
    }
    else {
        $md5 = md5file("$base");
    }
    if ($File::Find::name =~ /\t/) { # Just in case, this screws up our tab delimited file
        warn "'$File::Find::name' has tabs in it\n";
    }
    printf("%32s\t%8d\t%s\t%s\n", $md5, $size, $File::Find::name, $base);
}

sub md5file {
    my ($file) = @_;
    unless (open FILE, "<", "$file") {
        warn "Can't open '$file': $!";
        return -1; # Note we don't want to die just because of one file.
    }
    binmode(FILE);
    my $chksum = Digest::MD5->new->addfile(*FILE)->hexdigest;
    close(FILE);
    return $chksum;
}
A quick comparison of CRC32 against MD5 checksumming, run with Benchmark::cmpthese on each file given on the command line:
#!/usr/bin/perl -w
use strict;
use IO::File;
use Compress::Zlib ();
use Digest::MD5;
use Benchmark;

use constant BUFSIZE => 32768;

sub crc32
{
    my $fh = shift;
    binmode($fh);
    sysseek($fh, 0, 0); # rewind
    my $buffer = ' ' x BUFSIZE;
    my $crc = 0;
    while ($fh->sysread($buffer, BUFSIZE))
    {
        $crc = Compress::Zlib::crc32($buffer, $crc);
    }
    return $crc;
}

sub md5
{
    my $fh = shift;
    seek($fh, 0, 0); # rewind
    my $md5 = Digest::MD5->new();
    $md5->addfile($fh);
    return $md5->digest;
}

foreach my $file (@ARGV)
{
    my $fh = IO::File->new($file);
    next if !defined($fh);  # skip files we can't open
    binmode($fh);
    Benchmark::cmpthese(-10, {
        "crc32 $file", sub { crc32($fh) },
        "md5 $file",   sub { md5($fh) }
    });
}
I am new to Perl (and to writing code) and have just been on an excellent course organized by Barcelona_pm. I have rewritten lemming's code as an exercise in using Moose. To improve speed, following the suggestions above, files with the same size are identified first, and the md5 value is then calculated only for those files. Because this is baby code, please feel free to recommend any RTFM $manual that I should review to improve it. Thanks for this great language!
(I have to thank Alba from Barcelona_pm for suggestions on how to improve the code.)
This is the definition of the "FileDups" object:
package FileDups;
use Digest::MD5;
use Moose;
use namespace::autoclean;

has 'name'     => (is => 'ro', isa => 'Str',  required => 1,);
has 'pathname' => (is => 'ro', isa => 'Str',  required => 1,);
has 'max_size' => (is => 'ro', isa => 'Num',  required => 1,);
has 'big'      => (is => 'rw', isa => 'Bool', required => 1, default => 0);
has 'unread'   => (is => 'rw', isa => 'Bool', required => 1, default => 0);
has 'dupe'     => (is => 'rw', isa => 'Bool', required => 1, default => 0);
has 'md5'      => (is => 'ro', isa => 'Str',  lazy => 1, builder => '_calculate_md5');
has 'size'     => (is => 'ro', isa => 'Num',  lazy => 1, builder => '_calculate_size');

sub _calculate_size {
    my $self = shift;
    my $size = -s $self->name;
    if (-s $self->name > $self->max_size) {
        $size = $self->max_size;
        $self->big(1);
    }
    return $size;
}

sub _calculate_md5 {
    my $self = shift;
    my $file = $self->pathname;
    my $size = $self->size;
    my $chksum = 0;
    if ($size == $self->max_size) {
        $chksum = 'a'x32;
    } else {
        my $fh;
        unless (open $fh, "<", "$file") {
            $self->unread(1);
            return -1; # return -1 and leave the subroutine if the file cannot be opened
        }
        binmode($fh);
        $chksum = Digest::MD5->new->addfile($fh)->hexdigest;
        close($fh);
    }
    return $chksum;
}

1;
And this is the main program, which lists duplicate files, big files and unread files.
#!/usr/bin/env perl
# References:
# http://drdobbs.com/web-development/184416070
use strict;
use warnings;
use File::Find;
use lib qw(lib);
use FileDups;
use Data::Dumper;

# Hash of => [array of [array]], [array of objects]
my (%dup, %sizes, @object, $number_files, $number_size_dups);
my $max_size = 99999999;            # Size above which md5 will not be calculated
my $return = "Press return to continue \n\n";
my $line = "-"x70 . "\n";

while (my $dir = shift @ARGV) {     # Find and classify files
    die "\"$dir\" is not a directory. Give me a directory to search\n" unless (-d "$dir");
    File::Find::find(\&wanted, "$dir");
}
print "\n";

foreach (@object) {                 # Calculate md5 for files with equal size
    if ($sizes{$_->size} == "1") {
        $number_size_dups += 1; print "$number_size_dups Files with the same size \r";
        $_->dupe(1);                # The object has another object with the same size
        $_->md5;                    # Calculate md5
    }
}

foreach (@object) {                 # Create a hash of md5 values
    if ($_->dupe == 1) {            # for files with the same size
        if (exists $dup{$_->md5}) {
            push @{$dup{$_->md5}}, [$_->size, $_->name, $_->pathname];
        } else {
            $dup{$_->md5} = [ [$_->size, $_->name, $_->pathname] ];
        }
    }
}

print "\n\nDuplicated files\n $line $return"; my $pausa4 = <>;
foreach (sort keys %dup) {          # sort hash by md5sum
    if ($#{$dup{$_}} > 0) {         # more than one array within the same hash entry
        printf("\n%8s %10.10s %s\n", "Size", "Name", "Pathname");
        foreach (@{$dup{$_}}) {     # iterate through the first dimension of the array
            printf("%8d %10.10s %s\n", @{$_}); # dereference reference to array
        }
    }
}

my $r1 = &list_files("Big files", "big", @object);       # List big files
my $r2 = &list_files("Unread files", "unread", @object); # List unread files

sub wanted {
    return unless (-f $_);
    my $file = FileDups->new(name => $_, pathname => $File::Find::name, max_size => $max_size);
    $number_files += 1; print "$number_files Files seen\r";
    if ($file->size == $max_size) {         # Identify big files
        $sizes{$file->size} = "0";          # We do not check md5 for big files
    } elsif (exists $sizes{$file->size}) {  # There is more than one file with this size
        $sizes{$file->size} = "1";
    } else {
        $sizes{$file->size} = "0";          # This is a new size value, not duplicated
    }
    push @object, $file;                    # Put the object in the @object array
}

sub list_files {                            # List objects according to criteria:
    my ($title, $criteria, @object) = @_;   # (a) big files; (b) unread files
    print "\n \n $title \n" . $line; my $pausa = <>;
    foreach (@object) {
        if ($_->$criteria) {
            printf(" %10.10s %s\n", $_->name, $_->pathname);
        }
    }
    print $line;
}
Thanks to lemming's code above for generating md5 hashes; it became the first part of finding duplicates for me. I used the following code to find duplicates and show them. Running the same code again with 'remove' will 'move' all the duplicates to a ./trash/ subdirectory. It's a little too tailored to my specific needs, but it might be a nice start for someone else needing the same.
It went through 25k files, finding 11k duplicates and moving them to a ./trash/ directory, in about 60 seconds.
The code below takes the output of lemming's code above.
#!/usr/bin/perl -w
# usage: dupDisplay.pl fileMD5.txt [remove]
# input file has the following (tab-delimited) form:
# 8e773d2546655b84dd1fdd31c735113e  304048  /media/PICTURES-1/mymedia/pictures/pics/20041004-kids-camera/im001020.jpg  im001020.jpg
# e01d4d804d454dd1fb6150fc74a0912d  296663  /media/PICTURES-1/mymedia/pictures/pics/20041004-kids-camera/im001021.jpg  im001021.jpg
use strict;
use warnings;

my %seen;
my $fileCNT = 0;
my $origCNT = 0;
my $delCNT  = 0;
my $failCNT = 0;
my $remove  = $ARGV[1] ? 'remove' : '';
print "\n\n ... running in NON removal mode.\n\n" if !$remove;

open IN,      "< $ARGV[0]"          or die ".. we don't see a file to read: $ARGV[0]";
open OUT,     "> $ARGV[0]_new.temp" or die ".. we can't write the file: $ARGV[0]_new.temp";
open OUTdel,  "> $ARGV[0]_deleted"  or die ".. we can't write the file: $ARGV[0]_deleted";
open OUTfail, "> $ARGV[0]_failed"   or die ".. we can't write the file: $ARGV[0]_failed";

print "\n ... starting to find duplicates in: $ARGV[0]\n";
if (! -d './trash/') { mkdir './trash/' or die " !! couldn't make trash directory.\n $! \n"; }

while (<IN>) {
    my $line = $_;
    chomp $line;
    $fileCNT++;
    my ($md5, $filesize, $pathfile, $file) = split /\t+/, $line, 4;
    if (exists $seen{"$md5:$filesize"}) {
        my $timenow = time;
        # move the duplicate file to trash with a timestamp extension
        my $trashFile = './trash/' . $file . "_" . $timenow;
        #if (! unlink($pathfile)) { print OUTfail "$pathfile\n"; $failCNT++; }
        if ($remove) { if (! rename $pathfile, $trashFile) { print OUTfail "$pathfile\n"; $failCNT++; } }
        $seen{"$md5:$filesize"} .= "\n $pathfile";
        $delCNT++;
        print " files: $fileCNT originals: $origCNT files to delete: $delCNT failed: $failCNT \r";
    } else {
        $seen{"$md5:$filesize"} = "$pathfile";
        printf OUT ("%32s\t%8d\t%s\t%s\n", $md5, $filesize, $pathfile, $file);
        $origCNT++;
        print " files: $fileCNT originals: $origCNT files to delete: $delCNT failed: $failCNT \r";
    }
}

foreach my $key (keys %seen) {
    print OUTdel " $seen{$key}\n";
}
print " files: $fileCNT originals: $origCNT files to delete: $delCNT failed: $failCNT \n\n";
Re: Find duplicate files.
by grinder (Bishop) on Feb 26, 2001 at 19:56 UTC
Interesting. I wrote my own that does pretty much the same thing, but in a different way. I only use one hash, so I suspect it will use less memory (but see the response below for the final word).
#! /usr/bin/perl -w
use strict;
use File::Find;
use Digest::MD5;

my %digest;
my $total_bytes = 0;
my $dups = 0;

sub wanted {
    return unless -f $_;
    my $bytes = -s _;
    return unless $bytes;
    if( !open IN, $_ ) {
        print "Cannot open $_ for input: $!\n";
        return;
    }
    my $md5 = Digest::MD5->new;
    my $d = $md5->addfile( *IN )->digest;
    close IN;
    if( defined $digest{$d} ) {
        print "$bytes\t$digest{$d}\t$File::Find::name\n";
        $total_bytes += $bytes;
        ++$dups;
    }
    else {
        $digest{$d} = $File::Find::name;
    }
}

foreach my $d ( @ARGV ) {
    print "=== directory $d\n";
    find \&wanted, $d;
}

printf "Statistics:
Duplicates: %12d
Bytes: %12d
KBytes: %12d
MBytes: %12d
GBytes: %12d\n",
    $dups,
    $total_bytes,
    $total_bytes / (1024**1),
    $total_bytes / (1024**2),
    $total_bytes / (1024**3);
It is very verbose, but that's because I pipe the output into something that can be handed off to users in a spreadsheet so that they can do their own housekeeping (2GB of duplicates in 45GB of files...).
BTW, you can also save a squidgin of memory by using the digest() method, rather than the hexdigest() method, since the value is not intended for human consumption.
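For instance, a tiny sketch of the difference (using the functional interface on an in-memory string just to show the key sizes involved; this isn't code from the script above):

use Digest::MD5 qw(md5 md5_hex);

my $data    = "pretend this is the file contents";
my $hex_key = md5_hex($data);   # 32-character hex string, readable in reports
my $bin_key = md5($data);       # 16 raw bytes - half the memory per hash key
printf "hex key: %d bytes, binary key: %d bytes\n",
       length($hex_key), length($bin_key);

The binary form works fine as a hash key; it only becomes awkward once you want to print it.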
Yes, but there is a fundamental difference...
The first script will only do MD5 hashes on files if there is more than one file with the same file size, then compares the MD5s for the files of that size. Yours MD5's *everything*, then compares *all* the MD5s. If a file has a unique filesize, it *can't* have a duplicate.
Depending on the makeup of the files, this can have a dramatic effect:
Files: 15272
Duplicates: 999
Bytes: 15073525
Results:
First script:
real 0m11.855s
user 0m2.590s
sys 0m1.640s
Second script:
real 0m49.589s
user 0m17.110s
sys 0m6.500s
The second script is four times slower than the first...
Admittedly, if all your files were the same size there would be no difference, but in most cases, the first script will win. But hey...
Me again (I posted as Anonymous Monk last time).
As above, I drew a blank on Google, and was amazed that even stuff written in C and posted on Freshmeat was stupidly slow at finding duplicate files. I knew I should have checked PerlMonks sooner :-)
Just to point out the differences in real-time speed depending on how much of how many files you read in during the comparison step, I coded up another version as a small (though fully functional) utility.
The code for script three is below
Benchmarks:
Scenario 1: a bundle of MP3 files
number of files: 5969
Number of duplicates: 11
Total size of all files: 16,560,706,048 (~16 gigs)
(script one - original, MD5 hash files with same file size)
...
37,411,688 bytes in duplicated files
real 5m1.753s
user 1m48.760s
sys 0m31.010s
(script two - MD5 hash calculated on *all* files)
Duplicates: 11
Bytes: 37411688
real 17m1.262s
user 6m56.060s
sys 1m59.830s
fdupes (C program from freshmeat.net - uses same algorithm as script two)
real 6m25.131s
user 2m42.310s
sys 0m32.450s
(script three - see below, read up to first \n char for initial check, then read whole file in for full check. No MD5s calculated at all)
37,411,688 bytes in duplicated files
real 0m48.545s
user 0m2.150s
sys 0m1.460s
Yes, that *is* 48 seconds rather than 5 or 17 minutes. This is because script 3 compares just the first line of each candidate pair first, whereas creating an MD5 hash requires reading the whole file.
Scenario 2: home directory
number of files: 15280
Number of duplicates: 998
Total size of all files: 677,698,560 (677 megs)
Script one results
15,073,517 bytes in duplicated files
real 0m9.745s
user 0m2.610s
sys 0m1.220s
Script two results
Duplicates: 998
Bytes: 15073517
real 0m51.197s
user 0m17.520s
sys 0m6.700s
fdupes (C program from freshmeat.net - uses same algorithm as script two)
real 0m18.332s
user 0m8.080s
sys 0m5.270s
Script three results
15,069,331 bytes in duplicated files
real 0m12.924s
user 0m3.110s
sys 0m2.510s
(Note that fewer duplicates are found by script three, as it skips all the small files < 100 bytes.)
The third script is slower than the first in this situation as it must do multiple compares (i.e. a with b, a with c, a with d) rather than using the MD5 hashing technique. It would be even slower if we counted small files (timed at around 23 seconds). Both 1 and 3 are still *much* faster than 2 though. The fdupes benchmarks are just in there for comparison, to show how a bad algorithm can slow down a fast language.
Also note that not using MD5 hashes means I suffer if there are three or more identical, large files, but I wanted to be *absolutely* sure not to get any false positives, and MD5 hashing doesn't (quite) do that. So I do a byte-for-byte comparison between possible pairs.
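For what it's worth, a chunked compare along these lines (just a sketch, not the code below, which slurps each candidate pair whole) would keep memory flat even on very large files:

# Sketch only: compare two files in fixed-size chunks rather than slurping them.
sub same_chunked {
    my ($file_a, $file_b) = @_;
    return 0 unless -s $file_a == -s $file_b;   # different sizes can't be identical
    open(my $fh_a, '<', $file_a) or return 0;
    open(my $fh_b, '<', $file_b) or return 0;
    binmode($_) for $fh_a, $fh_b;
    my ($buf_a, $buf_b);
    while (read($fh_a, $buf_a, 65536)) {
        read($fh_b, $buf_b, 65536);
        return 0 if $buf_a ne $buf_b;           # mismatch - bail out early
    }
    return 1;                                   # every chunk matched
}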
There is almost certainly another way - we could do two passes using the MD5 technique, creating MD5 hashes for the first (say) 200 bytes of each file in the first pass, then MD5-ing the whole file if the first ones match. This should give us good performance on both large numbers of duplicated small files *and* small numbers of duplicates of large files. But that's something for another day, and I somehow *prefer* to do byte-by-byte checks. Paranoia, I guess.
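If anyone wants to experiment, a rough sketch of that two-pass idea might look like the following. It's untested against the benchmarks above, and the 200-byte head size (like everything else in it) is only illustrative:

#!/usr/bin/perl -w
# Two-pass sketch: group by size, hash only the first 200 bytes of each
# candidate, then MD5 whole files only where the partial hashes also collide.
use strict;
use File::Find;
use Digest::MD5 qw(md5_hex);

my %by_size;
find(sub { push @{ $by_size{ -s _ } }, $File::Find::name if -f }, $ARGV[0] || ".");

for my $size (keys %by_size) {
    next unless @{ $by_size{$size} } > 1;

    # Pass 1: hash only the first 200 bytes of each same-sized file.
    my %by_head;
    for my $file (@{ $by_size{$size} }) {
        open my $fh, '<', $file or next;
        binmode $fh;
        my $head = '';
        read $fh, $head, 200;
        push @{ $by_head{ md5_hex($head) } }, $file;
    }

    # Pass 2: full-file MD5, but only where the partial hashes collide.
    for my $group (grep { @$_ > 1 } values %by_head) {
        my %by_md5;
        for my $file (@$group) {
            open my $fh, '<', $file or next;
            binmode $fh;
            push @{ $by_md5{ Digest::MD5->new->addfile($fh)->hexdigest } }, $file;
        }
        print "$size: @$_\n" for grep { @$_ > 1 } values %by_md5;
    }
}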
Anyway - here's the code...
fdupes.pl (usage: fdupes.pl <start dir>):
#!/usr/bin/perl -w
# Usage: ./fdupes.pl <start directory>
use strict;
use Term::ReadKey;
use File::Find;

# testing - 0 for interactive mode, 1 to skip all deletion etc
my $testing = 0;
# skip files smaller than 100 bytes. Set to zero if you like...
my $minsize = 100;

my $filecount = my $bytecount = my $fileschecked = my $wasted = 0;
my %files = ();

&usage unless (@ARGV);

sub wanted {
    return unless -f;
    my $filesize = (stat($_))[7];
    $bytecount += $filesize;
    return unless $filesize > $minsize; # skip small files
    $filecount++;
    push @{$files{$filesize}}, $File::Find::name;
}

find(\&wanted, $ARGV[0] || ".");

# update progress display 1000 times maximum
my $update_period = int($filecount/1000)+1;
if ($fileschecked % $update_period == 0) {
    print "Progress: $fileschecked/$filecount\r";
    # note \r does carriage return, but NO LINE FEED
    # for progress display
}
my @dupesets;
# list of lists - @{$dupesets[0]} = (file1, file2)
# where file1 and file2 are dupes
foreach my $size (keys %files) {
    my @entries = @{$files{$size}};
    my $samesizecount = scalar @entries;
    if (@{$files{$size}} == 1) { # unique size
        $fileschecked++;
        next;
    }
    # duplicates by file size.. Check if files are the same
    while (my $base = shift @entries) {
        # get first entry in list under filesize
        my @dupes = ();
        my $count = 0;
        while ($count <= $#entries) {
            # go through all @entries
            my $compare = $entries[$count];
            if (&same($base, $compare)) {
                # remove "compare" from list so it can't be used
                # on next run
                splice(@entries, $count, 1);
                # removed "compare" from list - update progress
                if (++$fileschecked % $update_period == 0) {
                    print "Progress: $fileschecked/$filecount\r";
                }
                if (@dupes) {
                    # already have some dupes - just add duplicate
                    # #n to list
                    push @dupes, $compare;
                    $wasted += $size;
                } else {
                    # no dupes yet - include base file and duplicate
                    # #1 in list
                    push @dupes, ($base, $compare);
                    $wasted += $size;
                }
            } else {
                $count++;
                # only increase counter if not a dupe - note splice
                # will break $array[$position] loop otherwise
            }
        }
        if (@dupes) {
            push @dupesets, \@dupes;
        }
        # "base" file removed from list of files to check - update
        # progress meter
        if (++$fileschecked % $update_period == 0) {
            print "Progress: $fileschecked/$filecount\r";
        }
    }
}
if (@dupesets) {
    my @deletelist = ();
    # at least one set of duplicates exists
    # number of sets of duplicates
    my $dupesetcount = scalar(@dupesets);
    my $dupesetcounter = 0;
    foreach my $setref (@dupesets) {
        if ($testing) {
            print @$setref, "\n";
            next;
        }
        $dupesetcounter++;
        my @dupes = @$setref;
        print "Duplicates found ($dupesetcounter / $dupesetcount)",
              "... Should I keep...\n";
        my $count = 0;
        # print up list of options of which file to keep
        while ($count <= $#dupes) { # go through all @entries
            my $entry = $dupes[$count];
            print $count + 1, " : $entry\n";
            $count++;
        }
        # alternative options - keep all files, skip to end
        print "0: All\n";
        print "A: Skip all remaining duplicates\n";
        # use ReadKey to get user input
        ReadMode 4; # Turn off control keys
        my $key = '';
        while (not defined ($key = ReadKey(-1))) {
            # No key yet
        }
        ReadMode 0; # Reset tty mode before exiting
        if ($key eq 'A') {
            # skip any remaining dupes and get to deletion bit
            last;
        }
        # not a number or 'A' - default to zero (ie keep all files)
        $key = '0' unless ($key =~ /^\d+$/);
        if ($key == 0) { # ALL - don't delete anything
            #print "you chose: ALL\n";
        } elsif (defined $dupes[$key-1]) {
            print "you chose: ", $dupes[$key-1], "\n";
            my @list_to_delete = @dupes;
            # remove file to keep from list
            splice(@list_to_delete, $key-1, 1);
            # add rest to deletelist
            push @deletelist, @list_to_delete;
        } else {
            #print "you chose: invalid number... (nothing will",
            #      " be deleted)\n";
        }
        print "\n";
    }
    # confirm deletion if any files are needing deleting
    if (@deletelist) {
        print "\n------------------------\n";
        print "list of files to delete:\n";
        foreach (@deletelist) {
            print "$_\n";
        }
        print "\nAre you *sure* you want to delete all these files?",
              " (Y/N)\n";
        ReadMode 4; # Turn off control keys
        my $key = '';
        while (not defined ($key = ReadKey(-1))) {
            # No key yet
        }
        ReadMode 0; # Reset tty mode before exiting
        if (lc($key) eq 'y') {
            print "deleting\n";
            unlink @deletelist;
        } else {
            print "wussing out\n";
        }
    }
    1 while $wasted =~ s/^([-+]?\d+)(\d{3})/$1,$2/;
    print "$wasted bytes in duplicated files\n";
}
# routine to check equivalence in files. pass 1 checks first
# "line" of file (up to \n char), rest of file checked if 1st
# line matches
sub same {
    local($a, $b) = @_;
    open(A, $a) || die;
    open(B, $b) || die;
    if (<A> ne <B>) { # FIRST LINE is not the same
        return 0;     # not duplicates
    } else {          # try WHOLE FILE
        local $/ = undef;
        return <A> eq <B>;
    }
}

sub usage {
    print "Usage: $0 <start directory>\n";
    exit;
}