I have seen some threads around perlmonks regarding finding duplicate files.
I noticed that my idea was similar with a post by superfrink
a while ago in 2006.
You can find out more about how to use it here
So I used DBI,YAML,File::Find and SQLite and Digest::SHA1 to write a project that does this.
You can find more about it .
I have done tests and it works ok for 100.000 files and 2gb of total file size.But because of how
it was built I am sure it can scale to much more.
Any suggestions/oppinions are welcomed.
setup.pl
#this script will create the database which will be used
#CAREFUL! IF THERE EXISTS A PREVIOUS DATABASE IT WILL DELETE IT !!!
use strict;
use warnings;
use DBI;
`rm checksum_db.sqlite`;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
$dbh->do("CREATE TABLE checksums (id INTEGER PRIMARY KEY,checksum VARC
+HAR(42),size INTEGER,last_date_modified DATE,name VARCHAR(200) UNIQUE
+,is_dir VARCHAR(1),is_file VARCHAR(1),is_link VARCHAR(1),UNIQUE (chec
+ksum,name));");
config.yml
minsize: 64
directories:
- path: /usr
dir: 1
file: 1
link: 0
regex: .*
- path: /home/spx2/perlhobby
dir: 1
file: 1
link: 0
regex: .*
- path: /lib
dir: 1
file: 1
link: 0
regex: .*
build_database.pl
#this will be used just for the first run
#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
my $config_path = 'config.yml';
my $config = LoadFile($config_path);
#to add columns in db for link,dir,file to know what the name column s
+tands for...
sub add_to_db {
my ($checksum,$last_modif_time,$size,$name)=@_;
#maybe calculating is_* should be done in process_file
my $is_dir = (-d $name)?'Y':'N';
my $is_file = (-f $name)?'Y':'N';
my $is_link = (-l $name)?'Y':'N';
$dbh->do(
sprintf "INSERT INTO checksums (checksum,size,last_date_modifi
+ed,name,is_dir,is_file,is_link) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\
+"%s\",\"%s\",\"%s\");",
$checksum,
$size,
$last_modif_time->ymd,
$name,
$is_dir,
$is_file,
$is_link
);
};
sub delete_from_db {#remains to be completed
my ($name)=@_;
};
sub file2sha1 {
my $file=$_[0];
return '' if -d $file; #have to find out if to prune when a direct
+ory is found that doesn't match the regex
open my $f,"<$file";
my $sha1 = Digest::SHA1->new;
$sha1->addfile(*$f);
return $sha1->hexdigest;
}
sub process_file {
my $dir_configs=$_[0];
##optimisation using -d -l -f -s just once for return and also for
+ adding
#if current "file"(unix terminology) is a directory and the yaml c
+onfiguration
#tells us to eliminate directories from the search we do so by ret
+urning from the
#callback
return if -d $File::Find::name && ! $dir_configs->{dir};
return if -l $File::Find::name && ! $dir_configs->{link};
return if -f $File::Find::name && ! $dir_configs->{file};
return if -s $File::Find::name < $config->{minsize};
unless($File::Find::name =~ /$dir_configs->{regex}/) {
if(-d $File::Find::name) {
$File::Find::prune=1;
}
return;
}
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks)
= stat($File::Find::name);
my $last_modif_time=DateTime->from_epoch(epoch=>$mtime);
# printf "%s %s %s %s\n",
# $File::Find::name,
# file2sha1($File::Find::name),
# -s $File::Find::name,
# $last_modif_time;
add_to_db(file2sha1($File::Find::name),$last_modif_time,-s $File::
+Find::name,$File::Find::name);
#print Dumper $dir_configs;
};
for my $searched_dir_hash (@{ $config->{directories} }) {
# we skip the entry if it does not exist or it is not a directory
next unless (-e $searched_dir_hash->{path} && -d $searched_dir_has
+h->{path});
#we pass to the process_file function the yml configuration for th
+e current directory that is searched
find(
{ wanted=> sub { process_file($searched_dir_hash);} },
$searched_dir_hash->{path}
);
}
update_database.pl
#!/usr/bin/perl
use strict;
use warnings;
use File::Find;
use YAML qw/LoadFile/;
use Data::Dumper;
use Digest::SHA1 qw/sha1_hex/;
use DBI;
use DateTime;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
my $config_path = 'config.yml';
my $config = LoadFile($config_path);
sub add_to_db {
my ($checksum,$last_modif_time,$size,$name)=@_;
#maybe calculating is_* should be done in process_file
my $is_dir = (-d $name)?'Y':'N';
my $is_file = (-f $name)?'Y':'N';
my $is_link = (-l $name)?'Y':'N';
$dbh->do(
sprintf "INSERT INTO checksums (checksum,size,last_date_modifi
+ed,name,is_dir,is_file,is_link) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\
+"%s\",\"%s\",\"%s\");",
$checksum,
$size,
$last_modif_time->ymd,
$name,
$is_dir,
$is_file,
$is_link
);
}
sub update {
my ($name,$checksum,$last_modif_time)=@_;
$dbh->do(sprintf "UPDATE checksums SET checksum=\"%s\",last_date_m
+odified=\"%s\" WHERE name=\"%s\";",$checksum,$name,$last_modif_time->
+ymd);
}
sub find_or_update {
my ($name,$last_modif_time)=@_;
my $s=$dbh->prepare(sprintf "SELECT last_date_modified FROM checks
+ums WHERE name=\"%s\" ;",$name);
$s->execute;
my $results = $s->fetchall_arrayref;
if($results) {
#found it in the db;
return 2 if $last_modif_time->ymd eq $results->[0]->[0] ; #ret
+urn 2 if the entry is up to date
update($name,file2sha1($name),$last_modif_time);
return 1;# the entry is not up to date
}
return 0; #the entry has not be found- should be updated
};
sub file2sha1 {
my $file=$_[0];
return '' if -d $file; #have to find out if to prune when a direct
+ory is found that doesn't match the regex
open my $f,"<$file";
my $sha1 = Digest::SHA1->new;
$sha1->addfile(*$f);
return $sha1->hexdigest;
}
sub process_file {
my $dir_configs=$_[0];
return if -d $File::Find::name && ! $dir_configs->{dir};
return if -l $File::Find::name && ! $dir_configs->{link};
return if -f $File::Find::name && ! $dir_configs->{file};
return if -s $File::Find::name < $config->{minsize};
unless($File::Find::name =~ /$dir_configs->{regex}/) {
if(-d $File::Find::name) {
$File::Find::prune=1;
}
return;
}
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks)
= stat($File::Find::name);
my $last_modif_time=DateTime->from_epoch(epoch=>$mtime);
#find out if entry needs update and update it if necesary
#find_or_update returns 0 only if it hasnt found the file in the c
+hecksum db
unless(find_or_update($File::Find::name,$last_modif_time)) {
add_to_db(file2sha1($File::Find::name),$last_modif_time,-s $Fi
+le::Find::name,$File::Find::name);
#add it to db
};
};
for my $searched_dir_hash (@{ $config->{directories} }) {
next unless (-e $searched_dir_hash->{path} && -d $searched_dir_has
+h->{path});
find(
{ wanted=> sub { process_file($searched_dir_hash);} },
$searched_dir_hash->{path}
);
}
build_duplicate_script.pl
#!/usr/bin/perl
use strict;
use warnings;
use DBI;
use DateTime;
use Data::Dumper;
my $dbh = DBI->connect("dbi:SQLite:dbname=checksum_db.sqlite","","");
open my $script,">duplicate_erase_script.sh";
sub get_unique_checksums {
my $sql="SELECT checksum as groupsize FROM checksums GROUP BY size
+ HAVING groupsize > 1;";
#because groups of size 1 cannot have duplicates
my $sth=$dbh->prepare($sql);
$sth->execute;
my $results=$sth->fetchall_arrayref;
return map { $_->[0] } @{$results};
};
sub checksum2names {
my ($checksum)=@_;
my $sql=sprintf "SELECT name FROM checksums WHERE checksum=\"%s\";
+",$checksum;
my $sth=$dbh->prepare($sql);
$sth->execute;
my $results=$sth->fetchall_arrayref;
return map { $_->[0] } @{$results};
};
for my $checksum (get_unique_checksums()) {
my @same_checksum=checksum2names($checksum);
my $leader = shift @same_checksum;#take aside on element of the gr
+oup making it the leader
print $script "# duplicates of $leader follow:\n";
for my $name (@same_checksum) {#get all the others and write comma
+nds to delete them
print $script "# rm $name\n";
}
};
close $script;
EDIT: The latest version of this can be found (it has heavily mutated due to suggestions of the monks).
Re: scalable duplicate file remover
by dragonchild (Archbishop) on Mar 02, 2008 at 16:43 UTC
|
Without reading too deeply, DBM::Deep might have been easier for the existence checks.
My criteria for good software:
- Does it work?
- Can someone else come in, make a change, and be reasonably certain no bugs were introduced?
| [reply] [Watch: Dir/Any] |
Re: scalable duplicate file remover
by jwkrahn (Abbot) on Mar 03, 2008 at 02:47 UTC
|
sub file2sha1 {
my $file=$_[0];
return '' if -d $file; #have to find out if to prune when a directory is found that doesn't match the regex
open my $f,"<$file";
my $sha1 = Digest::SHA1->new;
$sha1->addfile(*$f);
return $sha1->hexdigest;
}
- You should open the file in "binary" mode to work correctly.
- You should verify that the file opened correctly.
- *$f makes no sense because $f is a lexical variable that contains a reference to a filehandle.
- You should probably use $sha1->digest instead which is half the size of $sha1->hexdigest.
sub file2sha1 {
my $file = $_[ 0 ];
return '' if -d $file; #have to find out if to prune when a direct
+ory is found that doesn't match the regex
open my $f, '<:raw', $file or do {
warn "Cannot open '$file' $!";
return;
};
my $sha1 = Digest::SHA1->new;
$sha1->addfile( $f );
return $sha1->digest;
}
| [reply] [Watch: Dir/Any] [d/l] [select] |
|
First of all thank you very much for the critique,it is very well welcomed from my part.
I will use it to improve the program.
1)why do you think the current method of opening the files does not yield correct results ?
(I compared my results of SHA1s against sha1sum unix utilitary and they came out ok,that's
why I'm asking).
2)you are right,I will do this
3)ok I understand,where could I read more about this ?
4)As I read the documentation and thinking that a number in base 10 should always present more
digits than its representation in base 16 I dont understand how it could be shorter in base 10.
I don't get why they say I will get a shorter string in a lower base.
Also they talk about using a single sha1 object and reusing it because of the reset() method that
can clear out the old data from it.
Do you think this will speed up things ?
| [reply] [Watch: Dir/Any] |
|
- From the documentation for Digest::SHA1:
$sha1->addfile($io_handle)
[ SNIP ]
In most cases you want to make sure that the $io_handle is in "binmode" before you pass it as argument to the addfile() method.
- OK. :-)
- Typeglobs and Filehandles
How do I pass filehandles between subroutines?
How can I use a filehandle indirectly?
- $sha1->digest returns a digest in binary form while $sha1->hexdigest is in hexadecimal form. For example:
$ perl -le'
my $digest = "\x02\x07\xFA\x78";
my $hex_digest = "0207FA78";
print for length( $digest ), length( $hex_digest );
'
4
8
- Update: reset() may or may not speed things up. You would have to compare both methods with Benchmark to be sure.
| [reply] [Watch: Dir/Any] [d/l] [select] |
Re: scalable duplicate file remover
by jwkrahn (Abbot) on Mar 03, 2008 at 08:20 UTC
|
sub process_file {
my $dir_configs=$_[0];
##optimisation using -d -l -f -s just once for return and also for adding
#if current "file"(unix terminology) is a directory and the yaml configuration
#tells us to eliminate directories from the search we do so by returning from the
#callback
return if -d $File::Find::name && ! $dir_configs->{dir};
You call stat on the file.
return if -l $File::Find::name && ! $dir_configs->{link};
You call lstat on the same file.
return if -f $File::Find::name && ! $dir_configs->{file};
You call stat on the same file again.
return if -s $File::Find::name < $config->{minsize};
You call stat on the same file again.
unless($File::Find::name =~ /$dir_configs->{regex}/) {
if(-d $File::Find::name) {
You call stat on the same file again.
$File::Find::prune=1;
}
return;
}
my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
$atime,$mtime,$ctime,$blksize,$blocks)
= stat($File::Find::name);
You call stat on the same file again. You declare 13 variables but you are only using one.
my $last_modif_time=DateTime->from_epoch(epoch=>$mtime);
# printf "%s %s %s %s\n",
# $File::Find::name,
# file2sha1($File::Find::name),
# -s $File::Find::name,
Commented out but if not you call stat on the same file again.
# $last_modif_time;
add_to_db(file2sha1($File::Find::name),$last_modif_time,-s $File::Find::name,$File::Find::name);
You call stat on the same file again. You call add_to_db() which calls stat or lstat three more times.
#print Dumper $dir_configs;
};
In total you call stat or lstat ten times on the same file (eleven times if you uncomment the printf statement.) You also use $File::Find::name in most places where $_ would have the same effect.
sub process_file {
my $dir_configs = $_[ 0 ];
##optimisation using -d -l -f -s just once for return and also for
+ adding
#if current "file"(unix terminology) is a directory and the yaml c
+onfiguration
#tells us to eliminate directories from the search we do so by ret
+urning from the
#callback
return if -l && ! $dir_configs->{ link }; # call lstat on current
+file to test for symlink
my ( $size, $mtime ) = ( stat )[ 7, 9 ];
return if -d _ && ! $dir_configs->{ dir };
return if -f _ && ! $dir_configs->{ file };
return if $size < $config->{ minsize };
unless ( $File::Find::name =~ /$dir_configs->{regex}/ ) {
if ( -d _ ) {
$File::Find::prune = 1;
}
return;
}
my $last_modif_time = DateTime->from_epoch( epoch => $mtime );
# print "$File::Find::name ", file2sha1( $_ ), " $size $last_modif_
+time\n",
add_to_db( file2sha1( $_ ), $last_modif_time, $size, $File::Find::
+name );
#print Dumper $dir_configs;
}
| [reply] [Watch: Dir/Any] [d/l] [select] |
|
Thank you very much for the optimisation proposed.
As I read from -X operators documentation I saw they
were using " _ " as you are using also.
Couldn't just one call to stat be performed and get all the information from there(including
the information provided by the -X operators) ?
| [reply] [Watch: Dir/Any] |
|
Yes, and in fact that is what I have shown in the example code I posted above.
| [reply] [Watch: Dir/Any] |
Re: scalable duplicate file remover
by reneeb (Chaplain) on Mar 03, 2008 at 10:06 UTC
|
In your setup.pl:
> `rm checksum_db.sqlite`;
You should use unlink "checksum_db.sqlite" to be platform independent! | [reply] [Watch: Dir/Any] |
Re: scalable duplicate file remover
by alpha (Scribe) on Mar 24, 2008 at 15:04 UTC
|
I must admit that this could've been written in a LOT less of code, using globs | [reply] [Watch: Dir/Any] |
|
|