#!/usr/bin/perl # Usage: ./fdupes.pl use strict; use warnings; use Term::ReadKey; use File::Find; # testing - 0 for interactive mode, 1 to skip all deletion etc my $testing = 0; # skip files smaller than 100 bytes. Set to zero if you like... my $minsize = 100; my $filecount = my $bytecount = my $fileschecked = my $wasted = 0; my %files = (); &usage unless (@ARGV); my $searchdir = $ARGV[0]; my $autodelete; if ( $ARGV[0] eq '-d' ) { $searchdir = $ARGV[1]; $autodelete = "yes"; } sub wanted { return unless -f; return if -l; my $filesize = ( stat($_) )[7]; $bytecount += $filesize; return unless $filesize > $minsize; # skip small files $filecount++; push @{ $files{$filesize} }, $File::Find::name; } find( \&wanted, $searchdir || "." ); # update progress display 1000 times maximum my $update_period = int( $filecount / 1000 ) + 1; if ( $fileschecked % $update_period == 0 ) { print "Progress: $fileschecked/$filecount\r"; # note \r does carriage return, but NO LINE FEED # for progress display } my @dupesets; # list of lists - @{$dupesets[0]} = (file1, file2) # where file1 and file2 are dupes foreach my $size ( keys %files ) { my @entries = @{ $files{$size} }; my $samesizecount = scalar @entries; if ( @{ $files{$size} } == 1 ) { # unique size $fileschecked++; next; } # duplicates by file size.. Check if files are the same while ( my $base = shift @entries ) { # get first entry in list under filesize my @dupes = (); my $count = 0; while ( $count <= $#entries ) { # go through all @entries my $compare = $entries[$count]; if ( &same( $base, $compare ) ) { # remove "compare" from list so it can't be used # on next run splice( @entries, $count, 1 ); # removed "compare" from list - update progress if ( ++$fileschecked % $update_period == 0 ) { print "Progress: $fileschecked/$filecount\r"; } if (@dupes) { # already have some dupes - just add duplicate # #n to list push @dupes, $compare; $wasted += $size; } else { # no dupes yet - include base file and duplicate # #1 in list push @dupes, ( $base, $compare ); $wasted += $size; } } else { $count++; # only increase counter if not a dupe - note splice # will break $array[$position] loop otherwise } } if (@dupes) { push @dupesets, \@dupes; } # "base" file removed from list of files to check - update # progress meter if ( ++$fileschecked % $update_period == 0 ) { print "Progress: $fileschecked/$filecount\r"; } } } if (@dupesets) { my @deletelist = (); # at least one set of duplicates exists # number of sets of duplicates my $dupesetcount = scalar(@dupesets); my $dupesetcounter = 0; if ($autodelete) { foreach my $setref (@dupesets) { my $firstdupe=shift @$setref; push @deletelist, @$setref ; } } else { foreach my $setref (@dupesets) { if ($testing) { print @$setref, "\n"; next; } $dupesetcounter++; my @dupes = @$setref; print "Duplicates found ($dupesetcounter / $dupesetcount)", "... Should I keep...\n"; my $count = 0; # print up list of options of which file to keep while ( $count <= $#dupes ) { # go through all @entries my $entry = $dupes[$count]; print $count + 1, " : $entry\n"; $count++; } # alternative options - keep all files, skip to end print "0: All\n"; print "A: Skip all remaining duplicates\n"; # use ReadKey to get user input ReadMode 4; # Turn off controls keys my $key = ''; while ( not defined( $key = ReadKey(-1) ) ) { # No key yet } ReadMode 0; # Reset tty mode before exiting if ( $key eq 'A' ) { # skip any remaining dupes and get to deletion bit last; } # not a number or 'A' - default to zero (ie keep all files) $key = '0' unless ( $key =~ /^\d+$/ ); if ( $key == 0 ) { # ALL - don't delete anything #print "you chose: ALL\n"; } elsif ( defined $dupes[ $key - 1 ] ) { print "you chose: ", $dupes[ $key - 1 ], "\n"; my @list_to_delete = @dupes; # remove file to keep from list splice( @list_to_delete, $key - 1, 1 ); # add rest to deletelist push @deletelist, @list_to_delete; } else { #print "you chose: invalid number... (nothing will", # " be deleted)\n"; } print "\n"; } } # confirm deletion if any files are needing deleting if (@deletelist) { print "\n------------------------\n"; print "list of files to delete:\n"; foreach (@deletelist) { print "$_\n"; } print "\nAre you *sure* you want to delete all these files?", " (Y/N)\n"; ReadMode 4; # Turn off controls keys my $key = ''; while ( not defined( $key = ReadKey(-1) ) ) { # No key yet } ReadMode 0; # Reset tty mode before exiting if ( lc($key) eq 'y' ) { print "deleting\n"; unlink @deletelist; } else { print "wussing out\n"; } } 1 while $wasted =~ s/^([-+]?\d+)(\d{3})/$1,$2/; print "$wasted bytes in duplicated files\n"; } # routine to check equivalence in files. pass 1 checks first # "line" of file (up to \n char), rest of file checked if 1st # line matches sub same { local ( $a, $b ) = @_; open( A, $a ) || die; open( B, $b ) || die; if ( ne ) { # FIRST LINE is not the same return 0; # not duplicates } else { # try WHOLE FILE local $/ = undef; return eq ; } } sub usage { print "Usage: $0 [-d] \n"; print " -d : autodelete, keeps first\n"; exit; }