http://www.perlmonks.org?node_id=224748
Category: Utility Scripts
Author/Contact Info Antonio Bellezza
Description:

A script to find and remove duplicate files in one or more directory. It serves the same purpose as salvadors's module (see also File::Find::Duplicates), but it's more efficient when discriminating different files of the same size.*

The program gets a speed-up by reducing file reads to a minimum. In most cases, it only reads small chunks from unique files and only files with duplicates are read completely. Thus, it is particularly fit for big collections of audio files or images (shameless advertisement ;).*


* - added and revised explanation, inspired by merlyn's comment.

The rationale behind this program is: verifying whether two files differ doesn't require reading them completely. The program first groups the files by size, then iteratively subdivides the groups according to some samples of the files. Sample size and distribution increase and identical files are read completely (the only way to prove they are really identical).

Update: It is safe to remove use of Digest::MD5 and sub hash, which could be used to experiment with different hashing methods but are not used by the program in its present form.

Update: this has evolved to a more complex program.

#!/usr/bin/perl -w

#------------------------------------------------------------
# dupfinder: find duplicate files
#------------------------------------------------------------
# Copyright Antonio Bellezza 2003
# mail: antonio@beautylabs.net
#------------------------------------------------------------
# This program is free software; you can redistribute it and/or modify
# it under the terms of version 2 of the GNU General Public License
# as published by the Free Software Foundation;
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#------------------------------------------------------------

use strict;

# Show program usage if no arguments are given
if ($#ARGV == -1) {
    usage();
    exit 0;
}

my $finder = Finder -> new( @ARGV );

my @group = $finder -> findDuplicates();

# Result printout and elaboration
for my $group ( @group ) {
    print "Possible duplicates of size $group->[0]{size}:\n";
    for (1..$#$group) {
    print "[$_] $group->[$_]\n";
    }
    print "\nACTION: [] continue  [1-$#$group] keep corresponding file
+ and remove the rest\n";
    my $input = <STDIN>;
    chomp $input;
    next if $input eq '';
    if ($input =~ m|([0-9]+)| && $1 > 0 && $1 <= $#$group) {

    for (0..($1-1), ($1+1)..$#$group) {
        my $delendum = $group -> [$_];
        print "Unlinking $delendum: ";
        unlink $delendum;
        print "done\n";
    }
    }
    print "\n";
}




#----------------------------------------
# usage()
# Show usage
#----------------------------------------
sub usage {
    print "Usage: dupfinder dir ...\n",
    "Find and interactively remove duplicate files\n",
}



package Finder;

#------------------------------------------------------------
# A finder is implemented as a hash
# $finder -> {groups} is the array of groups of possibly
# equal files
# Each group is an array whose first element is a hash
# with the various key attributes.
# Subsequent elements are the filenames in the group
# Example:
# [
#   [ { size=>0 }, 'empty.txt', 'null.dat', 'nothing_here' ],
#   [ { size=>1321, hash12=>'xyz' }, 'myfile.a', 'myfile.b' ],
#   [ { size=>1321, hash12=>'wtt' }, 'myfile.c', 'myfile.d' ]
# ]
#------------------------------------------------------------

use strict;
use IO::File;
use File::Find;
use Digest::MD5 qw( md5 );

use constant MINREADSIZE => 1024;
use constant MAXREADSIZE => 1024 * 1024;
use constant BLOCK   => 4096;

our $VERSION = '1.0';

our $handles = {};


#----------------------------------------
# new ( dir, ... )
# Create new finder
#----------------------------------------
sub new {
    my $class = shift;
    my $self = {
    dirs     => [ @_ ],
    groups   => [],
    terminal => []
    };
    return bless $self, $class;
}

#----------------------------------------
# readDirs ()
# Find all files and setup finder
#----------------------------------------
sub readDirs {
    my $self = shift;
    my $group = [ {} ];
    find( sub { -f && ( push @$group, $File::Find::name ) },
      @{$self->{dirs}} );
    $self -> {groups} = [ $group ];
}




#----------------------------------------
# findDuplicates()
# Return list of terminal groups
#----------------------------------------
sub findDuplicates {
    my $self = shift;
    my $hasher;
    $self -> readDirs();

#    print $self -> status;

    $hasher = { process  => \&size,
        name     => 'size',
        terminal => sub { shift==0 } };

    $self -> {groups} = [ $self -> partition( $self -> {groups} [0], $
+hasher ) ];

#    print $self -> status;

    $self -> prune();

#    print $self -> status;

    
    for ( @{$self -> {groups}} ) {
    my @processList = ( $_ );
    my $size = $_ -> [0] {size};
    my $iterator = Finder::Looper -> new( $size );

    while ( @processList and my ( $start, $length ) = $iterator -> nex
+t() ) {
        $hasher = { process => \&sample,
            args    => [ $start, $length ] };
        my @newList = ();
        for (@processList) {
        my @subgroup = ( $self -> partition( $_, $hasher ) );
        $self -> prune( \@subgroup );
        push @newList, @subgroup;
        }
        @processList = @newList;
    }
    closeHandles( @processList );
    $self -> addTerminal( @processList );
    }

    return @{ $self -> {terminal} };
}



#----------------------------------------
# prune ()
# prune ( \@group )
# Remove groups only containing one file
# If argument is omitted, remove from $self -> {groups}
# Add to terminal groups with terminal key
# Return number of remaining groups
#----------------------------------------
sub prune {
    my $self = shift;
    my $src = $_[0] || $self -> {groups};
    my $counter = 0;
    for ( my $i = $#$src; $i>=0; $i--) {
    my $group = $src -> [$i];
    if ( $group -> [0] {terminal} ) {
        # Remove and add to terminal groups
        $self -> addTerminal( $group );
        closeHandles( $group );
        splice @$src, $i, 1;
    } elsif ( $#$group > 1 ) {
        # Keep in place
        $counter ++;
    } else {
        # Drop group only containing one file
        closeHandles( $group );
        splice @$src, $i, 1;
    }
    }
    return $counter;
}


#----------------------------------------
# partition( $group, hasher [, hasher par, ... ] )
# Execute a discriminatory step and create subgroups
# Return list of groups
# A hasher is a hash ref of type
# {
#    process  => sub { taking fileName as first arg, key as second arg
+ument },
#    name     => hash-key name / undef if not added,
#    terminal => sub { shift is a terminal key or not },
#    args     => [ extra arguments to pass ]
# }
#----------------------------------------
sub partition {
    my $self = shift;
    my ($group, $hasher, @hasherPar) = @_;

    my $key = shift @{$group};
    my %bucket = ();

    for (@{$group}) {
    my $hash = $hasher -> {process} -> ($_, $key,
                        @{ $hasher -> {args} || [] },
                        @hasherPar);
    push @{ $bucket {$hash} ||= [] }, $_;
    }

    my @result = ();

    for (keys %bucket) {
    # Create a clone of the key
    my $newKey = { %$key };
    $newKey -> { $hasher -> {name} } = $_ if $hasher -> {name};
    $newKey -> {terminal} = 1
        if ( $hasher -> {terminal} && $hasher -> {terminal} -> ($_) );
    push @result, [ $newKey, @{$bucket {$_}} ];
    }

    return @result;
}


#----------------------------------------
# status()
# Return string showing finder status
#----------------------------------------
sub status {
    my $self = shift;
    my $res = 'Groups:';
    for (grep {$_ > 0} map {$#$_} @{$self -> {groups}}) {

    $res .= " $_";
    }

    $res .= "\nTerminal:";
    for (grep {$_ > 0} map {$#$_} @{$self -> {terminal}}) {
    $res .= " $_";
    }
    $res .= "\n";
    return $res;
}


#----------------------------------------
# function
#----------------------------------------
# fileHandle( filename )
# return fileHandle or undef
#----------------------------------------
sub fileHandle {
    my ($fileName) = @_;
    unless ($handles -> {$fileName}) {
    my $handle = IO::File -> new();
    $handle -> open("<$fileName") || return undef;
    $handles -> {$fileName} = $handle;
    }
    return $handles -> {$fileName};
}

#----------------------------------------
# function
#----------------------------------------
# closeHandle( filename )
# close handle
#----------------------------------------
sub closeHandle {
    my ($fname) = @_;
    delete $handles -> {$fname};
}

#----------------------------------------
# function
#----------------------------------------
# closeHandles( $group, ... )
# Close handles of filenames contained in group
#----------------------------------------
sub closeHandles {
    for my $group (@_) {
    for (1..$#$group) {
        closeHandle( $group -> [$_] );
    }
    }
}

#----------------------------------------
# addTerminal( \@file, ... )
# Add to terminal sets arrays of files with given size
#----------------------------------------
sub addTerminal {
    my $self = shift;
    push @{ $self -> {terminal} }, @_;
}

{
    my $error = 0;
#----------------------------------------
# hash ( filename, key [, start [, length ]] )
# Apply MD5 to file segment
#----------------------------------------
sub hash {
    my ($fname, $key, $start, $length) = @_;
    $start ||= 0;

    my $res;

    # Return a consecutive error code if unable to open file
    my $handle = openHandle( $fname ) || return "Error " . $error++;
    $handle -> seek( $start, 0 );

    if ($length) {
    $handle -> read( $res, $length );
    $res = md5( $res );
    } else {
    my $md5 = Digest::MD5->new();
    while ( $handle -> read( $res, BLOCK ) ) {
        $md5->add( $res );
    }
    $res = $md5->digest();
    }

    return $res;
}


#----------------------------------------
# sample ( filename, key [, start [, length ] ] )
#----------------------------------------
sub sample {
    my ($fname, $key, $start, $length) = @_;
    $start ||= 0;

    my $res;

    # Return a consecutive error code if unable to open file
    my $handle = fileHandle( $fname ) || return "Error " . $error++;
    $handle -> seek( $start, 0 );

    if ($length) {
    $handle -> read( $res, $length );
    } else {
    $res = '';
    my $buffer;
    while ( $handle -> read( $buffer, BLOCK ) ) {
        $res .= $buffer;
    }
    }

    return $res;
}
}


#----------------------------------------
# size ( filename )
# Find file size
#----------------------------------------
sub size {
    my $fname = shift;
    return (stat ($_))[7];
}






#------------------------------------------------------------
# Finder::Looper
#------------------------------------------------------------
# Iterator providing starting points and lengths
# for interlaced reads
#------------------------------------------------------------
package Finder::Looper;

use constant MINREADSIZE => Finder::MINREADSIZE;
use constant MAXREADSIZE => Finder::MAXREADSIZE;
use constant BLOCK       => Finder::BLOCK;


#----------------------------------------
# new( size [, minsize [, maxsize ]] )
#----------------------------------------
sub new {
    my $class = shift;
    my ( $size, $minsize, $maxsize ) = @_;
    $minsize ||= MINREADSIZE;
    $maxsize ||= MAXREADSIZE;
    bless {
    size     => $size,
    minsize  => $minsize || MINREADSIZE,
    maxsize  => $maxsize || MAXREADSIZE,
    readsize => $minsize || MINREADSIZE,
    oldsize  => 0,
    i        => 0,
    gap      => 1 << nextLog2( $size )
    }, $class;
}

#----------------------------------------
# next()
# return ( start, length )
# return () if the iteration is over
#----------------------------------------

sub next {
    my $self = shift;

    # Return EOL if the gap has become smaller than the size
    # unless it's the first iteration ( oldsize = 0 )
    if ( $self -> {readsize} > $self -> {gap} && $self -> {oldsize} > 
+0 ) {
    return ();
    }
    
    if ( $self -> {i} * $self -> {gap} >= $self -> {size} ) {
    $self -> {i} = 0;
    $self -> {oldsize} = $self -> {readsize};
    $self -> {gap} >>= 1;
    $self -> {readsize} <<= 1
        if ( $self -> {readsize} < $self -> {gap}
         && $self -> {readsize} < $self -> {maxsize} );
    }

    my $offset = ( $self -> {i} % 2 ) ? 0 : $self -> {oldsize};
    
    my $start  = $self -> {i} * $self -> {gap} + $offset;
    my $length = $self -> {readsize} - $offset;
    $length    = $self -> {size} - $start if $start + $length > $self 
+-> {size};

    $self -> {i} ++;

    if ( $length <= 0 ) {
    return $self -> next();
    } else {
    return ( $start, $length );
    } 
}

#----------------------------------------
# function
#----------------------------------------
# nextLog2( positive integer )
# return exponent of nearest power of 2
# not less than integer
# Warning: returns at most the biggest power of
# two expressed by an integer
#----------------------------------------
sub nextLog2 {
    my $i = shift;
    my $pow = 1;
    my $exp = 0;
    while ( $pow < $i && $pow > 0 ) {
    $pow <<= 1;
    $exp++;
    }
    return $exp;
}



1;