Beefy Boxes and Bandwidth Generously Provided by pair Networks
The stupid question is the question not asked

Directory Tree Comparison Module (File::DiffTree)

by bikeNomad (Priest)
on Jun 23, 2001 at 00:23 UTC ( #90858=sourcecode: print w/replies, xml ) Need Help??
Category: Utility Scripts
Author/Contact Info bikeNomad Ned Konz,
Description: This is a package that I wrote after seeing some other scripts here that did similar things. This package allows the behavior on same/different files as well as comparison to be pluggable using CODE references. It may become a CPAN module if the response here is positive enough. An example program that uses it is at the end.
use strict;
package File::DiffTree;
# File::DiffTree -- Compare two directory hierarchies
# By Ned Konz,
# $Revision$

use vars '$VERSION';
use Algorithm::Diff 1.01 ();
use File::Find ();

    $File::DiffTree::VERSION = "0.1";

sub _findFiles
    my $topDir = shift;
    my $statFields = shift;
    my $reject = shift;

    my $topDirLength = length($topDir);
    my @files;
    File::Find::find( sub {
        my @stat = stat($_);
        if (!@stat) { warn "can't stat $File::Find::name : $!\n"; retu
+rn }
        return if -d _;
        my $fileName = substr($File::Find::name, $topDirLength);
        return if defined($reject) && &$reject($File::Find::name, @sta
        push(@files, [ $fileName, @stat[ @$statFields ] ]);
    }, $topDir);
    return \@files;

sub diffTree
    my $dirA = shift;
    my $dirB = shift;
    my $userOptions = shift || { };
    my $numberOfFields;
    my $numberOfSignificantFields;
    my $foldCase = sub { $_[0] };       # default no fold
    my %options = (
        onlya           => sub { },
        onlyb           => sub { },
        match           => sub { },
        statfields      => [ 7, 9 ],    # size, mtime
        hash            => sub {        # default=stringize them
            my $arr = shift;
            join($;, $foldCase->($arr->[0]),
                @{$arr}[ 1 .. $numberOfSignificantFields ])
        reject          => undef,
        significantfields => undef,
        foldcase        => 0,
        sort        => sub { $foldCase->($a->[0]) cmp $foldCase->($b->
+[0]) },
        # normalize the user's keys (DWIM)
        map { my $key = $_;
            $key =~ tr/A-Z_/a-z/d;
            $key, $userOptions->{$_}
        } keys(%$userOptions)

    $numberOfFields = scalar(@{$options{statfields}});
    $numberOfSignificantFields = defined($options{significantfields})
        ? $options{significantfields}
        : $numberOfFields;
    $numberOfSignificantFields = $numberOfFields
        if $numberOfSignificantFields > $numberOfFields;
    $foldCase = sub { lc($_[0]) } if $options{foldcase};

    my $filesA = _findFiles($dirA, $options{statfields}, $options{reje
    my $filesB = _findFiles($dirB, $options{statfields}, $options{reje

    # sort by name
    @$filesA = sort { $options{sort}->() } @$filesA;
    @$filesB = sort { $options{sort}->() } @$filesB;

        $filesA, $filesB, {
            MATCH       => sub {
                $options{match}->($filesA->[$_[0]], $filesB->[$_[1]]) 
            DISCARD_A   => sub { $options{onlya}->($filesA->[$_[0]]) }
            DISCARD_B   => sub { $options{onlyb}->($filesB->[$_[1]]) }
        }, $options{hash} );


=head1 NAME

File::DiffTree - Compare two directory hierarchies


  use File::DiffTree;

  File::DiffTree::diffTree($dirA, $dirB, {
    Match => sub { print $_[0]->[0], " matches\n" },
    # Fold_Case => 1,  # if on OS that doesn't care like windoze


C<File::DiffTree> compares the files in two directory hierarchies, cal
optional user-supplied callbacks for files in just one or the other di
as well as for files that match.

Matching is determined by matching the name (with optional case foldin
+g), as
well as zero or more of the numbers output by the C<stat()> call. You 
specify how many fields from C<stat> will be looked at for a match.

You can also specify how many fields from stat will be provided to you
callback routines. See OPTIONS below for the options to

=head1 OPTIONS

The third argument to File::DiffTree::diffTree is a hash reference
that can contain the following options. Option names may have undersco
+res or
capital letters as desired (that is, OnlyA, O_n_L_ya, Only_A, onlya, a
+nd only_a
are equivalent).

Since nothing by default is done for B<only_a>, B<only_b>, or B<match>
+, you
must provide at least one of these for any interesting behavior.

=over 4

=item B<only_a>

=item B<only_b>

The B<only_a> and B<only_b> options supply CODE references to user cal
routines that are called when a file appears in only one of the two di
trees, or exists in both but has different significant stat fields.

By default, nothing is done for these files.

The argument to these routines is an array that contains the filename 
to the starting directory, as well as whatever fields from stat were d
or specified with the B<stat_fields> option.

  File::DiffTree::diffTree( $dir1, $dir2, {
      only_a => sub { print "only in $dir1: ", $_[0]->[0], "\n" },
      only_b => sub { print "only in $dir2: ", $_[0]->[0], "\n" },

Of course, you can also specify a reference to a separate subroutine t
+hat you've

  File::DiffTree::diffTree( $dir1, $dir2, {
      only_a => \&onlyA,
      only_b => \&onlyB,

=item B<match>

The B<match> option supplies a CODE reference to a user callback routi
+ne that
is called when a file appears to match (based on name and significant 
from the stat call).

By default, nothing is done for these files.

The arguments to the B<match> routine are two arrays (one for each dir
that contain the filename relative to the starting directory, as well 
whatever fields from stat were defaulted or specified with the B<stat_

  File::DiffTree::diffTree( $dir1, $dir2, {
      match => sub { print "in both $dir1 and $dir2: ", $_[0]->[0], "\
+n" },

=item B<stat_fields>

The B<stat_fields> option specifies which fields from C<stat> will be
passed to the B<only_a>, B<only_b>, or B<match> user callbacks. This i
an ARRAY reference consisting of numbers from 0 through 12. By default
it is:

  stat_fields => [ 7, 9 ],

That is, the size and mtime (last modified time) of the files are pass
The possible field numbers are:

   0 dev      device number of filesystem
   1 ino      inode number
   2 mode     file mode  (type and permissions)
   3 nlink    number of (hard) links to the file
   4 uid      numeric user ID of file's owner
   5 gid      numeric group ID of file's owner
   6 rdev     the device identifier (special files only)
   7 size     total size of file, in bytes
   8 atime    last access time in seconds since the epoch
   9 mtime    last modify time in seconds since the epoch
  10 ctime    inode change time (NOT creation time!) in seconds since 
+the epoch
  11 blksize  preferred block size for file system I/O
  12 blocks   actual number of blocks allocated

If you want to compare only the name and size, but still have access t
+o the
modification time and inode, you can specify this using:

  File::DiffTree::diffTree( $dir1, $dir2, {
      match => sub { print "in both $dir1 and $dir2: ",
          $_[0]->[0], "\n" },
      stat_fields => [ 7, 9, 1 ], # size, mtime, inode
      significant_fields => 1,    # just size

Unless the B<significant_fields> option below is specified, all of the
B<stat_fields> will be considered when looking for a match. So by defa
file comparisons compare name, size, and modification time.

=item B<significant_fields>

The B<significant_fields> option is a number that specifies how many o
the fields from C<stat> will be considered when comparing files. By de
all of the fields will be compared. If you supply a 0 for B<significan
only the name will be compared.

This option is provided so that you can have separate control over how
+ many
fields from C<stat> you are passed and how many of those fields are co
by C<File::DiffTree>.

=item B<reject>

The B<reject> option is a CODE reference that can be provided to filte
+r files
that are unwanted. It is called from inside C<File::Find::find> with t
full filename and all the fields from C<stat> (this is 13 arguments).

Also, the C<$_> variable is set to the last component of the filename,
the current directory is the directory of the file, and the C<_> pseud
file handle can be tested.

If it returns true, the file will not be considered.

For instance, to ignore files that are unreadable or end in C<.bak>,
you can do this:

  File::DiffTree::diffTree( $dir1, $dir2, {
      match => sub { print "in both $dir1 and $dir2: ",
          $_[0]->[0], "\n" },
      reject => sub { /\.bak$/ || ! -r _ } 

=item B<fold_case>

If the B<fold_case> option is provided and is true, filenames will be
compared ignoring case differences. The filenames passed to the user
callbacks will have the actual case preserved.

This is probably what is wanted under Windows. You can do this for

    Fold_Case => ($^O eq 'Win32'),

=item B<sort>

The B<sort> option is a CODE reference that supplies an optional subro
that will be called when sorting the lists of files. It will have the 
+two arrays
to be compared passed in via the package variables C<$File::DiffTree::
+a> and
C<$File::DiffTree::b>. By default, sorting is by filename, with case f
if the B<fold_case> option is set.

You probably won't need this option. If you do, you may have to supply
+ the
B<hash> option as well.

=item B<hash>

The B<hash> option is a CODE reference that supplies an optional subro
that will be called to generate a key to determine uniqueness of the f
By default, this key will consist of the file name, and all the stat f
specified by the B<significant_fields> option, turned into strings and
separated by the C<$;> character (by default C<\034>). Specify the B<h
option if you need to do something different. The argument to this sub
is an array reference like those passed to the B<only_a> and B<match> 

You shouldn't need this option. If you do, you'll probably have to sup
the B<sort> option as well.


=head2 EXPORT

File::DiffTree doesn't export anything. Typing is good for you. Call d
as File::DiffTree::diffTree .

=head1 AUTHOR

By Ned Konz,

=head1 LICENSE

This module is licensed under the same license as Perl itself.

=head1 SEE ALSO




# vim: ts=4 sw=4

Here's a little demo program:

#!/usr/bin/perl -w
# This is a demo program to show the use of File::DiffTree
# by Ned Konz
use strict;

use File::DiffTree;
use File::Compare;

if (@ARGV != 2) {
    print STDERR "usage: $0 dir1 dir2\n";

my $dirA = shift;
my $dirB = shift;

sub onlyA { print "Only in $dirA: ", $_[0]->[0], "\n" }

sub onlyB { print "Only in $dirB: ", $_[0]->[0], "\n" }

# This will be called if names match.
# The file size or contents (or mtimes or inodes) could still be diffe
sub match {
    my $arr1 = shift;
    my $arr2 = shift;
    my $fn1 = $dirA . $arr1->[0];
    my $fn2 = $dirB . $arr2->[0];
    my $compare = $arr1->[1] <=> $arr2->[1];    # different if sizes d
    if (! $compare)
        my $retval = File::Compare::compare($fn1, $fn2);
        if ($retval == -1) {
            print STDERR "Problems opening $fn1 or $fn2: $!\n";
        $compare = $retval;
    if (! $compare) {
        print "Match: ", join('|', @$arr1), "\t", join('|', @$arr2), "
    else {
        print "Different: ", join('|', @$arr1), "\t", join('|', @$arr2
+), "\n"

File::DiffTree::diffTree($dirA, $dirB, {
    Only_A              => \&onlyA,
    Only_B              => \&onlyB,
    Match               => \&match,
    Significant_Fields  => 0,   # just name (not size, mtime or inode)
    Reject              => sub { /(?:~|\.bak|\.tmp)$/ || ! -r _ },
    Fold_Case => ($^O eq 'Win32'),      # if on OS that doesn't care l
+ike windoze

# vim: ts=4 sw=4
Log In?

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://90858]
and God said, "Let Newton be!"...

How do I use this? | Other CB clients
Other Users?
Others cooling their heels in the Monastery: (6)
As of 2018-05-21 10:11 GMT
Find Nodes?
    Voting Booth?