http://www.perlmonks.org?node_id=626257
Category: Utility Scripts
Author/Contact Info Corion
Description:

A counterpart to part, it allows you to join two files side by side according to common values. This is similar to the join UNIX command except that the join command expects the input files to be sorted according to the keys, while this program will slurp the second file into a hash and then output the result according to the order of the first file.

Optionally (and untested) it can use a tied hash as on-disk storage in the case that the storage for the files is larger than the available RAM.

#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use File::Temp qw( :POSIX );

use vars qw($VERSION);
$VERSION = '0.03';

# Try to load Pod::Usage and install a fallback if it doesn't exist
eval {
    require Pod::Usage;
    Pod::Usage->import();
    1;
} or do {
    *pod2usage = sub {
        die "Error in command line.\n";
    };
};

GetOptions(
    "disk" => \my $do_tie,
    "on|j=s" => \my $joincol,
    "left|j1|1=s" => \my @left_key_cols,
    "right|j2|2=s" => \my @right_key_cols,
    "output|o" => \my @output_fieldlist,
    "delimiter|t|d=s" => \my $delimiter,
    "output-delimiter|od" => \my $output_delimiter,
    "missing|v=i" => \my @missing,
    "null|n=s" => \my $nullvalue,
    "warn-on-duplicates|u=s" => \my @warn_on_duplicates,
    "die-on-duplicates|s=s" => \my @die_on_duplicates,
    "smart-duplicates" => \my $smart_duplicates,
    "progress|verbose|p" => \my $progress,
    'help'              => \my $help,
    'version'           => \my $version,
) or pod2usage(2);
pod2usage(1) if $help;
if (defined $version) {
    print "$VERSION\n";
    exit 0;
};
pod2usage("$0: No files given.")  if ((@ARGV == 0) && (-t STDIN));

$delimiter ||= "\t";
$nullvalue ||= "";
$output_delimiter ||= $delimiter;
$joincol ||= 1;
if (! @left_key_cols) { @left_key_cols = $joincol; };
if (! @right_key_cols) { @right_key_cols = $joincol; };
my %output_missing = map { $_ => $_ } @missing;

my %col_count;

for (\@left_key_cols, \@right_key_cols) {
    @$_ = map { split /,/ } @$_;
};
if (@left_key_cols != @right_key_cols) {
    local $" = ",";
    warn "Left  keys: @left_key_cols\n";
    warn "Right keys: @right_key_cols\n";
    die "Differing number of key columns between left and right - that
+ is wrong.\n";
};

# Adjust the indices for the join columns:
for (@left_key_cols, @right_key_cols) {
    $_--
};

my @output_cols = ('1.*','2.%');
if (@output_fieldlist) {
    @output_cols = map { split /,/ } @output_fieldlist;
};

if (@warn_on_duplicates and not @die_on_duplicates) {
    @warn_on_duplicates = (2);
};

my %on_duplicate;
$on_duplicate{ $_ } = sub { warn "Duplicate key '$_[0]' for row >>$_[1
+]<< in file $_[2]\n" } for @warn_on_duplicates;
$on_duplicate{ $_ } = sub { die  "Duplicate key '$_[0]' for row >>$_[1
+]<< in file $_[2]\n" } for @die_on_duplicates;

my %right; # The index into the right file
my %seen;  # The keys we processed from the left file

my @CLEANUP;
if ($do_tie) {
    require DB_File;
    my $rn = tmpnam;
    tie %right, 'DB_File', $rn;
    my $sn = tmpnam;
    tie %seen, 'DB_File', $sn;
    push @CLEANUP, $rn, $sn;
};
END {
    if ($do_tie) {
        untie %right;
        untie %seen;
    };
    for (@CLEANUP) {
        unlink $_ or warn "Couldn't remove tempfile '$_' : $!\n";
    };
};

my ($left,$right) = @ARGV;

# Read the right file into the hash
open my $rfh, "<", $right
    or die "Couldn't read '$right': $!";
open my $lfh, "<", $left
    or die "Couldn't read '$left': $!";

sub key {
    my ($cols,$col_info) = @_;
    return join $delimiter, @{ $cols }[ @$col_info ];
};

sub output {
    my @lr = (@_);
    my @output = map { /^(\d)\.(\d+)/ or die "Invalid column spec '$_'
+"; $lr[$1-1]->[$2-1] } @output_cols;
    print join($delimiter, @output), "\n";
};

sub expand_output_columns {
    my (@list) = @_;

    my %keycols = (
         1 => +{ map { $_+1 => 1 } @left_key_cols },
         2 => +{ map { $_+1 => 1 } @right_key_cols },
    );

    my @res = map { /(\d)\.\*/ ? (map { "$1.$_" } (1..$col_count{ $1 }
+))
                  : /(\d)\.\%/ ? (map { "$1.$_" } grep { ! exists $key
+cols{$1}{$_}} (1..$col_count{ $1 }))
                  : $_
              } @list;
    @res
};

warn "Reading $right"
    if $progress;
while (<$rfh>) {
    chomp;
    my @right_cols = split /\Q$delimiter\E/;
    $col_count{ 2 } ||= @right_cols;
    my $key = key( \@right_cols, \@right_key_cols );
    if ($right{ $key } and $on_duplicate{2}) {
        my $diff = $right{ $key } ne $_;
        if ($diff or !$smart_duplicates) {
            $on_duplicate{2}->($key,$_,$right)
        };
    };
    $right{ $key } = $_;
};

# Read the left file and output the generated lines (if any)
warn "Processing $left"
    if $progress;
my $expanded_output_columns;
while (<$lfh>) {
    chomp;
    my @left_cols = split /\Q$delimiter\E/;
    $col_count{ 1 } ||= @left_cols;
    my $key = key( \@left_cols, \@left_key_cols );

    if ($seen{ $key } and $on_duplicate{1}) {
        my $diff = $seen{ $key } ne $_;
        if ($diff or !$smart_duplicates) {
            $on_duplicate{1}->($key,$_,$left)
        };
    };
    $seen{ $key }++;
    my $out;
    my @right_cols;
    if (exists $right{ $key }) {
        @right_cols = split /\Q$delimiter\E/, $right{ $key };
    } else {
        @right_cols = ($nullvalue) x $col_count{ 2 };
    };

    if (exists $right{ $key } or $output_missing{1}) {
        if (! $expanded_output_columns) {
            @output_cols = expand_output_columns(@output_cols);
            $expanded_output_columns++;
        };
        output \@left_cols, \@right_cols;
    };
};

@output_cols = expand_output_columns(@output_cols);

if ($output_missing{2}) {
    warn "Writing right-missing keys"
        if $progress;
    my @left_cols = ($nullvalue) x $col_count{ 1 };
    while ((my ($key,$v)) = each %right) {
        if (! $seen{ $key }) {
            my @right_cols = split /\Q$delimiter\E/, $v;
            output \@left_cols, \@right_cols;
        };
    };
};

__END__

=head1 NAME

join - join two files by common key columns

=head1 SYNOPSIS

  join.pl [OPTIONS] FILE1 FILE2

  join.pl --on 1,2 file1.txt file2.txt

  join.pl --left 1,2 --right 3,4 file1.txt file2.txt

=head1 OPTIONS

=item B<--on COL> - specify a single column number to join both files 
+on

This is a shorthand for C<--left COL --right COL>

=item B<--missing FILE> - output rows only in one file

C<--missing 1> will output rows that only exist in the left file.

=item B<--null VAL> - string for the null value

When a row is output through the C<--missing> option, the missing
values will be replaced by the value given.

The default is an empty string, "".

Example: --null NULL

=item B<--warn-on-duplicates FILE> - output a warning if duplicate key
+s are found in the file

=item B<--die-on-duplicates FILE> - die if duplicate keys are found in
+ the file

These options govern how the program behaves when it encounters
duplicate keys in a file.

=item B<--smart-duplicates> - be smart about duplicates

This setting enables smart duplicate handling that will
only consider a row as duplicate if the key is identical but
the remaining values differ.

=item B<--left COL1,COL2> - specify key columns for the left file

=item B<--right COL1,COL2> - specify key columns for the right file

The column counts starting at 1. The default column is 1.

=item B<--output COL1,COL2> - specify columns to output

If you want to reorder or omit columns use this to
list the columns. Each column must be in the format
C<M.N> where M is either 1 for the left file or
2 for the right file, and N is the column number.

There are two shorthands:

C<M.*> will include all columns from the source file
in source order.

C<M.%> will include all columns from the source file
except the key columns in source order.

The default is C<1.* 2.%>, which will append
the non-key columns of the right file to the left file.

=item B<--delimiter DEL> - specify column input delimiter

The default input delimiter is a tab. No automatic
delimiter recognition is done yet.

=item B<--output-delimiter DEL> - specify output column delimiter

The output column delimiter defaults to the input
column delimiter.

=item B<--progress> - be verbose in the progress

Some diagnostic messages will be output to STDERR
as the program progresses.

=item B<--disk> - use disk memory for joining instead of RAM

This will use disk memory for storing the index instead
of using RAM.

=item B<--version> - print program version

Outputs the program version.

=item B<--help> - print this page

Outputs this help text.