Beefy Boxes and Bandwidth Generously Provided by pair Networks
Keep It Simple, Stupid
 
PerlMonks  

join - join two files according to a common key

by Corion (Pope)
on Jul 12, 2007 at 15:16 UTC ( #626257=sourcecode: print w/replies, xml ) Need Help??
Category: Utility Scripts
Author/Contact Info Corion
Description:

A counterpart to part, it allows you to join two files side by side according to common values. This is similar to the join UNIX command except that the join command expects the input files to be sorted according to the keys, while this program will slurp the second file into a hash and then output the result according to the order of the first file.

Optionally (and untested) it can use a tied hash as on-disk storage in the case that the storage for the files is larger than the available RAM.

#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use File::Temp qw( :POSIX );

use vars qw($VERSION);
$VERSION = '0.03';

# Try to load Pod::Usage and install a fallback if it doesn't exist
eval {
    require Pod::Usage;
    Pod::Usage->import();
    1;
} or do {
    *pod2usage = sub {
        die "Error in command line.\n";
    };
};

GetOptions(
    "disk" => \my $do_tie,
    "on|j=s" => \my $joincol,
    "left|j1|1=s" => \my @left_key_cols,
    "right|j2|2=s" => \my @right_key_cols,
    "output|o" => \my @output_fieldlist,
    "delimiter|t|d=s" => \my $delimiter,
    "output-delimiter|od" => \my $output_delimiter,
    "missing|v=i" => \my @missing,
    "null|n=s" => \my $nullvalue,
    "warn-on-duplicates|u=s" => \my @warn_on_duplicates,
    "die-on-duplicates|s=s" => \my @die_on_duplicates,
    "smart-duplicates" => \my $smart_duplicates,
    "progress|verbose|p" => \my $progress,
    'help'              => \my $help,
    'version'           => \my $version,
) or pod2usage(2);
pod2usage(1) if $help;
if (defined $version) {
    print "$VERSION\n";
    exit 0;
};
pod2usage("$0: No files given.")  if ((@ARGV == 0) && (-t STDIN));

$delimiter ||= "\t";
$nullvalue ||= "";
$output_delimiter ||= $delimiter;
$joincol ||= 1;
if (! @left_key_cols) { @left_key_cols = $joincol; };
if (! @right_key_cols) { @right_key_cols = $joincol; };
my %output_missing = map { $_ => $_ } @missing;

my %col_count;

for (\@left_key_cols, \@right_key_cols) {
    @$_ = map { split /,/ } @$_;
};
if (@left_key_cols != @right_key_cols) {
    local $" = ",";
    warn "Left  keys: @left_key_cols\n";
    warn "Right keys: @right_key_cols\n";
    die "Differing number of key columns between left and right - that
+ is wrong.\n";
};

# Adjust the indices for the join columns:
for (@left_key_cols, @right_key_cols) {
    $_--
};

my @output_cols = ('1.*','2.%');
if (@output_fieldlist) {
    @output_cols = map { split /,/ } @output_fieldlist;
};

if (@warn_on_duplicates and not @die_on_duplicates) {
    @warn_on_duplicates = (2);
};

my %on_duplicate;
$on_duplicate{ $_ } = sub { warn "Duplicate key '$_[0]' for row >>$_[1
+]<< in file $_[2]\n" } for @warn_on_duplicates;
$on_duplicate{ $_ } = sub { die  "Duplicate key '$_[0]' for row >>$_[1
+]<< in file $_[2]\n" } for @die_on_duplicates;

my %right; # The index into the right file
my %seen;  # The keys we processed from the left file

my @CLEANUP;
if ($do_tie) {
    require DB_File;
    my $rn = tmpnam;
    tie %right, 'DB_File', $rn;
    my $sn = tmpnam;
    tie %seen, 'DB_File', $sn;
    push @CLEANUP, $rn, $sn;
};
END {
    if ($do_tie) {
        untie %right;
        untie %seen;
    };
    for (@CLEANUP) {
        unlink $_ or warn "Couldn't remove tempfile '$_' : $!\n";
    };
};

my ($left,$right) = @ARGV;

# Read the right file into the hash
open my $rfh, "<", $right
    or die "Couldn't read '$right': $!";
open my $lfh, "<", $left
    or die "Couldn't read '$left': $!";

sub key {
    my ($cols,$col_info) = @_;
    return join $delimiter, @{ $cols }[ @$col_info ];
};

sub output {
    my @lr = (@_);
    my @output = map { /^(\d)\.(\d+)/ or die "Invalid column spec '$_'
+"; $lr[$1-1]->[$2-1] } @output_cols;
    print join($delimiter, @output), "\n";
};

sub expand_output_columns {
    my (@list) = @_;

    my %keycols = (
         1 => +{ map { $_+1 => 1 } @left_key_cols },
         2 => +{ map { $_+1 => 1 } @right_key_cols },
    );

    my @res = map { /(\d)\.\*/ ? (map { "$1.$_" } (1..$col_count{ $1 }
+))
                  : /(\d)\.\%/ ? (map { "$1.$_" } grep { ! exists $key
+cols{$1}{$_}} (1..$col_count{ $1 }))
                  : $_
              } @list;
    @res
};

warn "Reading $right"
    if $progress;
while (<$rfh>) {
    chomp;
    my @right_cols = split /\Q$delimiter\E/;
    $col_count{ 2 } ||= @right_cols;
    my $key = key( \@right_cols, \@right_key_cols );
    if ($right{ $key } and $on_duplicate{2}) {
        my $diff = $right{ $key } ne $_;
        if ($diff or !$smart_duplicates) {
            $on_duplicate{2}->($key,$_,$right)
        };
    };
    $right{ $key } = $_;
};

# Read the left file and output the generated lines (if any)
warn "Processing $left"
    if $progress;
my $expanded_output_columns;
while (<$lfh>) {
    chomp;
    my @left_cols = split /\Q$delimiter\E/;
    $col_count{ 1 } ||= @left_cols;
    my $key = key( \@left_cols, \@left_key_cols );

    if ($seen{ $key } and $on_duplicate{1}) {
        my $diff = $seen{ $key } ne $_;
        if ($diff or !$smart_duplicates) {
            $on_duplicate{1}->($key,$_,$left)
        };
    };
    $seen{ $key }++;
    my $out;
    my @right_cols;
    if (exists $right{ $key }) {
        @right_cols = split /\Q$delimiter\E/, $right{ $key };
    } else {
        @right_cols = ($nullvalue) x $col_count{ 2 };
    };

    if (exists $right{ $key } or $output_missing{1}) {
        if (! $expanded_output_columns) {
            @output_cols = expand_output_columns(@output_cols);
            $expanded_output_columns++;
        };
        output \@left_cols, \@right_cols;
    };
};

@output_cols = expand_output_columns(@output_cols);

if ($output_missing{2}) {
    warn "Writing right-missing keys"
        if $progress;
    my @left_cols = ($nullvalue) x $col_count{ 1 };
    while ((my ($key,$v)) = each %right) {
        if (! $seen{ $key }) {
            my @right_cols = split /\Q$delimiter\E/, $v;
            output \@left_cols, \@right_cols;
        };
    };
};

__END__

=head1 NAME

join - join two files by common key columns

=head1 SYNOPSIS

  join.pl [OPTIONS] FILE1 FILE2

  join.pl --on 1,2 file1.txt file2.txt

  join.pl --left 1,2 --right 3,4 file1.txt file2.txt

=head1 OPTIONS

=item B<--on COL> - specify a single column number to join both files 
+on

This is a shorthand for C<--left COL --right COL>

=item B<--missing FILE> - output rows only in one file

C<--missing 1> will output rows that only exist in the left file.

=item B<--null VAL> - string for the null value

When a row is output through the C<--missing> option, the missing
values will be replaced by the value given.

The default is an empty string, "".

Example: --null NULL

=item B<--warn-on-duplicates FILE> - output a warning if duplicate key
+s are found in the file

=item B<--die-on-duplicates FILE> - die if duplicate keys are found in
+ the file

These options govern how the program behaves when it encounters
duplicate keys in a file.

=item B<--smart-duplicates> - be smart about duplicates

This setting enables smart duplicate handling that will
only consider a row as duplicate if the key is identical but
the remaining values differ.

=item B<--left COL1,COL2> - specify key columns for the left file

=item B<--right COL1,COL2> - specify key columns for the right file

The column counts starting at 1. The default column is 1.

=item B<--output COL1,COL2> - specify columns to output

If you want to reorder or omit columns use this to
list the columns. Each column must be in the format
C<M.N> where M is either 1 for the left file or
2 for the right file, and N is the column number.

There are two shorthands:

C<M.*> will include all columns from the source file
in source order.

C<M.%> will include all columns from the source file
except the key columns in source order.

The default is C<1.* 2.%>, which will append
the non-key columns of the right file to the left file.

=item B<--delimiter DEL> - specify column input delimiter

The default input delimiter is a tab. No automatic
delimiter recognition is done yet.

=item B<--output-delimiter DEL> - specify output column delimiter

The output column delimiter defaults to the input
column delimiter.

=item B<--progress> - be verbose in the progress

Some diagnostic messages will be output to STDERR
as the program progresses.

=item B<--disk> - use disk memory for joining instead of RAM

This will use disk memory for storing the index instead
of using RAM.

=item B<--version> - print program version

Outputs the program version.

=item B<--help> - print this page

Outputs this help text.


Replies are listed 'Best First'.
Re: join - join two files according to a common key
by Anonymous Monk on Mar 12, 2018 at 16:09 UTC
    So, using your code above, I ran
    ./join.pl --delimiter , --on 1 --missing 1 --missing 2 --warn-on-dupli +cates 1 --warn-on-duplicates 2 --progress usa_ldap.csv HPSMcmdb.csv
    In the case where the key was missing in file 1, there is no key output for the line. I guess what would work for my data is if the key from the right file were output in the left key column (which for me is column 1). I am looking into the code to figure out how to code that specific special case - but if anyone already has a solution, I would really appreciate it. Thank you
      When I wrote In the case where the key was missing in file 1 it might be better worded "when there is a line with a key in file 2 that does not have a corresponding line in file 1".
Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://626257]
help
Chatterbox?
[vedagiri89]: that is what i can't getting solution
[vedagiri89]: recently purchased centos server and doing migration of app
[choroba]: did you copy any dependencies?
[choroba]: all XS code needs to be recompiled
[vedagiri89]: install perl5.16 in the new server and try to run appache
[hippo]: Which version of CentOS?
[vedagiri89]: how to fix: Attempt to reload DynaLoader.pm aborted
[hippo]: Ah, presumably 7 then.
[vedagiri89]: CentOS Linux release 7.0.1406 (Core)

How do I use this? | Other CB clients
Other Users?
Others meditating upon the Monastery: (8)
As of 2018-06-19 11:27 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    Should cpanminus be part of the standard Perl release?



    Results (113 votes). Check out past polls.

    Notices?