Beefy Boxes and Bandwidth Generously Provided by pair Networks
Just another Perl shrine
 
PerlMonks  

Re: Join files using perl

by choroba (Cardinal)
on Jan 11, 2013 at 23:15 UTC ( [id://1012993]=note: print w/replies, xml ) Need Help??


in reply to Join files using perl

Years ago, I wrote this, and I still use it:
#!/usr/bin/perl # Author : E. Choroba use strict; use warnings; no warnings qw(uninitialized); use Getopt::Std; our $VERSION = 1.2; =item chjoin Normal C<join> from I<coreutils> was not enough for us because it worked strange on unsorted or duplicate lines. This one suits us better. =cut getopts('ha:e:it:v:o:1:2:D:'); our ($opt_h, $opt_a, $opt_e, $opt_i, $opt_t, $opt_v, $opt_o, $opt_1, $opt_2, $opt_D); if ($opt_h) { print<<EOH; Usage: chjoin [options] file1 file2 where options are: (SIDE is 1 or 2) (same as join): -a SIDE : print also unpairable lines coming from file SIDE -e EMPTY : replace missing fields with EMPTY -h : this help -i : ignore case -t CHAR : input separator (default <tab>) -v SIDE : process unpairable lines from file SIDE (not to be use +d with -a) -o FORMAT : output format (of the form SIDE.FIELD,SIDE.FIELD, default : match 1.* 2.*) -1 FIELD : join on field FIELD in file1 (default 1) -2 FIELD : join on field FIELD in file2 (default 1) (the new one): -D method : how to treat duplicate entries: Method | Duplicates in | Matching | file1 | file2 | -------+------ +-------+------------------------------ +------- 0 | no | warn | match with 1st occurence in f +ile1 1 | no | yes | generate all combinations 2 | yes | yes | match one-to-one in respectiv +e order 3 | yes | yes | generate all combinations EOH exit; } die "You must specify two files. See chjoin -h.\n" if @ARGV != 2; die "Invalid method\n" if $opt_D and $opt_D !~ /^[0-3]$/; warn "Warning: both -a and -v used.\n" if $opt_a and $opt_v; open my $F1, $ARGV[0] or die "Cannot open $ARGV[0].\n"; open my $F2, $ARGV[1] or die "Cannot open $ARGV[1].\n"; $opt_t //= "\t"; $opt_1 ||= 1; $opt_1--; $opt_2 ||= 1; $opt_2--; # Parse file 1 my %hash; while (my $line = <$F1>) { chomp $line; my @fields = split/\Q$opt_t\E/, $line; my $field = $fields[$opt_1]; $field = lc $field if $opt_i; if ($opt_D < 2) { if (exists $hash{$field}) { my $j = join $opt_t, @{ $hash{$field} }; die <<"EOD"; FATAL: Duplicate entry in file1: <$fields[$opt_1]> 1: $j 2: $line EOD } else { $hash{$field} = \@fields; } } else { $hash{$field} = [] unless exists$hash{$field}; push @{ $hash{$field} }, \@fields; } } # Parse file 2 my @output; my @alines; my %to_undef; while (my$line = <$F2>) { chomp $line; my @fields = split/\Q$opt_t\E/, $line; my $field = $fields[$opt_2]; $field = lc $field if $opt_i; if (exists $hash{$field}) { # match if (defined $hash{$field}) { # first match for D0, else just m +atch if ($opt_v != 1) { if ($opt_D == 3) { for my $ref (@{ $hash{$field} }) { push @output, join ($opt_t, @$ref), $line unless $opt_v == 2; } } else { my @match; if ($opt_D < 2){ @match = @{ $hash{$field} }; } else { @match = @{ shift @{ $hash{$field} } }; } push @output, join($opt_t, @match), $line unless $opt_v == 2; } } $hash{$field} = undef if ! $opt_D or $opt_D > 1 && !@{ $hash{$field} }; $to_undef{$field} = 1 if $opt_D > 0; } else { # second match for D0 warn "Warning: Duplicate entry in file2: <$fields[$opt_2]> +\n$line\n" unless $opt_D; push @output, q(), $line . "\n" if $opt_v == 2; push @alines, $line if $opt_a == 2; } } else { push @output, q(), $line . "\n" if $opt_v == 2; push @alines, $line if $opt_a == 2; } } @hash{ keys %to_undef } = undef; if ($opt_v == 1) { if ($opt_D < 2) { @output = map { join($opt_t, @{ $hash{$_} }), "\n"; } grep defined $hash{$_}, keys %hash; } else { for (grep defined $hash{$_}, keys %hash) { push @output, join($opt_t, @$_), "\n" for @{ $hash{$_} }; } } } if ($opt_a == 1) { if ($opt_D < 2) { @alines = map { join($opt_t, @{ $hash{$_} }); } grep defined $hash{$_}, keys %hash; } else { for (grep defined $hash{$_}, keys %hash) { push @alines, join($opt_t, @$_) for @{ $hash{$_} }; } } } # Format output chomp @output; for (my $i = 0; $i < @output; $i += 2) { my $out; my @fields1 = split /\Q$opt_t\E/, $output[$i]; my @fields2 = split /\Q$opt_t\E/, $output[$i + 1]; unless ($opt_o) { if ($opt_v == 1) { $opt_o = "1." . ($opt_1 + 1); } else { $opt_o = "2." . ($opt_2 + 1); } for (my $i = 1; $i <= @fields1; $i++) { $opt_o .= ",1.$i" unless $i == $opt_1 + 1; } for (my $i = 1; $i <= @fields2; $i++){ $opt_o .= ",2.$i" unless $i == $opt_2 + 1; } } for my $outf (split /,/ => $opt_o) { if ($outf =~ /([12])\.([0-9]+)/) { if ($1 eq'1') { ($out .= (($_ = $fields1[$2 - 1]) ? $_ : $opt_e) . $opt_t) unless $opt_v == 2; } else { ($out .= (($_ = $fields2[$2 - 1]) ? $_ : $opt_e) . $opt_t) unless $opt_v == 1; } } else { $out = "$outf$opt_t"; } } $out =~ s/.$/\n/; print $out; } print "$_\n" for @alines;
لսႽ† ᥲᥒ⚪⟊Ⴙᘓᖇ Ꮅᘓᖇ⎱ Ⴙᥲ𝇋ƙᘓᖇ

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://1012993]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others cooling their heels in the Monastery: (8)
As of 2024-04-23 13:02 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found