Update: I've copied this to where people can see it. Hopefully it will make sense to someone
Well, I think I have something that works — I don't really know how efficient it is, so any advice would be most welcome:
Datatools::Parsecsv is a simple in-house module providing the parse_csv subroutine we use:
sub parse_csv {
    # Split one delimited record into a list of field values.
    #   $text  - the record to split
    #   $delim - delimiter character, or a descriptive "Type - char"
    #            pair such as 'Comma - ,' (the type part is unused here)
    # Returns the list of values; with no delimiter the whole line is
    # returned as a single value.  Quoted fields may contain the
    # delimiter and backslash-escaped characters.
    my $text  = shift;
    my $delim = shift;
    my $type  = undef;
    # Accept descriptive delimiters of the form 'Pipe - |'.
    if ($delim && $delim =~ /comma|tab|pipe|fixed/i){
        ($type, $delim) = $delim =~ m/\A (.+) \s - \s (.+) \z/xms;
    }
    my @new = ();
    if ($delim){
        # Escape the delimiter before interpolating it into the
        # pattern, so metacharacters such as '|' or '.' are matched
        # literally instead of corrupting the regex.
        my $d = quotemeta $delim;
        while ($text =~ m{
            # quoted field, backslash escapes allowed (pattern from MRE)
            "([^\"\\]*(?:\\.[^\"\\]*)*)"$d?
            | ([^$d]+)$d?      # unquoted field
            | $d               # bare delimiter == empty field
        }gx){
            # $+ is the last defined capture; undefined for the bare
            # delimiter branch, which means an empty field.
            if (defined $+){
                push @new, $+;
            }
            else{
                push @new, '';
            }
        }
        # a trailing delimiter implies one final empty field
        push @new, '' if substr($text, -1, 1) eq $delim;
    }
    else{
        # no delimiter: the whole record is a single value
        $new[0] = $text;
    }
    return @new;    # list of field values
}
Anyway, below is the code, which I've tested on a few files, and it appears to work. Please let me know if any of it could be made more efficient — I have commented the code as thoroughly as possible.
#!c:/usr/bin/perl
use strict;
use warnings;
use Datatools::Parsecsv;
# Array index positions of the fields that make up the match key.
my @field_selection = qw(23 24 25 26);
# Files to cross-match.  The names must be quoted: bare File1 etc.
# are barewords, which do not compile under "use strict".
my @file_list = ('File1',
                 'File2',
                 'File3',
                );
my %field_key = ();    # field number -> { field value -> occurrence count }
my %file_key  = ();    # file name    -> { concatenated record key -> record }
my $record_key_count = undef;    # NOTE(review): appears unused below - confirm
# Location of the sample record output file.
my $proof_file = 'K:\Perl Development\Test Files\J02394_proofs.csv';
# Read every input file, split each record, and index it two ways:
#   %field_key : per-field occurrence counts for each value
#   %file_key  : full record keyed by file and concatenated key
foreach my $file (@file_list){
    my $header = 1;
    # 3-arg open with a lexical handle (avoids the 2-arg form's
    # mode-injection pitfall and the bareword global handle).
    open my $infile, '<', $file
        or die "Cannot open file $file for reading: $!\n";
    while (my $record = <$infile>){
        chomp $record;
        # skip the header record of each file
        if ($header == 1){
            $header = 0;
            next;
        }
        # split the CSV record into @fields
        my @fields = parse_csv($record, ',');
        my $record_key = undef;
        # build a pipe-delimited concatenated key from the selected
        # fields, counting occurrences of each value as we go
        foreach my $field_number (@field_selection){
            $record_key .= '|' . $fields[$field_number];
            $field_key{$field_number}{$fields[$field_number]}++;
        }
        # file name -> concatenated record key -> full record
        $file_key{$file}{$record_key} = $record;
    }
    close $infile;
}
# PROOF_FILE stays a package filehandle because build_key() and
# check_key_match() print to it by name; the open is upgraded to
# the safer 3-arg form.
open (PROOF_FILE, '>', $proof_file)
    or die " cannot write to proof file $proof_file: $!\n";
my $built_key = undef;
# Working state shared with the subs below (file-scoped lexicals,
# reset at the start of every optimisation pass):
my %hash_value_to_sort = ();  # generic hash for value-sorting (see HashValueSort)
my %smallest_field = ();      # tracks the field with the smallest value count
my %tried_field = ();         # keys already tried, per field, across passes
my %exhausted = ();           # fields whose keys ran out -> global_match
# Each pass builds candidate keys until one matches a record, then
# starts a fresh pass.  build_key() calls exit() when no keys are
# left, which is how this loop terminates.  (This replaces the
# original goto-label construct with an ordinary loop.)
while (1) {
    %hash_value_to_sort = ();
    %smallest_field = ();
    %tried_field = ();
    %exhausted = ();
    my $match_result = 0;
    # keep building keys until one matches a record
    while ($match_result == 0){
        $built_key = build_key();
        $match_result = check_key_match($built_key);
    }
}
# Build one candidate match pattern by picking a key value for each
# selected field.  Fields whose values are used up contribute a
# 'global_match' placeholder instead.  Side effects: updates
# %exhausted, %tried_field and %smallest_field; prints each chosen
# key to STDOUT; closes PROOF_FILE and exits the program when no
# field has any untried values left.
# Returns the concatenated pattern, '\|' separated (literal
# backslash-pipe, so the result can be used directly as a regex).
sub build_key{
my $still_keys_left = undef;
my $appended_keys = undef;
# cycle through the selected fields
foreach my $field_number (@field_selection){
# once a field's keys are exhausted it stays on global_match
# until a successful match resets %exhausted (next pass)
if (exists $exhausted{$field_number}){
$appended_keys .= '\|' . 'global_match';
next;
}
# get a candidate key for this field and append it
my $found_key = &get_field_key($field_number);
print "$field_number:" . ${$found_key}{'key'} . "\n";
$appended_keys .= '\|' . ${$found_key}{'key'};
# a global_match return means every value for this field has
# been tried, so mark it exhausted; if it was also flagged as
# the smallest field, discard the smallest-field tracking
# entirely so another field can take over
if (${$found_key}{'key'} =~ m/ global_match /xms){
$exhausted{$field_number} = '1';
if (exists $smallest_field{$field_number}){
%smallest_field = ();
}
}
# otherwise this field still has untried values
else{
$still_keys_left = 1;
}
# remember the key tried for this field in case of multiple passes
${$tried_field{$field_number}}{${$found_key}{'key'}} = 1;
# no smallest-field bookkeeping for exhausted fields
if (exists $exhausted{$field_number}){
next ;
}
# first pass: record this field's occurrence count and flag it
# as the current smallest
if (not defined $smallest_field{'number'}){
$smallest_field{'number'} = ${$found_key}{'number'};
$smallest_field{$field_number} = '1';
}
# later fields: take over the smallest flag if this key's
# occurrence count is lower
elsif (${$found_key}{'number'} < $smallest_field{'number'}){
$smallest_field{'number'} = ${$found_key}{'number'};
$smallest_field{$field_number} = '1';
}
}
# every field exhausted: nothing more to match, so finish up and
# terminate the program here
if (not defined $still_keys_left){
close PROOF_FILE;
exit;
}
# otherwise return the assembled pattern
return $appended_keys;
}
# Pick the next candidate value for one field.
#   $field - the field number to fetch a key for
# Returns a hashref: { key => value, number => occurrence count }.
# Falls back to { key => 'global_match', number => '0' } when no
# untried value remains for the field.
sub get_field_key{
my $field = shift;
# copy this field's counts into the shared sort hash so the
# HashValueSort comparator can see them
%hash_value_to_sort = %{ $field_key{$field} };
# Walk the values ordered by HashValueSort (descending count).
# NOTE(review): the original comment claimed lowest-count-first,
# but HashValueSort sorts highest first - confirm which is intended.
CANDIDATE:
for my $candidate ( sort HashValueSort keys %hash_value_to_sort ) {
# when this field carries the smallest-count flag, skip any
# value that an earlier pass already tried
next CANDIDATE
if exists $smallest_field{$field}
&& exists ${$tried_field{$field}}{$candidate};
# first acceptable value wins
return {'key' => $candidate,
'number' => $hash_value_to_sort{$candidate}
};
}
# no valid key available (all tried or all used up):
# match anything for this field
return {'key' => 'global_match',
'number' => '0'
};
}
# Try the built pattern against every stored record key.
#   $check_key - pattern from build_key() ('\|' separated values,
#                possibly containing global_match placeholders)
# On the first matching record: deletes its component values from
# %field_key, and (unless it was a duplicate) prints it to the proof
# file and STDOUT and removes it from %file_key, then returns 1.
# Returns 0 when no file contains a matching record.
sub check_key_match{
my $check_key = shift;
# turn each global_match placeholder into a real wildcard
$check_key =~ s/\|global_match/\|\.\+/g;
FILE:
foreach my $file (@file_list){
RECORD:
foreach my $key (keys %{$file_key{$file}}){
next RECORD if $key !~ m/\A $check_key \z/xms;
# delete_keys_found() returns 0 when every component value
# was already removed, i.e. the record is a duplicate; only
# non-duplicates get printed and removed
if (&delete_keys_found($key) == 1){
print PROOF_FILE ${$file_key{$file}}{$key} . "\n";
print STDOUT ${$file_key{$file}}{$key} . "\n";
delete ${$file_key{$file}}{$key};
}
# a match was found either way
return 1;
}
}
# no match in any file
return 0;
}
# Remove the component values of a matched record key from %field_key
# so they cannot be matched again.
#   $delete_keys - concatenated record key ('|val1|val2|...')
# Returns 1 if at least one component was still present (record is
# not a duplicate), 0 if all were already deleted (duplicate - the
# caller should not print the record).
sub delete_keys_found{
my $delete_keys = shift;
my @delete_keys = split ('\|', $delete_keys);
my $found_key = 0;
# split drops trailing empty fields, and the leading '|' leaves an
# empty element at index 0, so the list must hold one element MORE
# than there are selected fields.  (The original bound of
# $#field_selection padded one short, leaving an undef lookup when
# the final field was empty.)
while ($#delete_keys < @field_selection){
push @delete_keys, '';
}
# start at 1: index 0 is the empty string before the leading '|'
my $fld_num = 1;
foreach my $field (@field_selection){
if (exists ${$field_key{$field}}{$delete_keys[$fld_num]}){
delete ${$field_key{$field}}{$delete_keys[$fld_num]};
$found_key = 1;
}
$fld_num++;
}
# 0 here means every component was a dupe
return $found_key;
}
# sort comparator: order the keys of %hash_value_to_sort by their
# values, numerically, largest count first (descending)
sub HashValueSort{
return $hash_value_to_sort{$b} <=> $hash_value_to_sort{$a};
}