#!c:/usr/bin/perl use strict; use warnings; use Datatools::Parsecsv; my @field_selection = qw(23 24 25 26); #ARRAY INDEX LOCATIONS my @file_list = (File1, File2, File3 ); my %field_key = (); my %file_key = (); my $record_key_count = undef; #set location of the sample record output file my $proof_file = 'K:\Perl Development\Test Files\J02394_proofs.csv'; foreach my $file (@file_list){ my $header = 1; open (INFILE, "$file") || die "Cannot open file $file for reading: $!\n"; while (my $record = ){ chomp $record; #ignore record header if ($header == 1){ $header = 0; next; } #split fields into @fields my @fields = parse_csv($record, ','); my $record_key = undef; #build up concatenated key with pipe delimiter #create field_key{field number} -> {field_key} = occurences foreach my $field_number (@field_selection){ $record_key .= '|'.$fields[$field_number]; ${$field_key{$field_number}}{$fields[$field_number]}++; } #create #file_key{filename}->{concatenated record key} = full record ${$file_key{$file}}{$record_key} = $record; } close INFILE; } open (PROOF_FILE, ">$proof_file") || die " cannot write to proof file $proof_file: $!\n"; my $built_key = undef; OPTIMISE: #Generic hash for value sorting hashes my %hash_value_to_sort = (); #keep track of smallest field tests my %smallest_field = (); #keep track of tried fields for multiple passes my %tried_field = (); #keep track of smallest fields that revert to #global_match my %exhausted = (); my $match_result = 0; #recurse built keys until match result == 1 while ($match_result == 0){ $built_key = &build_key(); $match_result = &check_key_match($built_key); } goto OPTIMISE; sub build_key{ my $still_keys_left = undef; my $appended_keys = undef; #cycle through field selection foreach my $field_number (@field_selection){ #when field keys exhausted #keep global until succesfull match #resets %exhausted if (exists $exhausted{$field_number}){ $appended_keys .= '\|' . 'global_match'; next; } #get a key for each field and build up final key my $found_key = &get_field_key($field_number); print "$field_number:" . ${$found_key}{'key'} . "\n"; $appended_keys .= '\|' . ${$found_key}{'key'}; #if field key returns global match then all #field keys have been exhausted. No need to #check for for smallest for this field anymore #so clear %smallest-field if it relates to this #field if (${$found_key}{'key'} =~ m/ global_match /xms){ $exhausted{$field_number} = '1'; if (exists $smallest_field{$field_number}){ %smallest_field = (); } } #otherwise this field still has keys left else{ $still_keys_left = 1; } #keep track of tried keys for this field incase #we have multiple passes ${$tried_field{$field_number}}{${$found_key}{'key'}} = 1; #don't bother with defining smallest once fields #exhausted go to next field if (exists $exhausted{$field_number}){ next ; } #1st definition #flag the field number and record number of #occurances that the key was found. Flag the field #as smallest. if (not defined $smallest_field{'number'}){ $smallest_field{'number'} = ${$found_key}{'number'}; $smallest_field{$field_number} = '1'; } #otherwise check current number of occurences for #this key and replace smallest if lower.Flag the #field as smallest. elsif (${$found_key}{'number'} < $smallest_field{'number'}){ $smallest_field{'number'} = ${$found_key}{'number'}; $smallest_field{$field_number} = '1'; } } #if no keys left to find, close the proof file and exit #the program if (not defined $still_keys_left){ close PROOF_FILE; exit; } #otherwise return the appended key return $appended_keys; } sub get_field_key{ #field we want to get a key for my $field = shift; #generic hash for value sorting %hash_value_to_sort = %{$field_key{$field}}; #cycle keys lowest number of occurrences first #this helps to optimise the record selection foreach my $key ( sort HashValueSort keys %hash_value_to_sort ) { #check if the field is flagged smallest occurence #only select next if key not already tried if (exists $smallest_field{$field}){ if (exists ${$tried_field{$field}}{$key}){ next; } } #return key and number of occurances return {'key' => $key, 'number' => $hash_value_to_sort{$key} }; } #if no valid key avaiable (i.e. all tried or all keys found) #return a global match for this field return {'key' => 'global_match', 'number' => '0' }; } sub check_key_match{ my $check_key = shift; #substitute with global pattern match $check_key =~ s/\|global_match/\|\.\+/g; #recurse through each file until a record key is #found in the file hash #print matching record from %file_key hash #delete matching keys from %field_key hash #delete matching record from $file_key hash foreach my $file (@file_list){ foreach my $key (keys %{$file_key{$file}}){ if ($key =~ m/\A $check_key \z/xms){ my $not_dupe = &delete_keys_found($key); if ($not_dupe == 1){ print PROOF_FILE ${$file_key{$file}}{$key} . "\n"; print STDOUT ${$file_key{$file}}{$key} . "\n"; delete ${$file_key{$file}}{$key}; } #flag match found return 1; } } } #flag no match found return 0; } sub delete_keys_found{ my $delete_keys = shift; my @delete_keys = split ('\|', $delete_keys); my $found_key = 0; #make up any blank last field after split on '|' while ($#delete_keys < $#field_selection){ push @delete_keys,''; } #ignore empty first index after split on '|' my $fld_num = 1; foreach my $field (@field_selection){; if (exists ${$field_key{$field}}{$delete_keys[$fld_num]}){ delete ${$field_key{$field}}{$delete_keys[$fld_num]}; $found_key = 1; } $fld_num++; } #if no keys found to delete then all were dupes #so flag and do not print record return $found_key; } sub HashValueSort{ $hash_value_to_sort{$b} <=> $hash_value_to_sort{$a}; }