Update: I've copied this to where people can see it. Hopefully it will make sense to someone
Well, I think I have something that works — I don't really know how efficient it is, so any advice would be most welcome:
Datatools::Parsecsv is a simple in-house module providing the parse_csv subroutine we use:
sub parse_csv {
    # Split one delimited record into a list of field values.
    #   $text  - the record to split
    #   $delim - delimiter character, or a descriptive "Type - char"
    #            pair such as 'Comma - ,' (the type part is unused here)
    # Returns the list of values; with no delimiter the whole line is
    # returned as a single value.  Quoted fields may contain the
    # delimiter and backslash-escaped characters.
    my $text  = shift;
    my $delim = shift;
    my $type  = undef;
    # Accept descriptive delimiters of the form 'Pipe - |'.
    if ($delim && $delim =~ /comma|tab|pipe|fixed/i){
        ($type, $delim) = $delim =~ m/\A (.+) \s - \s (.+) \z/xms;
    }
    my @new = ();
    if ($delim){
        # Escape the delimiter before interpolating it into the
        # pattern, so metacharacters such as '|' or '.' are matched
        # literally instead of corrupting the regex.
        my $d = quotemeta $delim;
        while ($text =~ m{
            # quoted field, backslash escapes allowed (pattern from MRE)
            "([^\"\\]*(?:\\.[^\"\\]*)*)"$d?
            | ([^$d]+)$d?      # unquoted field
            | $d               # bare delimiter == empty field
        }gx){
            # $+ is the last defined capture; undefined for the bare
            # delimiter branch, which means an empty field.
            if (defined $+){
                push @new, $+;
            }
            else{
                push @new, '';
            }
        }
        # a trailing delimiter implies one final empty field
        push @new, '' if substr($text, -1, 1) eq $delim;
    }
    else{
        # no delimiter: the whole record is a single value
        $new[0] = $text;
    }
    return @new;    # list of field values
}
Anyway, below is the code, which I've tested on a few files, and it appears to work. Please let me know if any of it could be made more efficient — I have commented the code as thoroughly as possible.
#!c:/usr/bin/perl
use strict;
use warnings;
use Datatools::Parsecsv;
# Array index positions of the fields that make up the match key.
my @field_selection = qw(23 24 25 26);
# Files to cross-match.  The names must be quoted: bare File1 etc.
# are barewords, which do not compile under "use strict".
my @file_list = ('File1',
                 'File2',
                 'File3',
                );
my %field_key = ();    # field number -> { field value -> occurrence count }
my %file_key  = ();    # file name    -> { concatenated record key -> record }
my $record_key_count = undef;    # NOTE(review): appears unused below - confirm
# Location of the sample record output file.
my $proof_file = 'K:\Perl Development\Test Files\J02394_proofs.csv';
# Read every input file, split each record, and index it two ways:
#   %field_key : per-field occurrence counts for each value
#   %file_key  : full record keyed by file and concatenated key
foreach my $file (@file_list){
    my $header = 1;
    # 3-arg open with a lexical handle (avoids the 2-arg form's
    # mode-injection pitfall and the bareword global handle).
    open my $infile, '<', $file
        or die "Cannot open file $file for reading: $!\n";
    while (my $record = <$infile>){
        chomp $record;
        # skip the header record of each file
        if ($header == 1){
            $header = 0;
            next;
        }
        # split the CSV record into @fields
        my @fields = parse_csv($record, ',');
        my $record_key = undef;
        # build a pipe-delimited concatenated key from the selected
        # fields, counting occurrences of each value as we go
        foreach my $field_number (@field_selection){
            $record_key .= '|' . $fields[$field_number];
            $field_key{$field_number}{$fields[$field_number]}++;
        }
        # file name -> concatenated record key -> full record
        $file_key{$file}{$record_key} = $record;
    }
    close $infile;
}
# PROOF_FILE stays a package filehandle because build_key() and
# check_key_match() print to it by name; the open is upgraded to
# the safer 3-arg form.
open (PROOF_FILE, '>', $proof_file)
    or die " cannot write to proof file $proof_file: $!\n";
my $built_key = undef;
# Working state shared with the subs below (file-scoped lexicals,
# reset at the start of every optimisation pass):
my %hash_value_to_sort = ();  # generic hash for value-sorting (see HashValueSort)
my %smallest_field = ();      # tracks the field with the smallest value count
my %tried_field = ();         # keys already tried, per field, across passes
my %exhausted = ();           # fields whose keys ran out -> global_match
# Each pass builds candidate keys until one matches a record, then
# starts a fresh pass.  build_key() calls exit() when no keys are
# left, which is how this loop terminates.  (This replaces the
# original goto-label construct with an ordinary loop.)
while (1) {
    %hash_value_to_sort = ();
    %smallest_field = ();
    %tried_field = ();
    %exhausted = ();
    my $match_result = 0;
    # keep building keys until one matches a record
    while ($match_result == 0){
        $built_key = build_key();
        $match_result = check_key_match($built_key);
    }
}
# Build one candidate match pattern by picking a key value for each
# selected field.  Fields whose values are used up contribute a
# 'global_match' placeholder instead.  Side effects: updates
# %exhausted, %tried_field and %smallest_field; prints each chosen
# key to STDOUT; closes PROOF_FILE and exits the program when no
# field has any untried values left.
# Returns the concatenated pattern, '\|' separated (literal
# backslash-pipe, so the result can be used directly as a regex).
sub build_key{
my $still_keys_left = undef;
my $appended_keys = undef;
# cycle through the selected fields
foreach my $field_number (@field_selection){
# once a field's keys are exhausted it stays on global_match
# until a successful match resets %exhausted (next pass)
if (exists $exhausted{$field_number}){
$appended_keys .= '\|' . 'global_match';
next;
}
# get a candidate key for this field and append it
my $found_key = &get_field_key($field_number);
print "$field_number:" . ${$found_key}{'key'} . "\n";
$appended_keys .= '\|' . ${$found_key}{'key'};
# a global_match return means every value for this field has
# been tried, so mark it exhausted; if it was also flagged as
# the smallest field, discard the smallest-field tracking
# entirely so another field can take over
if (${$found_key}{'key'} =~ m/ global_match /xms){
$exhausted{$field_number} = '1';
if (exists $smallest_field{$field_number}){
%smallest_field = ();
}
}
# otherwise this field still has untried values
else{
$still_keys_left = 1;
}
# remember the key tried for this field in case of multiple passes
${$tried_field{$field_number}}{${$found_key}{'key'}} = 1;
# no smallest-field bookkeeping for exhausted fields
if (exists $exhausted{$field_number}){
next ;
}
# first pass: record this field's occurrence count and flag it
# as the current smallest
if (not defined $smallest_field{'number'}){
$smallest_field{'number'} = ${$found_key}{'number'};
$smallest_field{$field_number} = '1';
}
# later fields: take over the smallest flag if this key's
# occurrence count is lower
elsif (${$found_key}{'number'} < $smallest_field{'number'}){
$smallest_field{'number'} = ${$found_key}{'number'};
$smallest_field{$field_number} = '1';
}
}
# every field exhausted: nothing more to match, so finish up and
# terminate the program here
if (not defined $still_keys_left){
close PROOF_FILE;
exit;
}
# otherwise return the assembled pattern
return $appended_keys;
}
# Pick the next candidate value for one field.
#   $field - the field number to fetch a key for
# Returns a hashref: { key => value, number => occurrence count }.
# Falls back to { key => 'global_match', number => '0' } when no
# untried value remains for the field.
sub get_field_key{
my $field = shift;
# copy this field's counts into the shared sort hash so the
# HashValueSort comparator can see them
%hash_value_to_sort = %{ $field_key{$field} };
# Walk the values ordered by HashValueSort (descending count).
# NOTE(review): the original comment claimed lowest-count-first,
# but HashValueSort sorts highest first - confirm which is intended.
CANDIDATE:
for my $candidate ( sort HashValueSort keys %hash_value_to_sort ) {
# when this field carries the smallest-count flag, skip any
# value that an earlier pass already tried
next CANDIDATE
if exists $smallest_field{$field}
&& exists ${$tried_field{$field}}{$candidate};
# first acceptable value wins
return {'key' => $candidate,
'number' => $hash_value_to_sort{$candidate}
};
}
# no valid key available (all tried or all used up):
# match anything for this field
return {'key' => 'global_match',
'number' => '0'
};
}
# Try the built pattern against every stored record key.
#   $check_key - pattern from build_key() ('\|' separated values,
#                possibly containing global_match placeholders)
# On the first matching record: deletes its component values from
# %field_key, and (unless it was a duplicate) prints it to the proof
# file and STDOUT and removes it from %file_key, then returns 1.
# Returns 0 when no file contains a matching record.
sub check_key_match{
my $check_key = shift;
# turn each global_match placeholder into a real wildcard
$check_key =~ s/\|global_match/\|\.\+/g;
FILE:
foreach my $file (@file_list){
RECORD:
foreach my $key (keys %{$file_key{$file}}){
next RECORD if $key !~ m/\A $check_key \z/xms;
# delete_keys_found() returns 0 when every component value
# was already removed, i.e. the record is a duplicate; only
# non-duplicates get printed and removed
if (&delete_keys_found($key) == 1){
print PROOF_FILE ${$file_key{$file}}{$key} . "\n";
print STDOUT ${$file_key{$file}}{$key} . "\n";
delete ${$file_key{$file}}{$key};
}
# a match was found either way
return 1;
}
}
# no match in any file
return 0;
}
# Remove the component values of a matched record key from %field_key
# so they cannot be matched again.
#   $delete_keys - concatenated record key ('|val1|val2|...')
# Returns 1 if at least one component was still present (record is
# not a duplicate), 0 if all were already deleted (duplicate - the
# caller should not print the record).
sub delete_keys_found{
my $delete_keys = shift;
my @delete_keys = split ('\|', $delete_keys);
my $found_key = 0;
# split drops trailing empty fields, and the leading '|' leaves an
# empty element at index 0, so the list must hold one element MORE
# than there are selected fields.  (The original bound of
# $#field_selection padded one short, leaving an undef lookup when
# the final field was empty.)
while ($#delete_keys < @field_selection){
push @delete_keys, '';
}
# start at 1: index 0 is the empty string before the leading '|'
my $fld_num = 1;
foreach my $field (@field_selection){
if (exists ${$field_key{$field}}{$delete_keys[$fld_num]}){
delete ${$field_key{$field}}{$delete_keys[$fld_num]};
$found_key = 1;
}
$fld_num++;
}
# 0 here means every component was a dupe
return $found_key;
}
# sort comparator: order the keys of %hash_value_to_sort by their
# values, numerically, largest count first (descending)
sub HashValueSort{
return $hash_value_to_sort{$b} <=> $hash_value_to_sort{$a};
}