Possibly change the permutation machine so that each "permutation" array is treated as a wheel, reducing the number of arrays needing to be created. I think the walking is the time sink; not creating the walkways.
#!/usr/bin/perl
use warnings;
use strict;
use FileHandle;
use Clone 'clone';
use Data::Compare;
$|++;
my @files = qw( File1 File2 File3 );
my $sig_flds = [ 0 .. 2 ]; # config, the fields that we care about
@$sig_flds = sort @$sig_flds;
my $answer = [];
my $freq = {};
my $input; # AoA ref
# put data into an array
for my $file (@files) {
my $fh = FileHandle->new( $file, 'r' ) or die "Cannot open $file";
scalar <$fh>; # eat header line
my $data = [<$fh>];
chomp @$data;
$_ = [ parse_csv( $_, ',' ) ] for @$data;
close $fh or die "Cannot close $file";
push @$input, @$data;
}
my $sanity_check = make_ck_list( $sig_flds, $input);
sub make_ck_list {
my ( $sig_flds, $input) = @_;
my %answer_check;
for my $rec ( @$input ) {
for my $fld ( @$sig_flds ) {
$answer_check{ $fld }{ $rec->[$fld] } = 1;
}
}
return \%answer_check;
}
print "Orig unscored input size(", scalar(@$input), "):$/";
display( $sig_flds, $input, $freq );
sample( $sig_flds, $input, $answer, $freq );
print "Solution size(", scalar(@$answer), "):$/";
display( $sig_flds, $answer, $freq );
my $answer_list = make_ck_list( $sig_flds, $answer);
print "Bad answer$/" unless Compare( $answer_list, $sanity_check );
exit;
# create a smallest possible sample with all values represented.
sub sample {
my ( $sig_flds, $input, $answer, $freq ) = @_;
local $_;
my $part_answer = [];
$input = _reduce_input( $sig_flds, $input, $part_answer, $freq );
# are we lucky/done ? return
do { @$answer = @$part_answer; return } if !defined $input;
# reassemble data w/o duplicates
@$input = ( @$input, @$part_answer );
# partition into separate problems
my $partition = partition( $sig_flds, $input );
# solve each problem, dumping all the solutions together
# process each partition
my $ct;
for my $part (@$partition) {
# print "Starting partition", ++$ct, $/;
$freq = {};
$part_answer = [];
# this does a little extra work this time
$input = _reduce_input( $sig_flds, $part, $part_answer, $freq
+);
solve_part( $sig_flds, $input, $part_answer, $freq )
if defined $input;
push @$answer, @$part_answer;
$input = [];
}
return;
}
# solve_part -- solve a partition of data by going
# through all permutations until a good solution
# is found
sub solve_part {
my ( $sig_flds, $input, $answer, $freq ) = @_;
$answer = [] if not defined $answer;
my $part_answer = $answer;
# factor to predict size of solution
my ( $best_possible_size, $worst_possible_size )
= size_answer( $sig_flds, $input, $answer, $freq );
# master needs list;
my %master_needs = %$freq;
# delete @$part_answer items from needs
for my $rec (@$part_answer) {
for my $fld (@$sig_flds) {
if ( exists $master_needs{$fld}->{ $rec->[$fld] } ) {
delete $master_needs{$fld}->{ $rec->[$fld] };
}
}
}
my $iter_perms = permute(@$input);
my $cur_best_size = $worst_possible_size;
my $best_answer;
my $total_perms = factorial( scalar @$input );
my $ct;
while ( my @perm = $iter_perms->() ) {
++$ct;
my $curr_ans;
my $needs = clone( \%master_needs );
@$curr_ans = @$part_answer;
while (@perm) {
my $curr_rec = pop @perm;
if ( useful( $sig_flds, $curr_rec, $needs ) ) {
push @$curr_ans, $curr_rec;
for my $fld (@$sig_flds) {
delete $needs->{$fld}{ $curr_rec->[$fld] };
}
}
}
if ( $ct % 1000 == 0 ) { # XXX
#print
# " Permutation: $ct/$total_perms "
# . "target size: $best_possible_size best $cur_best_si
+ze$/";
}
if ( empty( $sig_flds, $needs ) ) {
if ( @$curr_ans < $cur_best_size ) {
$best_answer = $curr_ans;
$cur_best_size = @$curr_ans;
#print "$/Permutation $ct$/";
#print "We have a new contender: ";
#print "Target size: $best_possible_size ";
#print "Contender's size: $cur_best_size$/";
if ( $cur_best_size <= $best_possible_size ) {
@$answer = @$best_answer;
return;
}
#display( $sig_flds, $best_answer, $freq );
}
}
}
@$answer = @$best_answer;
return;
}
# size_answer -- look at the data and determine:
# 1. the smallest possible size of an answer (or less if we must gues
+s),
# 2. the largest possible size of an answer (or more if we must guess
+),
# This is an optimization: if smallest is accurate, we can break off
# when one of the best solutions is found; if largest is accurate we
# can stop processing a permutation when its solution is too large.
# I haven't considered how well this can be estimated.
# , the smallest is the size of the largest set of
# data values in a significant field; and the largest is the greater
+of
# the input size or ( sum of significant field sets plus scalar @$sig_
+flds
# minus one.
#
sub size_answer {
my ( $sig_flds, $input, $answer, $freq ) = @_;
my ( $best_possible_size, $worst_possible_size );
# XXX stub
return ( 1, 1000 );
}
# _reduce_input -- Remove duplicate records and cull
# items that are easy to determine are unneeded.
# Put items that are definitely needed into @$part_answer.
# Delete culls from @$input. And build occurrence %$frequency.
sub _reduce_input {
my ( $sig_flds, $input, $part_answer, $freq ) = @_;
local $_;
# create frequency info
for my $fld (@$sig_flds) {
++$freq->{$fld}->{ $_->[$fld] } for @$input;
}
my $start;
do {
return if not defined @$input;
$start = @$input; # progress flag
# sort and remove dupes
@$input = sort { cmp_on_flds( $sig_flds, $freq, $a, $b ) } @$i
+nput;
$input = unique_on_flds( $sig_flds, $input, $freq );
my $progress;
do {
$progress = 0;
$_ = -1;
while ( ref $input && $_ < @$input - 1 ) {
$_++;
if ( moveable( $_, $sig_flds, $input, $freq ) ) {
move( $_, $sig_flds, $input, $part_answer, $freq )
+;
++$progress;
next;
}
if ( removeable( $_, $sig_flds, $input, $freq ) ) {
remove( $_, $sig_flds, $input, $part_answer, $freq
+ );
++$progress;
}
}
} while ($progress);
} while ( ref $input && $start != @$input );
return $input;
}
sub factorial {
my $n = int(shift);
my $ret = $n;
while ( $n > 1 ) {
$ret *= ( $n - 1 );
$n -= 1;
}
return $ret;
}
# return true if rec will fill any possible need in answer
sub useful {
my ( $sig_flds, $curr_rec, $needs ) = @_;
for my $fld (@$sig_flds) {
if ( exists $needs->{$fld}{ $curr_rec->[$fld] } ) {
return 1;
}
}
return;
}
# check a needs/freq hash for emptiness
sub empty {
my ( $sig_flds, $needs ) = @_;
local $_;
die "No tying" if defined tied $needs;
for (@$sig_flds) {
return 1 if "0" ne $needs->{$_};
}
return 0;
}
# move item from input to output
sub move {
my ( $rec, $sig_flds, $input, $answer, $freq ) = @_;
local $, = ' ';
# print "moving @{$input->[$rec]}$/";
for my $fld (@$sig_flds) {
$freq->{$fld}->{ $input->[$rec]->[$fld] } = 0;
}
push @$answer, $input->[$rec];
splice @$input, $rec, 1;
return;
}
# determine if item should go to output
sub moveable {
my ( $rec, $sig_flds, $input, $freq ) = @_;
# is moveable if any fld is unique
for my $fld (@$sig_flds) {
return 1 if 1 == $freq->{$fld}->{ $input->[$rec]->[$fld] };
}
return 0;
}
# remove unnecessary item from input
sub remove {
my ( $rec, $sig_flds, $input, $answer, $freq ) = @_;
local $, = ' ';
# print "removing @{$input->[$rec]}$/";
for my $fld (@$sig_flds) {
if ( $freq->{$fld}->{ $input->[$rec]->[$fld] } != 0 ) {
--$freq->{$fld}->{ $input->[$rec]->[$fld] };
}
}
splice @$input, $rec, 1;
return;
}
# determine if item is unneeded in input
sub removeable {
my ( $rec, $sig_flds, $input, $freq ) = @_;
my $flds_needed = 0;
my $removeable = 0;
my $score = 0;
# removeable if only 1 field needed and that's not unique or if
# no fields needed
for my $fld (@$sig_flds) {
++$flds_needed if $freq->{$fld}->{ $input->[$rec]->[$fld] };
$score += $freq->{$fld}->{ $input->[$rec]->[$fld] };
}
$removeable = 1 if 1 == $flds_needed || 0 == $flds_needed;
return $removeable;
}
# cmp using a \@list_of_field_indices cmp two \@arrays,
# if both records' fields have a frequency of 0 their values don't mat
+ter
sub cmp_on_flds {
my ( $sig_flds, $freq, $aa, $bb ) = @_;
my $ret;
for my $fld (@$sig_flds) {
no warnings 'uninitialized';
if ( 0 == $freq->{$fld}->{ $aa->[$fld] }
&& 0 == $freq->{$fld}->{ $bb->[$fld] } )
{
next;
}
$ret = $aa->[$fld] cmp $bb->[$fld];
return $ret if $ret;
}
return 0;
}
# delete items from a sorted \@array where \@fields_indexed are same
# and adjust frequency info
sub unique_on_flds {
my ( $sig_flds, $data, $freq ) = @_;
my ( $prev, $ret );
while (@$data) {
my $not_dupe = @$sig_flds;
my $curr = shift @$data;
$not_dupe = cmp_on_flds( $sig_flds, $freq, $curr, $prev );
$prev = $curr;
if ($not_dupe) {
push @$ret, $curr;
}
else {
# adjust frequencies
for my $fld (@$sig_flds) {
--$freq->{$fld}->{ $curr->[$fld] };
}
}
}
return $ret;
}
# display scored records
sub display {
my ( $sig_flds, $aref, $freq ) = @_;
local $_;
local $" = ' ';
for my $el (@$aref) {
no warnings 'uninitialized';
for (@$sig_flds) {
printf "%4i %-6s ", $freq->{$_}->{ $el->[$_] }, $el->[$_];
}
print " @$el";
print $/;
}
print $/;
return;
}
# this is from perlmonks.org!Kraythorne to parse input files
sub parse_csv {
my $text = shift;
my $delim = shift;
# not used my $type = undef;
if ( $delim && $delim =~ /comma|tab|pipe|fixed/i ) {
(
# $type,
undef, $delim ) = $delim =~ m/\A (.+) \s - \s (.+) \z/xms;
}
my @new = ();
if ($delim) {
while (
$text =~ m{
# the first part groups the phrase inside the quotes.
# see explanation of this pattern in MRE
"([^\"\\]*(?:\\.[^\"\\]*)*)"$delim?
| ([^$delim]+)$delim?
| $delim
}gx
)
{
if ( defined $+ ) {
push( @new, $+ );
}
else {
push( @new, '' );
}
}
push( @new, '' ) if substr( $text, -1, 1 ) eq ($delim);
}
else {
$new[0] = $text;
}
return @new; # list of values that were comma-separated
}
# permute -- generate an iter yielding all the permutations
# of a list, basically lifted from "Higher Order Perl",
# M.J. Dominus, pp 134-135
sub permute {
my @data = @_;
my $perm = 0;
return sub {
do { $perm++; return @data } if !$perm;
my $i;
my $p = $perm;
for ( $i = 1; $i <= @data && $p % $i == 0; $i++ ) {
$p /= $i;
}
my $d = $p % $i;
my $j = @data - $i;
return if $j < 0;
@data[ $j + 1 .. $#data ] = reverse @data[ $j + 1 .. $#data ];
@data[ $j, $j + $d ] = @data[ $j + $d, $j ];
$perm++;
return @data;
};
}
# partition -- split an array so that each partition shares
# no values per significant field w/ another partition
# example:
# part 1: ( [a,a,a],[b,a,a],[b,c,d],[e,f,d] )
# part 2: ( [f,g,g],[x,g,x],[y,z,x],[z,z,z] )
#
sub partition {
my ( $sig_flds, $input ) = @_;
my $ret;
while (@$input) {
my $part = [];
my $part_has = {};
my $curr = pop @$input;
_add_rec2part( $sig_flds, $curr, $part, $part_has );
my $remainder = $input;
my $next_remainder = [];
while (@$remainder) {
$curr = pop @$remainder;
if ( _part_needs_rec( $sig_flds, $curr, $part, $part_has )
+ ) {
_add_rec2part( $sig_flds, $curr, $part, $part_has );
}
else {
push @$next_remainder, $curr;
}
}
push @$ret, $part;
@$remainder = @$next_remainder;
}
return $ret;
}
# boolean, does the partition need this record
sub _part_needs_rec {
my ( $sig_flds, $rec, $part, $freq ) = @_;
for my $f (@$sig_flds) {
if ( $freq->{$f}->{ $rec->[$f] } ) {
return 1;
}
}
return;
}
# add record to partition and adjust partition's needs hash
sub _add_rec2part {
my ( $sig_flds, $rec, $part, $freq ) = @_;
push @$part, $rec;
for my $fld (@$sig_flds) {
$freq->{$fld}->{ $rec->[$fld] } = 1;
}
return;
}
Be well,