#!/usr/bin/perl use warnings; use strict; use FileHandle; use Clone 'clone'; use Data::Compare; $|++; my @files = qw( File1 File2 File3 ); my $sig_flds = [ 0 .. 2 ]; # config, the fields that we care about @$sig_flds = sort @$sig_flds; my $answer = []; my $freq = {}; my $input; # AoA ref # put data into an array for my $file (@files) { my $fh = FileHandle->new( $file, 'r' ) or die "Cannot open $file"; scalar <$fh>; # eat header line my $data = [<$fh>]; chomp @$data; $_ = [ parse_csv( $_, ',' ) ] for @$data; close $fh or die "Cannot close $file"; push @$input, @$data; } my $sanity_check = make_ck_list( $sig_flds, $input); sub make_ck_list { my ( $sig_flds, $input) = @_; my %answer_check; for my $rec ( @$input ) { for my $fld ( @$sig_flds ) { $answer_check{ $fld }{ $rec->[$fld] } = 1; } } return \%answer_check; } print "Orig unscored input size(", scalar(@$input), "):$/"; display( $sig_flds, $input, $freq ); sample( $sig_flds, $input, $answer, $freq ); print "Solution size(", scalar(@$answer), "):$/"; display( $sig_flds, $answer, $freq ); my $answer_list = make_ck_list( $sig_flds, $answer); print "Bad answer$/" unless Compare( $answer_list, $sanity_check ); exit; # create a smallest possible sample with all values represented. sub sample { my ( $sig_flds, $input, $answer, $freq ) = @_; local $_; my $part_answer = []; $input = _reduce_input( $sig_flds, $input, $part_answer, $freq ); # are we lucky/done ? return do { @$answer = @$part_answer; return } if !defined $input; # reassemble data w/o duplicates @$input = ( @$input, @$part_answer ); # partition into separate problems my $partition = partition( $sig_flds, $input ); # solve each problem, dumping all the solutions together # process each partition my $ct; for my $part (@$partition) { # print "Starting partition", ++$ct, $/; $freq = {}; $part_answer = []; # this does a little extra work this time $input = _reduce_input( $sig_flds, $part, $part_answer, $freq ); solve_part( $sig_flds, $input, $part_answer, $freq ) if defined $input; push @$answer, @$part_answer; $input = []; } return; } # solve_part -- solve a partition of data by going # through all permutations until a good solution # is found sub solve_part { my ( $sig_flds, $input, $answer, $freq ) = @_; $answer = [] if not defined $answer; my $part_answer = $answer; # factor to predict size of solution my ( $best_possible_size, $worst_possible_size ) = size_answer( $sig_flds, $input, $answer, $freq ); # master needs list; my %master_needs = %$freq; # delete @$part_answer items from needs for my $rec (@$part_answer) { for my $fld (@$sig_flds) { if ( exists $master_needs{$fld}->{ $rec->[$fld] } ) { delete $master_needs{$fld}->{ $rec->[$fld] }; } } } my $iter_perms = permute(@$input); my $cur_best_size = $worst_possible_size; my $best_answer; my $total_perms = factorial( scalar @$input ); my $ct; while ( my @perm = $iter_perms->() ) { ++$ct; my $curr_ans; my $needs = clone( \%master_needs ); @$curr_ans = @$part_answer; while (@perm) { my $curr_rec = pop @perm; if ( useful( $sig_flds, $curr_rec, $needs ) ) { push @$curr_ans, $curr_rec; for my $fld (@$sig_flds) { delete $needs->{$fld}{ $curr_rec->[$fld] }; } } } if ( $ct % 1000 == 0 ) { # XXX #print # " Permutation: $ct/$total_perms " # . "target size: $best_possible_size best $cur_best_size$/"; } if ( empty( $sig_flds, $needs ) ) { if ( @$curr_ans < $cur_best_size ) { $best_answer = $curr_ans; $cur_best_size = @$curr_ans; #print "$/Permutation $ct$/"; #print "We have a new contender: "; #print "Target size: $best_possible_size "; #print "Contender's size: $cur_best_size$/"; if ( $cur_best_size <= $best_possible_size ) { @$answer = @$best_answer; return; } #display( $sig_flds, $best_answer, $freq ); } } } @$answer = @$best_answer; return; } # size_answer -- look at the data and determine: # 1. the smallest possible size of an answer (or less if we must guess), # 2. the largest possible size of an answer (or more if we must guess), # This is an optimization: if smallest is accurate, we can break off # when one of the best solutions is found; if largest is accurate we # can stop processing a permutation when its solution is too large. # I haven't considered how well this can be estimated. # , the smallest is the size of the largest set of # data values in a significant field; and the largest is the greater of # the input size or ( sum of significant field sets plus scalar @$sig_flds # minus one. # sub size_answer { my ( $sig_flds, $input, $answer, $freq ) = @_; my ( $best_possible_size, $worst_possible_size ); # XXX stub return ( 1, 1000 ); } # _reduce_input -- Remove duplicate records and cull # items that are easy to determine are unneeded. # Put items that are definitely needed into @$part_answer. # Delete culls from @$input. And build occurrence %$frequency. sub _reduce_input { my ( $sig_flds, $input, $part_answer, $freq ) = @_; local $_; # create frequency info for my $fld (@$sig_flds) { ++$freq->{$fld}->{ $_->[$fld] } for @$input; } my $start; do { return if not defined @$input; $start = @$input; # progress flag # sort and remove dupes @$input = sort { cmp_on_flds( $sig_flds, $freq, $a, $b ) } @$input; $input = unique_on_flds( $sig_flds, $input, $freq ); my $progress; do { $progress = 0; $_ = -1; while ( ref $input && $_ < @$input - 1 ) { $_++; if ( moveable( $_, $sig_flds, $input, $freq ) ) { move( $_, $sig_flds, $input, $part_answer, $freq ); ++$progress; next; } if ( removeable( $_, $sig_flds, $input, $freq ) ) { remove( $_, $sig_flds, $input, $part_answer, $freq ); ++$progress; } } } while ($progress); } while ( ref $input && $start != @$input ); return $input; } sub factorial { my $n = int(shift); my $ret = $n; while ( $n > 1 ) { $ret *= ( $n - 1 ); $n -= 1; } return $ret; } # return true if rec will fill any possible need in answer sub useful { my ( $sig_flds, $curr_rec, $needs ) = @_; for my $fld (@$sig_flds) { if ( exists $needs->{$fld}{ $curr_rec->[$fld] } ) { return 1; } } return; } # check a needs/freq hash for emptiness sub empty { my ( $sig_flds, $needs ) = @_; local $_; die "No tying" if defined tied $needs; for (@$sig_flds) { return 1 if "0" ne $needs->{$_}; } return 0; } # move item from input to output sub move { my ( $rec, $sig_flds, $input, $answer, $freq ) = @_; local $, = ' '; # print "moving @{$input->[$rec]}$/"; for my $fld (@$sig_flds) { $freq->{$fld}->{ $input->[$rec]->[$fld] } = 0; } push @$answer, $input->[$rec]; splice @$input, $rec, 1; return; } # determine if item should go to output sub moveable { my ( $rec, $sig_flds, $input, $freq ) = @_; # is moveable if any fld is unique for my $fld (@$sig_flds) { return 1 if 1 == $freq->{$fld}->{ $input->[$rec]->[$fld] }; } return 0; } # remove unnecessary item from input sub remove { my ( $rec, $sig_flds, $input, $answer, $freq ) = @_; local $, = ' '; # print "removing @{$input->[$rec]}$/"; for my $fld (@$sig_flds) { if ( $freq->{$fld}->{ $input->[$rec]->[$fld] } != 0 ) { --$freq->{$fld}->{ $input->[$rec]->[$fld] }; } } splice @$input, $rec, 1; return; } # determine if item is unneeded in input sub removeable { my ( $rec, $sig_flds, $input, $freq ) = @_; my $flds_needed = 0; my $removeable = 0; my $score = 0; # removeable if only 1 field needed and that's not unique or if # no fields needed for my $fld (@$sig_flds) { ++$flds_needed if $freq->{$fld}->{ $input->[$rec]->[$fld] }; $score += $freq->{$fld}->{ $input->[$rec]->[$fld] }; } $removeable = 1 if 1 == $flds_needed || 0 == $flds_needed; return $removeable; } # cmp using a \@list_of_field_indices cmp two \@arrays, # if both records' fields have a frequency of 0 their values don't matter sub cmp_on_flds { my ( $sig_flds, $freq, $aa, $bb ) = @_; my $ret; for my $fld (@$sig_flds) { no warnings 'uninitialized'; if ( 0 == $freq->{$fld}->{ $aa->[$fld] } && 0 == $freq->{$fld}->{ $bb->[$fld] } ) { next; } $ret = $aa->[$fld] cmp $bb->[$fld]; return $ret if $ret; } return 0; } # delete items from a sorted \@array where \@fields_indexed are same # and adjust frequency info sub unique_on_flds { my ( $sig_flds, $data, $freq ) = @_; my ( $prev, $ret ); while (@$data) { my $not_dupe = @$sig_flds; my $curr = shift @$data; $not_dupe = cmp_on_flds( $sig_flds, $freq, $curr, $prev ); $prev = $curr; if ($not_dupe) { push @$ret, $curr; } else { # adjust frequencies for my $fld (@$sig_flds) { --$freq->{$fld}->{ $curr->[$fld] }; } } } return $ret; } # display scored records sub display { my ( $sig_flds, $aref, $freq ) = @_; local $_; local $" = ' '; for my $el (@$aref) { no warnings 'uninitialized'; for (@$sig_flds) { printf "%4i %-6s ", $freq->{$_}->{ $el->[$_] }, $el->[$_]; } print " @$el"; print $/; } print $/; return; } # this is from perlmonks.org!Kraythorne to parse input files sub parse_csv { my $text = shift; my $delim = shift; # not used my $type = undef; if ( $delim && $delim =~ /comma|tab|pipe|fixed/i ) { ( # $type, undef, $delim ) = $delim =~ m/\A (.+) \s - \s (.+) \z/xms; } my @new = (); if ($delim) { while ( $text =~ m{ # the first part groups the phrase inside the quotes. # see explanation of this pattern in MRE "([^\"\\]*(?:\\.[^\"\\]*)*)"$delim? | ([^$delim]+)$delim? | $delim }gx ) { if ( defined $+ ) { push( @new, $+ ); } else { push( @new, '' ); } } push( @new, '' ) if substr( $text, -1, 1 ) eq ($delim); } else { $new[0] = $text; } return @new; # list of values that were comma-separated } # permute -- generate an iter yielding all the permutations # of a list, basically lifted from "Higher Order Perl", # M.J. Dominus, pp 134-135 sub permute { my @data = @_; my $perm = 0; return sub { do { $perm++; return @data } if !$perm; my $i; my $p = $perm; for ( $i = 1; $i <= @data && $p % $i == 0; $i++ ) { $p /= $i; } my $d = $p % $i; my $j = @data - $i; return if $j < 0; @data[ $j + 1 .. $#data ] = reverse @data[ $j + 1 .. $#data ]; @data[ $j, $j + $d ] = @data[ $j + $d, $j ]; $perm++; return @data; }; } # partition -- split an array so that each partition shares # no values per significant field w/ another partition # example: # part 1: ( [a,a,a],[b,a,a],[b,c,d],[e,f,d] ) # part 2: ( [f,g,g],[x,g,x],[y,z,x],[z,z,z] ) # sub partition { my ( $sig_flds, $input ) = @_; my $ret; while (@$input) { my $part = []; my $part_has = {}; my $curr = pop @$input; _add_rec2part( $sig_flds, $curr, $part, $part_has ); my $remainder = $input; my $next_remainder = []; while (@$remainder) { $curr = pop @$remainder; if ( _part_needs_rec( $sig_flds, $curr, $part, $part_has ) ) { _add_rec2part( $sig_flds, $curr, $part, $part_has ); } else { push @$next_remainder, $curr; } } push @$ret, $part; @$remainder = @$next_remainder; } return $ret; } # boolean, does the partition need this record sub _part_needs_rec { my ( $sig_flds, $rec, $part, $freq ) = @_; for my $f (@$sig_flds) { if ( $freq->{$f}->{ $rec->[$f] } ) { return 1; } } return; } # add record to partition and adjust partition's needs hash sub _add_rec2part { my ( $sig_flds, $rec, $part, $freq ) = @_; push @$part, $rec; for my $fld (@$sig_flds) { $freq->{$fld}->{ $rec->[$fld] } = 1; } return; }