#!/usr/bin/perl
use warnings;
use strict;
use FileHandle;
use Clone 'clone';
use Data::Compare;
$|++;

my @files = qw( File1 File2 File3 );

my $sig_flds = [ 0 .. 2 ];  # config, the fields that we care about

@$sig_flds = sort @$sig_flds;

my $answer = [];
my $freq   = {};


my $input;                  # AoA ref

# put data into an array
for my $file (@files) {
    my $fh = FileHandle->new( $file, 'r' ) or die "Cannot open $file";
    scalar <$fh>;           # eat header line
    my $data = [<$fh>];
    chomp @$data;
    $_ = [ parse_csv( $_, ',' ) ] for @$data;
    close $fh or die "Cannot close $file";
    push @$input, @$data;
}

my $sanity_check = make_ck_list( $sig_flds, $input);

sub make_ck_list {
    my ( $sig_flds, $input) = @_;
    my %answer_check;

    for my $rec ( @$input ) {
        for my $fld ( @$sig_flds ) {
            $answer_check{ $fld }{ $rec->[$fld] } = 1;
        }
    }
    return \%answer_check;
}

print "Orig unscored input size(", scalar(@$input), "):$/";
display( $sig_flds, $input, $freq );

sample( $sig_flds, $input, $answer, $freq );

print "Solution size(", scalar(@$answer), "):$/";
display( $sig_flds, $answer, $freq );

my $answer_list = make_ck_list( $sig_flds, $answer);

print "Bad answer$/" unless Compare( $answer_list, $sanity_check );


exit;


# create a smallest possible sample with all values represented.
sub sample {
    my ( $sig_flds, $input, $answer, $freq ) = @_;
    local $_;
    my $part_answer = [];

    $input = _reduce_input( $sig_flds, $input, $part_answer, $freq );

    # are we lucky/done ? return
    do { @$answer = @$part_answer; return } if !defined $input;

    # reassemble data w/o duplicates
    @$input = ( @$input, @$part_answer );

    # partition into separate problems
    my $partition = partition( $sig_flds, $input );

    # solve each problem, dumping all the solutions together

    # process each partition
    my $ct;
    for my $part (@$partition) {
        # print "Starting partition", ++$ct, $/;
        $freq        = {};
        $part_answer = [];

        # this does a little extra work this time
        $input = _reduce_input( $sig_flds, $part, $part_answer, $freq );

        solve_part( $sig_flds, $input, $part_answer, $freq )
            if defined $input;
        push @$answer, @$part_answer;
        $input = [];
    }
    return;
}

# solve_part -- solve a partition of data by going
#   through all permutations until a good solution
#   is found
sub solve_part {
    my ( $sig_flds, $input, $answer, $freq ) = @_;

    $answer = [] if not defined $answer;
    my $part_answer = $answer;

    # factor to predict size of solution
    my ( $best_possible_size, $worst_possible_size )
        = size_answer( $sig_flds, $input, $answer, $freq );

    # master needs list;
    my %master_needs = %$freq;

    # delete @$part_answer items from needs
    for my $rec (@$part_answer) {
        for my $fld (@$sig_flds) {
            if ( exists $master_needs{$fld}->{ $rec->[$fld] } ) {
                delete $master_needs{$fld}->{ $rec->[$fld] };
            }
        }
    }

    my $iter_perms = permute(@$input);

    my $cur_best_size = $worst_possible_size;
    my $best_answer;

    my $total_perms = factorial( scalar @$input );

    my $ct;
    while ( my @perm = $iter_perms->() ) {
        ++$ct;

        my $curr_ans;
        my $needs = clone( \%master_needs );

        @$curr_ans = @$part_answer;

        while (@perm) {


            my $curr_rec = pop @perm;

            if ( useful( $sig_flds, $curr_rec, $needs ) ) {
                push @$curr_ans, $curr_rec;
                for my $fld (@$sig_flds) {
                    delete $needs->{$fld}{ $curr_rec->[$fld] };
                }
            }
        }
        if ( $ct % 1000 == 0 ) {  # XXX
            #print
            #    " Permutation: $ct/$total_perms "
            #    . "target size: $best_possible_size best $cur_best_size$/";
        }
        if ( empty( $sig_flds, $needs ) ) {
            if ( @$curr_ans < $cur_best_size ) {
                $best_answer   = $curr_ans;
                $cur_best_size = @$curr_ans;
                #print "$/Permutation $ct$/";
                #print "We have a new contender: ";
                #print "Target size: $best_possible_size  ";
                #print "Contender's size: $cur_best_size$/";

                if ( $cur_best_size <= $best_possible_size ) {
                    @$answer = @$best_answer;
                    return;
                }
                #display( $sig_flds, $best_answer, $freq );
            }
        }
    }
    @$answer = @$best_answer;
    return;
}


# size_answer -- look at the data and determine:
#  1. the smallest possible size of an answer (or less if we must guess),
#  2. the largest possible size of an answer (or more if we must guess),
#  This is an optimization: if smallest is accurate, we can break off
#  when one of the best solutions is found; if largest is accurate we
#  can stop processing a permutation when its solution is too large.
#  I haven't considered how well this can be estimated.
#  , the smallest is the size of the largest set of
#  data values in a significant field; and the largest is the greater of
# the input size or ( sum of significant field sets plus scalar @$sig_flds
# minus one.
#
sub size_answer {
    my ( $sig_flds, $input, $answer, $freq ) = @_;
    my ( $best_possible_size, $worst_possible_size );

    # XXX stub
    return ( 1, 1000 );

}

# _reduce_input -- Remove duplicate records and cull
#   items that are easy to determine are unneeded.
#   Put items that are definitely needed into @$part_answer.
#   Delete culls from @$input.  And build occurrence %$frequency.
sub _reduce_input {
    my ( $sig_flds, $input, $part_answer, $freq ) = @_;
    local $_;

    # create frequency info
    for my $fld (@$sig_flds) {
        ++$freq->{$fld}->{ $_->[$fld] } for @$input;
    }

    my $start;
    do {
        return if not defined @$input;
        $start = @$input;    # progress flag

        # sort and remove dupes
        @$input = sort { cmp_on_flds( $sig_flds, $freq, $a, $b ) } @$input;
        $input = unique_on_flds( $sig_flds, $input, $freq );

        my $progress;
        do {
            $progress = 0;
            $_        = -1;
            while ( ref $input && $_ < @$input - 1 ) {
                $_++;
                if ( moveable( $_, $sig_flds, $input, $freq ) ) {
                    move( $_, $sig_flds, $input, $part_answer, $freq );
                    ++$progress;
                    next;
                }
                if ( removeable( $_, $sig_flds, $input, $freq ) ) {
                    remove( $_, $sig_flds, $input, $part_answer, $freq );
                    ++$progress;
                }
            }

        } while ($progress);

    } while ( ref $input && $start != @$input );

    return $input;
}

sub factorial {
    my $n   = int(shift);
    my $ret = $n;
    while ( $n > 1 ) {
        $ret *= ( $n - 1 );
        $n -= 1;
    }
    return $ret;
}

# return true if rec will fill any possible need in answer
sub useful {
    my ( $sig_flds, $curr_rec, $needs ) = @_;
    for my $fld (@$sig_flds) {
        if ( exists $needs->{$fld}{ $curr_rec->[$fld] } ) {
            return 1;
        }
    }
    return;
}

# check a needs/freq hash for emptiness
sub empty {
    my ( $sig_flds, $needs ) = @_;
    local $_;
    die "No tying" if defined tied $needs;
    for (@$sig_flds) {
        return 1 if "0" ne $needs->{$_};
    }
    return 0;
}

# move item from input to output
sub move {
    my ( $rec, $sig_flds, $input, $answer, $freq ) = @_;

    local $, = ' ';

    # print "moving  @{$input->[$rec]}$/";

    for my $fld (@$sig_flds) {
        $freq->{$fld}->{ $input->[$rec]->[$fld] } = 0;
    }
    push @$answer, $input->[$rec];
    splice @$input, $rec, 1;
    return;
}
# determine if item should go to output
sub moveable {
    my ( $rec, $sig_flds, $input, $freq ) = @_;

    # is moveable if any fld is unique
    for my $fld (@$sig_flds) {
        return 1 if 1 == $freq->{$fld}->{ $input->[$rec]->[$fld] };
    }
    return 0;
}

# remove unnecessary item from input
sub remove {
    my ( $rec, $sig_flds, $input, $answer, $freq ) = @_;

    local $, = ' ';

    # print "removing  @{$input->[$rec]}$/";

    for my $fld (@$sig_flds) {
        if ( $freq->{$fld}->{ $input->[$rec]->[$fld] } != 0 ) {
            --$freq->{$fld}->{ $input->[$rec]->[$fld] };
        }
    }

    splice @$input, $rec, 1;
    return;
}

# determine if item is unneeded in input
sub removeable {
    my ( $rec, $sig_flds, $input, $freq ) = @_;
    my $flds_needed = 0;
    my $removeable  = 0;
    my $score       = 0;

    # removeable if only 1 field needed and that's not unique or if
    # no fields needed
    for my $fld (@$sig_flds) {
        ++$flds_needed if $freq->{$fld}->{ $input->[$rec]->[$fld] };
        $score += $freq->{$fld}->{ $input->[$rec]->[$fld] };
    }
    $removeable = 1 if 1 == $flds_needed || 0 == $flds_needed;

    return $removeable;
}

# cmp using a \@list_of_field_indices cmp two \@arrays,
# if both records' fields have a frequency of 0 their values don't matter
sub cmp_on_flds {
    my ( $sig_flds, $freq, $aa, $bb ) = @_;

    my $ret;
    for my $fld (@$sig_flds) {
        no warnings 'uninitialized';
        if (   0 == $freq->{$fld}->{ $aa->[$fld] }
            && 0 == $freq->{$fld}->{ $bb->[$fld] } )
        {
            next;
        }
        $ret = $aa->[$fld] cmp $bb->[$fld];
        return $ret if $ret;
    }
    return 0;
}

# delete items from a sorted \@array where \@fields_indexed are same
# and adjust frequency info
sub unique_on_flds {
    my ( $sig_flds, $data, $freq ) = @_;
    my ( $prev,     $ret );

    while (@$data) {
        my $not_dupe = @$sig_flds;
        my $curr     = shift @$data;
        $not_dupe = cmp_on_flds( $sig_flds, $freq, $curr, $prev );

        $prev = $curr;
        if ($not_dupe) {
            push @$ret, $curr;
        }
        else {

            # adjust frequencies
            for my $fld (@$sig_flds) {
                --$freq->{$fld}->{ $curr->[$fld] };
            }
        }
    }
    return $ret;
}

# display scored records
sub display {
    my ( $sig_flds, $aref, $freq ) = @_;
    local $_;
    local $" = ' ';
    for my $el (@$aref) {
        no warnings 'uninitialized';
        for (@$sig_flds) {
            printf "%4i %-6s ", $freq->{$_}->{ $el->[$_] }, $el->[$_];
        }
        print " @$el";
        print $/;
    }
    print $/;
    return;
}

# this is from perlmonks.org!Kraythorne to parse input files
sub parse_csv {

    my $text  = shift;
    my $delim = shift;

    # not used    my $type  = undef;
    if ( $delim && $delim =~ /comma|tab|pipe|fixed/i ) {
        (

            # $type,
            undef, $delim ) = $delim =~ m/\A (.+) \s - \s (.+) \z/xms;
    }

    my @new = ();
    if ($delim) {
        while (
            $text =~ m{
        # the first part groups the phrase inside the quotes.
        # see explanation of this pattern in MRE
        "([^\"\\]*(?:\\.[^\"\\]*)*)"$delim?
           |  ([^$delim]+)$delim?
           | $delim
        }gx
            )
        {
            if ( defined $+ ) {
                push( @new, $+ );
            }
            else {
                push( @new, '' );
            }
        }
        push( @new, '' ) if substr( $text, -1, 1 ) eq ($delim);
    }
    else {
        $new[0] = $text;
    }
    return @new;    # list of values that were comma-separated
}

# permute -- generate an iter yielding all the permutations
# of a list, basically lifted from "Higher Order Perl",
# M.J. Dominus, pp 134-135

sub permute {
    my @data = @_;
    my $perm = 0;
    return sub {
        do { $perm++; return @data } if !$perm;
        my $i;
        my $p = $perm;
        for ( $i = 1; $i <= @data && $p % $i == 0; $i++ ) {
            $p /= $i;
        }
        my $d = $p % $i;
        my $j = @data - $i;
        return if $j < 0;

        @data[ $j + 1 .. $#data ] = reverse @data[ $j + 1 .. $#data ];
        @data[ $j, $j + $d ] = @data[ $j + $d, $j ];

        $perm++;
        return @data;
    };
}


# partition -- split an array so that each partition shares
#      no values per significant field w/ another partition
#      example:
#         part 1: ( [a,a,a],[b,a,a],[b,c,d],[e,f,d] )
#         part 2: ( [f,g,g],[x,g,x],[y,z,x],[z,z,z] )
#
sub partition {
    my ( $sig_flds, $input ) = @_;
    my $ret;

    while (@$input) {
        my $part     = [];
        my $part_has = {};
        my $curr     = pop @$input;

        _add_rec2part( $sig_flds, $curr, $part, $part_has );

        my $remainder      = $input;
        my $next_remainder = [];

        while (@$remainder) {
            $curr = pop @$remainder;

            if ( _part_needs_rec( $sig_flds, $curr, $part, $part_has ) ) {

                _add_rec2part( $sig_flds, $curr, $part, $part_has );
            }
            else {
                push @$next_remainder, $curr;
            }
        }
        push @$ret, $part;
        @$remainder = @$next_remainder;
    }
    return $ret;
}

# boolean, does the partition need this record
sub _part_needs_rec {
    my ( $sig_flds, $rec, $part, $freq ) = @_;

    for my $f (@$sig_flds) {
        if ( $freq->{$f}->{ $rec->[$f] } ) {
            return 1;
        }
    }
    return;
}

# add record to partition and adjust partition's needs hash
sub _add_rec2part {
    my ( $sig_flds, $rec, $part, $freq ) = @_;

    push @$part, $rec;

    for my $fld (@$sig_flds) {
        $freq->{$fld}->{ $rec->[$fld] } = 1;
    }
    return;
}