Limbic-Region suggested I spend time comparing that algorithm with his (above) and after consideration I think there is a much better algorithm than the one I proposed above.
On the other hand the algorithm proposed by Limbic-Region does not take full advantage of the histographic information so needs to do a binary search for each smallest deviation item, not to mention a lot of testing for best fit. This, I think can be avoided by ranking values by their distance from the mean before we begin filling the buckets.
The code for a more histographic aware version of Limbic-Region's algorithm is posted below (without correction for relative factorization). The code has been tested on Debian-Etch/Perl 5.8. For demo purposes it prints out the results of allocating 6000 items into 3 buckets for three distributions: (a) a spike with all values are at the mean (b) a symmetrical distribution, i.e. no skew and (c) a seriously skewed distribution.
I've also included a demo run for the original poster's distribution.
BTW, the algorithm below is 0(N) where N=number of items to allocate to buckets. If I'm right about Limbic-Region's strategy being provably optimal, then this problem is very far from NP complete.
use strict;
use warnings;
sub demoAllocation($$$);
demoAllocation
("Distribution: all at mean"
, {a=>1000,b=>2000,c=>3000}
, { '6.0' => 6000 }
);
demoAllocation
("Distribution: unskewed"
, {a=>1000,b=>2000,c=>3000}
, { '3.0' => 300, '4.0' => 600, '5.0' => 700, '5.5' => 900
, '6.0' => 1000
, '6.5' => 900, '7.0' => 700, '8.0' => 600, '9.0' => 300 }
);
demoAllocation
("Distribution: skewed"
, {a=>1000,b=>2000,c=>3000}
, { '3.0' => 4000, '12.0' => 2000 }
);
demoAllocation
("Distribution: Original poster"
, {A=>65, B=>12, C=>24, D=>19, E=>30}
, {'93.8' => 5, '93.81' => 20, '93.82' => 10
, '93.83' => 15, '93.84' => 25, '93.85' => 5
, '93.87'=>20, '94.0' => 5, '94.1' => 35
, '94.2'=> 10 }
);
#------------------------------------------------------------
sub demoAllocation($$$) {
my ($sDescription, $hBuckets, $hFrequency) = @_;
print "$sDescription\n";
my ($dAvg, $hAllocation) = allocate($hBuckets, $hFrequency);
foreach my $sId (sort keys %$hAllocation) {
my $hItems = $hAllocation->{$sId};
my $dSum = 0;
my $iCount = 0;
my ($dBucketAvg, $dDeviation);
print "$sId:";
foreach my $dValue (sort keys %$hItems) {
my $iFreq = $hItems->{$dValue};
printf "\t%s \@ \$%.2f\n", $iFreq, $dValue;
$dSum += $dValue*$iFreq;
$iCount += $iFreq;
}
$dBucketAvg = $dSum/$iCount;
$dDeviation = $dBucketAvg - $dAvg;
printf "\tbucket avg: \$%.2f, deviation: \$%.3f\n\n"
, $dBucketAvg, $dDeviation;
}
print "\n";
}
#------------------------------------------------------------
sub allocate($$) {
my ($hBuckets, $hFrequency) = @_;
#calculate deviations from the mean
my $dAvg=calcWeightedAvg($hFrequency);
my ($iFreqAvg, $aAbove, $aBelow)
= calcDeviations($hFrequency, $dAvg);
#sort buckets by size: smallest first
my @aBuckets = sort { $hBuckets->{$a} <=> $hBuckets->{$b}
} keys %$hBuckets;
#allocate items to buckets, smallest first
my %hAllocations;
my $iFirstAbove = 0;
my $iFirstBelow = 0;
foreach my $sId (@aBuckets) {
my $iSize = $hBuckets->{$sId};
$hAllocations{$sId} = fillBucket($iSize, $dAvg, \$iFreqAvg
, $aAbove, \$iFirstAbove
, $aBelow, \$iFirstBelow);
}
return ($dAvg, \%hAllocations);
}
#------------------------------------------------------------
# SUPPORTING FUNCTIONS - alphabetical order
#------------------------------------------------------------
sub calcDeviations($$) {
my ($hFrequency, $dAvg) = @_;
my @aAbove;
my @aBelow;
my $iFreqAvg = 0;
#calculate deviations from mean
while (my ($dValue,$iFreq) = each(%$hFrequency)) {
if ($dValue == $dAvg) {
$iFreqAvg+=$iFreq;
next;
}
my $dDeviation = $dValue - $dAvg;
if (0 < $dDeviation) {
push @aAbove, [ $dDeviation, $dValue, $iFreq ];
} else {
push @aBelow, [ -$dDeviation, $dValue, $iFreq ];
}
}
#sort with smallest deviations first
return ( $iFreqAvg
, [ sort { compareDeviations($a,$b) } @aAbove ]
, [ sort { compareDeviations($a,$b) } @aBelow ]
);
}
#------------------------------------------------------------
sub compareDeviations($$) {
my ($x, $y) = @_;
return $x->[0] <=> $y->[0];
}
#------------------------------------------------------------
sub calcWeightedAvg($) {
my $hFrequency = shift @_;
my $dSum=0;
my $iCount=0;
while (my ($dValue,$iFreq) = each(%$hFrequency)) {
$dSum+=$dValue*$iFreq;
$iCount+=$iFreq;
}
return $dSum/$iCount;
}
#------------------------------------------------------------
sub fillBucket($$$$$$$) {
my ($iNeeded, $dAvg, $rFreqAvg
, $aAbove, $rFirstAbove
, $aBelow, $rFirstBelow) = @_;
#take items that are at the mean, if we can
if ($iNeeded <= $$rFreqAvg) {
$$rFreqAvg-=$iNeeded;
return { $dAvg => $iNeeded };
}
my $hItems = {};
my $aUp = $aAbove->[$$rFirstAbove];
my $aDown = $aBelow->[$$rFirstBelow];
my $dNetDeviation = 0;
if (0 < $$rFreqAvg) {
$iNeeded -= $$rFreqAvg;
$hItems->{$dAvg} = $$rFreqAvg;
$$rFreqAvg = 0;
}
#take whatever creates the smallest net deviation
# [0] deviation
# [1] value
# [2] frequency
while ($iNeeded > 0) {
my $bUseUp = 0;
if ($aUp) {
if ($aDown) {
my $dNetUp = $dNetDeviation + $aUp->[0];
my $dNetDown = $dNetDeviation - $aDown->[0];
if (abs($dNetUp) < abs($dNetDown)) {
$bUseUp = 1;
$dNetDeviation = $dNetUp;
} else {
$bUseUp = 0;
$dNetDeviation = $dNetDown;
}
} else {
$bUseUp = 1;
}
} else {
$bUseUp = 0;
}
if ($bUseUp) {
$hItems->{$aUp->[1]} ++;
$aUp->[2]--;
$$rFirstAbove++ unless $aUp->[2];
$aUp = $aAbove->[$$rFirstAbove];
} else {
$hItems->{$aDown->[1]} ++;
$aDown->[2]--;
$$rFirstBelow++ unless $aDown->[2];
$aDown = $aBelow->[$$rFirstBelow];
}
$iNeeded--;
}
return $hItems;
}