use Statistics::Lite; use strict; use warnings; my @d = # I assume the order as given in the OP is not important: sort { $a <=> $b } ( 12, 14, 16, 18, 18, 20, 20, 20, 20, 20, 20, 20, 22, 24, 24, 24, 24, 30, 30, 30, 32, 35, 35, 35, 35, 35, 35, 35, 36, 40, 42, 46, 48, 48, 50, 50, 50, 50, 54, 54, 55, 56, 56, 58, 58, 60, 60, 60, 60, 63, 64, 67, 67, 68, 70, 70, 76, 80, 86, 86, 86, 90, 90, 99, 100, 100, 100, 100, 104, 105, 128, 150, 150, 154, 169, 190, 200, 200, 200, 250, 280, 291, 291, 300, 325, 325, 330, 450, 460, 550, 566, 600, 700, 750, 770, 950, 1226, 1250, 2000, 15, 22, 24 ); my %count; $count{$_}++ for @d; my @e = sort { $a <=> $b } keys %count; # an individual is an array of arrays of nums. # each element of the (top-level) array represents a cluster. # the order of all the numbers, if you were to concat all the arrays, # is strictly numeric ascending. # in a 1-d space only, it never makes sense to cluster the # numbers (1,2,3,4) as (1,3), (2,4). # the only mutation possible is shifting a number off the # beggining of one array and pushing it onto the previous array. # (and the other way). sub random_bipartition { my( $min_size, $ar ) = @_; my $sel = @$ar - (2*$min_size); my $p = $min_size + int rand( $sel ); $p > $#{$ar} ? ( [ @{$ar} ], [ ] ) : ( [ @{$ar}[0 .. $p] ], [ @{$ar}[$p+1 .. $#{$ar}] ] ) } sub Ind::new_randomized { my $nc = shift; my $n = $nc <= 2 ? 1 : $nc <= 4 ? 2 : $nc <= 8 ? 3 : 4; # 16 max my @a = ( \@e ); @a = map { random_bipartition( 1<<$n, $_ ) } @a while $n--; # deep magic :-) my $i = 0; while ( @a > $nc ) { unshift @{$a[$i+1]}, @{$a[$i]}; splice @a, $i, 1; $i++; } \@a } sub Ind::clone { my $ind = shift; [ map { [@$_] } @$ind ] } sub Ind::as_string { my $ind = shift; my $varsum; my $s; for my $cl ( @$ind ) { my $var = 0; if ( @$cl ) { my @d; push @d, ($_) x $count{$_} for @$cl; $var = int( Statistics::Lite::variance(@d)||0 ); $varsum += $var; } $s .= "$var ( @$cl )\n"; } $s .= "Total variance: $varsum\n"; $s } sub Ind::fitness { my $ind = shift; my $sum = 0; my $empty = 0; for my $cl ( @$ind ) { if ( @$cl ) { my @d; push @d, ($_) x $count{$_} for @$cl; $sum += (Statistics::Lite::variance(@d)||0); } else { $empty++; } } # harshly penalize individuals with the wrong number (too few) of clusters: $sum *= ( 1 + $empty / 10 ); 1_000_000 - $sum # convert to Larger Is Better } sub Ind::mutate { my( $ind, $n ) = @_; $n ||= 1; my $lo = int rand( @$ind - 1); if ( int rand 2 ) { # up my($to,$from) = ( $ind->[$lo+1], $ind->[$lo] ); unshift @$to, pop @$from while $n-- && @$from } else { # down my($to,$from) = ( $ind->[$lo], $ind->[$lo+1] ); push @$to, shift @$from while $n-- && @$from } $ind } # this clones an element of @pop sub clone { [ $_[0]->[0], Ind::clone($_[0]->[1]) ] } sub do_run { my $n_clusters = shift; my @pop = sort { $b->[0] <=> $a->[0] } map { [ Ind::fitness($_), $_ ] } map { Ind::new_randomized($n_clusters) } 1 .. 60; for my $gen ( 1 .. 200 ) { # kill the bottom 30: splice @pop, @pop-30, 30; # make 10 new ones: push @pop, map { [ Ind::fitness($_), $_ ] } map { Ind::new_randomized($n_clusters) } 1 .. 10; # clone the top 20: push @pop, map clone($_), @pop[0 .. 19]; # mutate the top 20: for my $e ( @pop[0 .. 19] ) { Ind::mutate( $e->[1], 1 + int rand 4 ); $e->[0] = Ind::fitness( $e->[1] ); } # sort by fitness again: @pop = sort { $b->[0] <=> $a->[0] } @pop; } $pop[0][1] # best individual } for my $nc ( 2 .. 8 ) { print "\n$nc Clusters:\n"; my $winner = do_run( $nc ); print Ind::as_string( $winner ), "\n"; }