use Statistics::Lite; use strict; use warnings; my @d = # I assume the order as given in the OP is not important: sort { \$a <=> \$b } ( 12, 14, 16, 18, 18, 20, 20, 20, 20, 20, 20, 20, 22, 24, 24, 24, 24, 30, 30, 30, 32, 35, 35, 35, 35, 35, 35, 35, 36, 40, 42, 46, 48, 48, 50, 50, 50, 50, 54, 54, 55, 56, 56, 58, 58, 60, 60, 60, 60, 63, 64, 67, 67, 68, 70, 70, 76, 80, 86, 86, 86, 90, 90, 99, 100, 100, 100, 100, 104, 105, 128, 150, 150, 154, 169, 190, 200, 200, 200, 250, 280, 291, 291, 300, 325, 325, 330, 450, 460, 550, 566, 600, 700, 750, 770, 950, 1226, 1250, 2000, 15, 22, 24 ); my %count; \$count{\$_}++ for @d; my @e = sort { \$a <=> \$b } keys %count; # an individual is an array of arrays of nums. # each element of the (top-level) array represents a cluster. # the order of all the numbers, if you were to concat all the arrays, # is strictly numeric ascending. # in a 1-d space only, it never makes sense to cluster the # numbers (1,2,3,4) as (1,3), (2,4). # the only mutation possible is shifting a number off the # beggining of one array and pushing it onto the previous array. # (and the other way). sub random_bipartition { my( \$min_size, \$ar ) = @_; my \$sel = @\$ar - (2*\$min_size); my \$p = \$min_size + int rand( \$sel ); \$p > \$#{\$ar} ? ( [ @{\$ar} ], [ ] ) : ( [ @{\$ar}[0 .. \$p] ], [ @{\$ar}[\$p+1 .. \$#{\$ar}] ] ) } sub Ind::new_randomized { my \$nc = shift; my \$n = \$nc <= 2 ? 1 : \$nc <= 4 ? 2 : \$nc <= 8 ? 3 : 4; # 16 max my @a = ( \@e ); @a = map { random_bipartition( 1<<\$n, \$_ ) } @a while \$n--; # deep magic :-) my \$i = 0; while ( @a > \$nc ) { unshift @{\$a[\$i+1]}, @{\$a[\$i]}; splice @a, \$i, 1; \$i++; } \@a } sub Ind::clone { my \$ind = shift; [ map { [@\$_] } @\$ind ] } sub Ind::as_string { my \$ind = shift; my \$varsum; my \$s; for my \$cl ( @\$ind ) { my \$var = 0; if ( @\$cl ) { my @d; push @d, (\$_) x \$count{\$_} for @\$cl; \$var = int( Statistics::Lite::variance(@d)||0 ); \$varsum += \$var; } \$s .= "\$var ( @\$cl )\n"; } \$s .= "Total variance: \$varsum\n"; \$s } sub Ind::fitness { my \$ind = shift; my \$sum = 0; my \$empty = 0; for my \$cl ( @\$ind ) { if ( @\$cl ) { my @d; push @d, (\$_) x \$count{\$_} for @\$cl; \$sum += (Statistics::Lite::variance(@d)||0); } else { \$empty++; } } # harshly penalize individuals with the wrong number (too few) of clusters: \$sum *= ( 1 + \$empty / 10 ); 1_000_000 - \$sum # convert to Larger Is Better } sub Ind::mutate { my( \$ind, \$n ) = @_; \$n ||= 1; my \$lo = int rand( @\$ind - 1); if ( int rand 2 ) { # up my(\$to,\$from) = ( \$ind->[\$lo+1], \$ind->[\$lo] ); unshift @\$to, pop @\$from while \$n-- && @\$from } else { # down my(\$to,\$from) = ( \$ind->[\$lo], \$ind->[\$lo+1] ); push @\$to, shift @\$from while \$n-- && @\$from } \$ind } # this clones an element of @pop sub clone { [ \$_[0]->[0], Ind::clone(\$_[0]->[1]) ] } sub do_run { my \$n_clusters = shift; my @pop = sort { \$b->[0] <=> \$a->[0] } map { [ Ind::fitness(\$_), \$_ ] } map { Ind::new_randomized(\$n_clusters) } 1 .. 60; for my \$gen ( 1 .. 200 ) { # kill the bottom 30: splice @pop, @pop-30, 30; # make 10 new ones: push @pop, map { [ Ind::fitness(\$_), \$_ ] } map { Ind::new_randomized(\$n_clusters) } 1 .. 10; # clone the top 20: push @pop, map clone(\$_), @pop[0 .. 19]; # mutate the top 20: for my \$e ( @pop[0 .. 19] ) { Ind::mutate( \$e->[1], 1 + int rand 4 ); \$e->[0] = Ind::fitness( \$e->[1] ); } # sort by fitness again: @pop = sort { \$b->[0] <=> \$a->[0] } @pop; } \$pop[0][1] # best individual } for my \$nc ( 2 .. 8 ) { print "\n\$nc Clusters:\n"; my \$winner = do_run( \$nc ); print Ind::as_string( \$winner ), "\n"; }