http://www.perlmonks.org?node_id=541005


in reply to Making sense of data: Clustering OR A coding challenge

K-means is not esoteric within the clustering community; in fact, it is the first method people turn to because it is easy to implement. Unfortunately, it is also among the weakest of the adaptive clustering methods. It gets stuck in local minima easily, as this program will show if you run it multiple times:
#!/usr/bin/perl use warnings; use strict; my $num_clust = 4; # number of clusters my $tol = 0.001; # stopping tolerance # my @data = map {rand} 1..100; my @data = (0.1, 0.15, 0.3, 0.35, 0.5, 0.55, 0.7, 0.75); # initialize by choosing random points the data my @center = @data[ map {rand @data} 1..$num_clust ]; my $diff; do { $diff = 0; # Assign points to nearest center my @cluster; foreach my $point (@data) { my $closest = 0; my $dist = abs $point - $center[ $closest ]; for my $idx (1..$#center) { if (abs $point - $center[ $idx ] < $dist) { $dist = abs $point - $center[ $idx ]; $closest = $idx; } } push @cluster, [$point, $closest]; } # compute new centers foreach my $center_idx (0..$#center) { my @members = grep {$_->[1] == $center_idx} @cluster; my $sum = 0; foreach my $member (@members) { $sum += $member->[0]; } my $new_center = @members ? $sum / @members : $center[ $center +_idx ]; $diff += abs $center[ $center_idx ] - $new_center; $center[ $center_idx ] = $new_center; } } while ($diff > $tol); print "Centers are:\n"; foreach my $center_idx (0..$#center) { print "$center_idx $center[ $center_idx ]\n"; }

-Mark

Replies are listed 'Best First'.
Re^2: Making sense of data: Clustering OR A coding challenge
by mahesh557 (Novice) on Jun 07, 2016 at 19:33 UTC

    The algorithm is generating some FalsePositives, hence added Fine Tuning through iterations

    use warnings; use strict; use Data::Dumper; #my @data = map {rand} 1..100; my @dt = (1,2,3,40,40,40,40,42,43,45,80,85,90,91,91,91,91,4,9,10); my @clustercenters = getClusterCenters(3,@dt); @clustercenters = sort { $a <=> $b } @clustercenters; my ($low, $medium, $high) = @clustercenters; my %tags = ( $low => "low", $medium => "medium", $high =>"high", ); print ("\n\n $low \t$medium \t$high\n"); print "\nclosest(12): ", $tags{ closest(12, @clustercenters) }; print "\nclosest(43): ", $tags{ closest(43, @clustercenters) }; print "\n"; sub closest { my ($val,@arr) = @_; my @list = sort { abs($a - $val) <=> abs($b - $val) } @arr; return $list[0]; } sub getClusterCenters{ my ($n, @data) = @_; my $iter = 4; my @centers = (); for (1..$iter){ my @clustercenters = get1DClusterCenters($n,@data); @clustercenters = sort { $a <=> $b } @clustercenters; print "\n",join("\t", @clustercenters); my @tcenters = @clustercenters; for(my $i=0; $i <= $#clustercenters; $i++){ $centers[$i] += +$clustercenters[$i]; } } print "\n",join("\t", @centers ); @centers = map { $_ = $_ / $iter; } @centers; return @centers; } # It takes a 1D array of values and returns centers of clusters sorted sub get1DClusterCenters{ my ($num_clust, @data) = @_; my $tol = 0.001; # stopping tolerance # initialize by choosing random points the data my @center = @data[ map {rand @data} 1..$num_clust ]; my $diff; my @members; my @cluster; do { $diff = 0; # Assign points to nearest center my @cluster; foreach my $point (@data) { my $closest = 0; my $dist = abs $point - $center[ $closest ]; for my $idx (1..$#center) { if (abs $point - $center[ $idx ] < $dist) { $dist = abs $point - $center[ $idx ]; $closest = $idx; } } push @cluster, [$point, $closest]; } # compute new centers foreach my $center_idx (0..$#center) { @members = grep {$_->[1] == $center_idx} @cluster; my $sum = 0; # print "\n\n** group $center_idx \n"; foreach my $member (@members) { # print "\t ",$member->[0]; $sum += $member->[0]; } my $new_center = @members ? $sum / @members : $center[ $ce +nter_idx ]; $diff += abs $center[ $center_idx ] - $new_center; $center[ $center_idx ] = $new_center; } } while ($diff > $tol); #print "Centers are:\n"; my @cluster_means = (); foreach my $center_idx (0..$#center) { #print "\n$center_idx $center[ $center_idx ]\n"; push (@cluster_means, int($center[ $center_idx ]) ); } @cluster_means = sort { $a <=> $b } @cluster_means; # print "\nCLUSTER MEANS: ", join(",", @cluster_means); return @cluster_means; }