in reply to
Making sense of data: Clustering OR A coding challenge
K-means is not esoteric within the clustering community; in fact, it is the first method people turn to because it is easy to implement. Unfortunately, it is also among the weakest of the adaptive clustering methods. It gets stuck in local minima easily, as this program will show if you run it multiple times:
#!/usr/bin/perl
use warnings;
use strict;
my $num_clust = 4; # number of clusters
my $tol = 0.001; # stopping tolerance
# my @data = map {rand} 1..100;
my @data = (0.1, 0.15, 0.3, 0.35, 0.5, 0.55, 0.7, 0.75);
# initialize by choosing random points the data
my @center = @data[ map {rand @data} 1..$num_clust ];
my $diff;
do {
$diff = 0;
# Assign points to nearest center
my @cluster;
foreach my $point (@data) {
my $closest = 0;
my $dist = abs $point - $center[ $closest ];
for my $idx (1..$#center) {
if (abs $point - $center[ $idx ] < $dist) {
$dist = abs $point - $center[ $idx ];
$closest = $idx;
}
}
push @cluster, [$point, $closest];
}
# compute new centers
foreach my $center_idx (0..$#center) {
my @members = grep {$_->[1] == $center_idx} @cluster;
my $sum = 0;
foreach my $member (@members) {
$sum += $member->[0];
}
my $new_center = @members ? $sum / @members : $center[ $center
+_idx ];
$diff += abs $center[ $center_idx ] - $new_center;
$center[ $center_idx ] = $new_center;
}
} while ($diff > $tol);
print "Centers are:\n";
foreach my $center_idx (0..$#center) {
print "$center_idx $center[ $center_idx ]\n";
}