Making sense of data: Clustering OR A coding challenge

K-means is not esoteric within the clustering community; in fact, it is the first method people turn to because it is easy to implement. Unfortunately, it is also among the weakest of the adaptive clustering methods. It gets stuck in local minima easily, as this program will show if you run it multiple times:
```#!/usr/bin/perl

use warnings;
use strict;

# Demonstration of one-dimensional k-means clustering. The starting centers
# are drawn at random, so repeated runs can print different final centers.
my $num_clust = 4;        # number of clusters
my $tol       = 0.001;    # stopping tolerance on total center movement per pass

# my @data = map {rand} 1..100;
my @data = (0.1, 0.15, 0.3, 0.35, 0.5, 0.55, 0.7, 0.75);

# Initialize by choosing random points from the data. The draw is with
# replacement, so two centers may start on the same value.
my @center = @data[ map { rand @data } 1 .. $num_clust ];

my $diff;
do {
    $diff = 0;

    # Assign each point to its nearest center. The comparison is strict, so
    # a tie keeps the lower-numbered center.
    my @cluster;
    foreach my $point (@data) {
        my $closest = 0;
        my $dist    = abs($point - $center[$closest]);
        for my $idx (1 .. $#center) {
            my $candidate = abs($point - $center[$idx]);
            if ($candidate < $dist) {
                $dist    = $candidate;
                $closest = $idx;
            }
        }
        push @cluster, [ $point, $closest ];
    }

    # Move every center to the mean of its members. A center that attracted
    # no points keeps its current position.
    foreach my $center_idx (0 .. $#center) {
        my @members = grep { $_->[1] == $center_idx } @cluster;
        my $sum = 0;
        foreach my $member (@members) {
            $sum += $member->[0];
        }
        my $new_center = @members ? $sum / @members : $center[$center_idx];

        # $diff accumulates how far all centers moved during this pass.
        $diff += abs($center[$center_idx] - $new_center);
        $center[$center_idx] = $new_center;
    }

} while ($diff > $tol);

print "Centers are:\n";
foreach my $center_idx (0 .. $#center) {
    print "$center_idx    $center[$center_idx]\n";
}

-Mark

Re^2: Making sense of data: Clustering OR A coding challenge
by mahesh557 (Novice) on Jun 07, 2016 at 19:33 UTC

The algorithm generates some false positives, so I added fine-tuning: the clustering is run several times and the resulting centers are averaged.

```use warnings;
use strict;
use Data::Dumper;

#my @data = map {rand} 1..100;
my @dt = (1,2,3,40,40,40,40,42,43,45,80,85,90,91,91,91,91,4,9,10);

# Cluster the sample into three groups and order the centers ascending so
# they can be unpacked as low / medium / high.
my @clustercenters = getClusterCenters(3, @dt);
@clustercenters = sort { $a <=> $b } @clustercenters;
my ($low, $medium, $high) = @clustercenters;

# Label lookup keyed by center value.
# NOTE(review): if two centers come back numerically equal their keys
# collide and the later label silently replaces the earlier one.
my %tags = (
    $low    => "low",
    $medium => "medium",
    $high   => "high",
);

print "\n\n $low \t$medium \t$high\n";

# Classify two sample values by whichever center lies nearest to them.
print "\nclosest(12): ", $tags{ closest(12, @clustercenters) };
print "\nclosest(43): ", $tags{ closest(43, @clustercenters) };
print "\n";

# Return the element of @arr that is numerically nearest to $val.
#
#   $val - the value to match
#   @arr - candidate values
#
# When several candidates are equally near, the one listed first wins.
# Returns undef when no candidates are given.
sub closest {
    my ($val, @arr) = @_;

    my $best;
    my $best_dist;
    for my $candidate (@arr) {
        my $dist = abs($candidate - $val);

        # Strict comparison: an equally distant later candidate never
        # displaces the earlier one.
        if (!defined $best_dist || $dist < $best_dist) {
            $best      = $candidate;
            $best_dist = $dist;
        }
    }

    return $best;
}

# Run the 1-D clustering several times and average the results.
#
#   $n    - number of clusters to find
#   @data - values to cluster
#
# Each run's centers are sorted ascending before being added, so the i-th
# smallest center of every run contributes to the i-th average. The centers
# of every run and the accumulated sums are printed to STDOUT as
# tab-separated lines.
#
# Returns the list of averaged centers.
sub getClusterCenters {
    my ($n, @data) = @_;
    my $iter = 4;    # number of independent runs to average over
    my @centers;     # running per-position sums across runs

    for (1 .. $iter) {
        my @clustercenters = get1DClusterCenters($n, @data);
        @clustercenters = sort { $a <=> $b } @clustercenters;
        print "\n", join("\t", @clustercenters);

        for my $i (0 .. $#clustercenters) {
            $centers[$i] += $clustercenters[$i];
        }
    }

    print "\n", join("\t", @centers);

    # Turn the sums into means without modifying @centers in place.
    my @averaged = map { $_ / $iter } @centers;

    return @averaged;
}

# One-dimensional k-means.
#
#   $num_clust - number of clusters to find
#   @data      - values to cluster
#
# Starting centers are drawn at random from @data (with replacement), so
# separate calls on the same input may return different centers.
#
# Returns the final centers, each truncated with int(), sorted ascending.
sub get1DClusterCenters {
    my ($num_clust, @data) = @_;

    my $tol = 0.001;    # stopping tolerance on total center movement per pass

    # Initialize by choosing random points from the data.
    my @center = @data[ map { rand @data } 1 .. $num_clust ];

    my $diff;
    do {
        $diff = 0;

        # Per-cluster running sum and member count for this pass.
        my @sum_of   = (0) x @center;
        my @count_of = (0) x @center;

        # Assign each point to its nearest center. The comparison is strict,
        # so a tie keeps the lower-numbered center.
        foreach my $point (@data) {
            my $closest = 0;
            my $dist    = abs($point - $center[$closest]);
            for my $idx (1 .. $#center) {
                my $candidate = abs($point - $center[$idx]);
                if ($candidate < $dist) {
                    $dist    = $candidate;
                    $closest = $idx;
                }
            }
            $sum_of[$closest] += $point;
            $count_of[$closest]++;
        }

        # Move every center to the mean of its members. A center that
        # attracted no points keeps its current position.
        foreach my $center_idx (0 .. $#center) {
            my $new_center =
                $count_of[$center_idx]
                ? $sum_of[$center_idx] / $count_of[$center_idx]
                : $center[$center_idx];

            # $diff accumulates how far all centers moved during this pass.
            $diff += abs($center[$center_idx] - $new_center);
            $center[$center_idx] = $new_center;
        }

    } while ($diff > $tol);

    # int() truncates toward zero; the result is then ordered numerically.
    my @cluster_means = sort { $a <=> $b } map { int $_ } @center;

    return @cluster_means;
}

