Syntactic Confectionery Delight PerlMonks

### Re: Making sense of data: Clustering OR A coding challenge

by kvale (Monsignor)
 on Apr 03, 2006 at 19:32 UTC ( #541005=note: print w/replies, xml ) Need Help??

K-means is not esoteric within the clustering community; in fact, it is the first method people turn to because it is easy to implement. Unfortunately, it is also among the weakest of the adaptive clustering methods. It gets stuck in local minima easily, as this program will show if you run it multiple times:
```#!/usr/bin/perl

use warnings;
use strict;

my \$num_clust = 4; # number of clusters
my \$tol = 0.001;    # stopping tolerance

# my @data = map {rand} 1..100;
my @data = (0.1, 0.15, 0.3, 0.35, 0.5, 0.55, 0.7, 0.75);

# initialize by choosing random points the data
my @center = @data[ map {rand @data} 1..\$num_clust ];

my \$diff;
do {
\$diff = 0;

# Assign points to nearest center
my @cluster;
foreach my \$point (@data) {
my \$closest = 0;
my \$dist = abs \$point - \$center[ \$closest ];
for my \$idx (1..\$#center) {
if (abs \$point - \$center[ \$idx ] < \$dist) {
\$dist = abs \$point - \$center[ \$idx ];
\$closest = \$idx;
}
}
push @cluster, [\$point, \$closest];
}

# compute new centers
foreach my \$center_idx (0..\$#center) {
my @members = grep {\$_->[1] == \$center_idx} @cluster;
my \$sum = 0;
foreach my \$member (@members) {
\$sum += \$member->[0];
}
my \$new_center = @members ? \$sum / @members : \$center[ \$center
+_idx ];
\$diff += abs \$center[ \$center_idx ] - \$new_center;
\$center[ \$center_idx ] = \$new_center;
}

} while (\$diff > \$tol);

print "Centers are:\n";
foreach my \$center_idx (0..\$#center) {
print "\$center_idx    \$center[ \$center_idx ]\n";
}

-Mark

Replies are listed 'Best First'.
Re^2: Making sense of data: Clustering OR A coding challenge
by mahesh557 (Novice) on Jun 07, 2016 at 19:33 UTC

The algorithm is generating some FalsePositives, hence added Fine Tuning through iterations

```use warnings;
use strict;
use Data::Dumper;

#my @data = map {rand} 1..100;
my @dt = (1,2,3,40,40,40,40,42,43,45,80,85,90,91,91,91,91,4,9,10);
my @clustercenters =  getClusterCenters(3,@dt);
@clustercenters = sort { \$a <=> \$b } @clustercenters;
my (\$low, \$medium, \$high) = @clustercenters;
my %tags = ( \$low => "low",
\$medium => "medium",
\$high =>"high",
);

print ("\n\n \$low \t\$medium \t\$high\n");
print "\nclosest(12): ", \$tags{ closest(12, @clustercenters) };
print "\nclosest(43): ", \$tags{ closest(43, @clustercenters) };
print "\n";

sub closest {
my (\$val,@arr) = @_;
my @list = sort { abs(\$a - \$val) <=> abs(\$b - \$val) } @arr;
return \$list[0];
}

sub getClusterCenters{
my (\$n, @data) = @_;
my \$iter = 4;
my @centers = ();

for (1..\$iter){
my @clustercenters =  get1DClusterCenters(\$n,@data);
@clustercenters = sort { \$a <=> \$b } @clustercenters;
print "\n",join("\t", @clustercenters);
my @tcenters = @clustercenters;

for(my \$i=0; \$i <= \$#clustercenters; \$i++){   \$centers[\$i] +=
+\$clustercenters[\$i];   }

}

print "\n",join("\t", @centers );
@centers = map { \$_ = \$_ / \$iter; } @centers;

return @centers;
}

# It takes a 1D array of values and returns centers of clusters sorted
sub get1DClusterCenters{
my (\$num_clust, @data) = @_;

my \$tol = 0.001;    # stopping tolerance

# initialize by choosing random points the data
my @center = @data[ map {rand @data} 1..\$num_clust ];

my \$diff;
my @members;
my @cluster;

do {
\$diff = 0;

# Assign points to nearest center
my @cluster;
foreach my \$point (@data) {
my \$closest = 0;
my \$dist = abs \$point - \$center[ \$closest ];
for my \$idx (1..\$#center) {
if (abs \$point - \$center[ \$idx ] < \$dist) {
\$dist = abs \$point - \$center[ \$idx ];
\$closest = \$idx;
}
}
push @cluster, [\$point, \$closest];
}

# compute new centers
foreach my \$center_idx (0..\$#center) {
@members = grep {\$_->[1] == \$center_idx} @cluster;
my \$sum = 0;
#  print "\n\n** group \$center_idx \n";
foreach my \$member (@members) {
#     print "\t ",\$member->[0];
\$sum += \$member->[0];
}
my \$new_center = @members ? \$sum / @members : \$center[ \$ce
+nter_idx ];
\$diff += abs \$center[ \$center_idx ] - \$new_center;
\$center[ \$center_idx ] = \$new_center;
}

} while (\$diff > \$tol);

#print "Centers are:\n";
my @cluster_means = ();

foreach my \$center_idx (0..\$#center) {
#print "\n\$center_idx    \$center[ \$center_idx ]\n";
push (@cluster_means, int(\$center[ \$center_idx ]) );
}
@cluster_means = sort { \$a <=> \$b } @cluster_means;

# print "\nCLUSTER MEANS: ", join(",", @cluster_means);
return @cluster_means;

}

Create A New User
Node Status?
node history
Node Type: note [id://541005]
help
Chatterbox?
and all is quiet...

How do I use this? | Other CB clients
Other Users?
Others exploiting the Monastery: (4)
As of 2018-06-21 03:22 GMT
Sections?
Information?
Find Nodes?
Leftovers?
Voting Booth?
Should cpanminus be part of the standard Perl release?

Results (117 votes). Check out past polls.

Notices?