<?xml version="1.0" encoding="windows-1252"?>
<node id="542091" title="k-Means Clustering demo program with Tk" created="2006-04-08 19:12:59" updated="2006-04-08 15:12:59">
<type id="1042">
CUFP</type>
<author id="190859">
bart</author>
<data>
<field name="doctext">
Last week, [belg4mit] [id://541000|asked] about the k-Means Clustering algorithm, modules, tutorials and documentation. I had promised him to port the algorithm out of a  VB demo program belonging to a [http://people.revoledu.com/kardi/tutorial/kMean/index.html|k-Means Clustering tutorial] by Kardi Teknomo. I found the VB program so illustrative, that I ported the whole program to Perl/Tk, even though I've never actually used Tk for anything real. As a result, it took me a few days.

&lt;p&gt;Anyway! The result is here, the kMeans module is part of the script, but you can use it as a general library for kMeans clustering too, if you'd want to. It's not even limited to 2D. :) All you have to do is copy the bottom half of the program into its own pm file.

&lt;p&gt;Enjoy!

&lt;readmore&gt;
&lt;c&gt;
#!/usr/bin/perl -w
# k Means demo program
# Adapted from a VB program by Kardi Teknomo
# http://people.revoledu.com/kardi/tutorial/kMean/index.html
# Ported to Perl/Tk by bart @ Perlmonks

use strict;


# ----------------------- Tk interface -------------------------
use Tk;
my $mw = MainWindow-&gt;new( -height =&gt; 403, -width =&gt; 477,
  -title =&gt; "k Means Clustering, adapted from tutorial by Kardi Teknomo" );

my $button_reset = $mw-&gt;Button(-text =&gt; "Clear Data", -relief =&gt; "raised", -command =&gt; \&amp;reset_click);
$button_reset-&gt;place( -x =&gt; 176, -y =&gt; 24, -height =&gt; 25, -width =&gt; 65);

{
    my $label = $mw-&gt;Label(
      -text =&gt; "Click data in the canvas below. The program will automatically cluster the data by color code."
    );
    $label-&gt;place( -x =&gt; 0, -y =&gt; 0, -height =&gt; 16, -width =&gt; 473);

    $label = $mw-&gt;Label(-text =&gt; "Number of clusters");
    $label-&gt;place( -x =&gt; 10, -y =&gt; 28, -height =&gt; 18, -width =&gt; 95);

    $label = $mw-&gt;Label(-text =&gt; "(X, Y)", -justify =&gt; 'right');
    $label-&gt;place( -x =&gt; 280, -y =&gt; 24, -height =&gt; 13, -width =&gt; 40);
}

my $label_xy = $mw-&gt;Label( -text =&gt; "X, Y");
$label_xy-&gt;place( -x =&gt; 330, -y =&gt; 24, -height =&gt; 13, -width =&gt; 50);

my $clusters_entry = $mw-&gt;Entry(-relief =&gt; "sunken", -disabledforeground =&gt; 'darkgray');
$clusters_entry-&gt;place( -x =&gt; 112, -y =&gt; 24, -height =&gt; 24, -width =&gt; 24);
$clusters_entry-&gt;insert('end', '3');

my $canvas = $mw-&gt;Scrolled('Canvas', -scrollbars =&gt; '', -background =&gt; 'white');
$canvas-&gt;place( -x =&gt; 0, -y =&gt; 56, -height =&gt; 403-56, -width =&gt; 477);
$canvas-&gt;CanvasBind( "&lt;ButtonPress-1&gt;", [ \&amp;canvas_click, Ev('x'), Ev('y')]);
$canvas-&gt;CanvasBind( "&lt;Motion&gt;", [ \&amp;canvas_mousemove, Ev('x'), Ev('y')]);
$canvas-&gt;CanvasBind( "&lt;Leave&gt;", [ \&amp;canvas_mouseleave ]);


#----------------------- Event Handlers ------------------------

my @color = qw(red yellow green cyan blue purple gray magenta pink
  chartreuse coral darkolivegreen);
# If you want to be able to have more clusters, add more colours.


my(%point, @cluster, $dataset);

sub reset_click {
   $canvas-&gt;delete('all');
   (@cluster, %point, $dataset) = ();
   $clusters_entry-&gt;configure(-state =&gt; 'normal');
}

sub canvas_mousemove {
    my $canvas = shift;
    my($x, $y) = @_;
    $label_xy-&gt;configure(-text =&gt; "($x, $y)");
}

sub canvas_mouseleave {
    my $canvas = shift;
    $label_xy-&gt;configure(-text =&gt; "");
}

sub canvas_click {
    my $canvas = shift;
    my($x, $y) = @_;
    if(!$dataset) {
        my $clusters = $clusters_entry-&gt;get;
        if($clusters !~ /^\d+$/ or $clusters == 0 or $clusters &gt; @color) {
            warn "Not a valid value for cluster count";
            return;
        }
        $dataset  = Data::Cluster::kMean-&gt;new(0+$clusters) or die "Failed to make object";
        $clusters_entry-&gt;configure(-state =&gt; 'disabled');
    }
    $dataset-&gt;add(my $point = [ $x, $y ]);
    # A point is an array ref with coordinates
    my %record = ( data =&gt; $point, cluster =&gt; -1 );
    $record{id} = $canvas-&gt;createLine($x, $y, $x, $y,
      -fill =&gt; 'red', -width =&gt; 8,
      -capstyle =&gt; 'round', -tags =&gt; ['dot'],
    );
    # Keep track of point properties using a stringified reference to the point coordinates array
    $point{$point} = \%record;

    foreach my $cluster ($dataset-&gt;clusters) {
        my $i = $cluster-&gt;index;
        my $r = $cluster[$i] ||= { obj =&gt; $cluster, id =&gt;
          $canvas-&gt;createText($x, $y, -anchor =&gt; 'c', -width =&gt; 150, tag =&gt; 'label',
            -text =&gt; 1+$i) };

        # Move centroid label
        my($x, $y) = @{$cluster-&gt;centroid};
        $canvas-&gt;coords($r-&gt;{id}, $x, $y);

        # Colour dots according to cluster
        foreach my $p ($cluster-&gt;points) {
            if($point{$p}{cluster} != $i) {
                $point{$p}{cluster} = $i;
                $canvas-&gt;itemconfigure($point{$p}{id}, -fill =&gt; $color[$i]);
            }
        }
    }
    $canvas-&gt;raise('label', 'dot');
}

#------------------------ Main Program -------------------------
MainLoop;


#------------------------ kMean module -------------------------

package Data::Cluster::kMean;
use List::Util qw(sum);

sub new {
    my $class = shift;
    my($max_clusters) = @_;
    return bless { max_clusters =&gt; $max_clusters, data =&gt; [], cluster =&gt; [], clusters =&gt; [] }, $class;
}

sub add {
    # add data point(s) (array references) -- by reference, so make sure they're not reused for something else
    my $self = shift;
    return unless @_;
    unless(ref $_[0] eq 'ARRAY') {
        @_ = [ @_ ];
    }
    foreach my $p (@_) {
        push @{$self-&gt;{data}}, $p;
        push @{$self-&gt;{cluster}}, -1;   # not in a cluster
        if(@{$self-&gt;{clusters}} &lt; $self-&gt;{max_clusters}) {
            my $index = @{$self-&gt;{clusters}};
            push @{$self-&gt;{clusters}}, Data::Cluster::kMean::Cluster-&gt;new($self, $index);
            $self-&gt;{cluster}[-1] = $index;
        } else {
            my $c;
            {
                my $j = 0;
                my $min_dist;
                for my $cluster (@{$self-&gt;{clusters}}) {
                    my $dist = _dist($p, $cluster-&gt;centroid);
                    if(!defined $min_dist or $dist &lt; $min_dist) {
                        $c = $j;
                        $min_dist = $dist;
                    }
                } continue {
                    $j++;
                }
            }
            $self-&gt;{clusters}[$c]-&gt;invalidate;
            $self-&gt;{cluster}[-1] = $c;

            my $is_still_moving = 1;
            while($is_still_moving) {
                # this loop will surely converge

                my @centroid = map $_-&gt;centroid, @{$self-&gt;{clusters}};

                # assign all data to the new centroids
                $is_still_moving = 0;

                my $i = 0;
                for my $p (@{$self-&gt;{data}}) {
                    my $c;
                    {
                        my $min_dist;
                        for my $j (0 .. $#{$self-&gt;{clusters}}) {
                            my $dist = _dist($p, $centroid[$j]);
                            if(!defined $min_dist or $dist &lt; $min_dist) {
                                $c = $j;
                                $min_dist = $dist;
                            }
                        }
                    }
                    if($c != $self-&gt;{cluster}[$i]) {
                        $self-&gt;{clusters}[$self-&gt;{cluster}[$i]]-&gt;invalidate;
                        $self-&gt;{clusters}[$c]-&gt;invalidate;
                        $self-&gt;{cluster}[$i] = $c;
                        $is_still_moving = 1;
                    }
                } continue {
                    $i++;
                }
            }
        }
    }
}

sub clusters {
    # Returns a list of all Cluster objects
    my $self = shift;
    return @{$self-&gt;{clusters}};
}

sub _dist {
    # function
    return sqrt(sum map { my $d = $_[0][$_]-$_[1][$_]; $d*$d } 0 .. $#{$_[0]});
}


package Data::Cluster::kMean::Cluster;
use List::Util qw(sum);

sub new {
    my $class = shift;
    my($parent, $index) = @_;
    bless { index =&gt; $index, data =&gt; $parent-&gt;{data}, cluster =&gt; $parent-&gt;{cluster}, centroid =&gt; undef}, $class;
}

sub points {
    # Returns a list of all points in cluster
    my $self = shift;
    my $index = $self-&gt;{index};
    my @point = @{$self-&gt;{data}}[grep $self-&gt;{cluster}[$_] == $index, 0 .. $#{$self-&gt;{data}}];
    return @point;
}

sub centroid {
    # Returns a point indicating the cluster's center of gravity
    my $self = shift;
    return $self-&gt;{centroid} ||= _centroid($self-&gt;points);
}

sub _centroid {
    # function
    return undef unless @_;
    my $dim = @{$_[0]};
    return [ map { my $i = $_; sum(map $_-&gt;[$i], @_) / @_ } 0 .. $dim-1 ];
}

sub invalidate {
    # Throw away cache
    my $self = shift;
    undef $self-&gt;{centroid};
}

sub index {
    # integer, position in cluster array of parent
    my $self = shift;
    return $self-&gt;{index};
}

1;
&lt;/c&gt;</field>
</data>
</node>
