--------------------- / \ | Preparation Phase | ----------\------- ------------/------------ / -------\/------------ \ | Modeling Phase || Implementation Phase | \ /\ / ------------------ ------------------------
Fig. 1: Machine Learning as a Three-Phase Process

####

                 | Memberships
Label  | Symbol  | Cluster_1 | Cluster_2 | Cluster_3
--------------------------------------------------
BLUE   | VRSN    | 0.77      | 0.07      | 0.16
RED    | EBAY    | 0.06      | 0.09      | 0.85
GREEN  | NFLX    | 0.03      | 0.84      | 0.13
????   | GOOG    | 0.35      | 0.31      | 0.33
Fig 2. Partition Matrix for some symbols in the Nasdaq composite.

####

#!/usr/bin/perl
use strict;
use warnings;
use Finance::YahooQuote;

# ================================
# read_info
#    @lines = read_info( $file_name )
#
# Returns every line of $file_name, in file order, with the
# trailing newline removed. Dies if the file cannot be opened.
# ================================
sub read_info {
    my ($file_name) = @_;

    # Low-precedence 'or' so that die is attached to open() itself;
    # with '||' it would bind to the file name and never fire.
    open my $fh, '<', $file_name
        or die "Couldn't open file: $!";

    my @info;
    while ( my $line = <$fh> ) {
        chomp $line;
        push @info, $line;
    }
    close $fh;

    return @info;
}

# Command line: symbols file, columns file, output quotes file.
my $symbols_filename = shift @ARGV;
my $columns_filename = shift @ARGV;
my $quotes_filename  = shift @ARGV;

unless ( defined $symbols_filename ) {
    die "Error you must provide a symbols filename!\n";
}
unless ( defined $columns_filename ) {
    die "Error you must provide a columns filename!\n";
}
unless ( defined $quotes_filename ) {
    die "Error you must provide a quotes filename!\n";
}

my @symbols = read_info($symbols_filename);
my @columns = read_info($columns_filename);

# NOTE(review): the loop below assumes getcustomquote returns a reference
# to one row (array reference) per requested symbol, in request order --
# confirm against the Finance::YahooQuote documentation.
my $arrptr = getcustomquote( \@symbols, \@columns );

open my $quotes_fh, '>', $quotes_filename
    or die "Couldn't open file: $!";

# One output line per symbol: the symbol, a tab, then the quote fields
# separated by single spaces.
for my $i ( 0 .. $#symbols ) {
    my @quotes = @{ $arrptr->[$i] };
    print {$quotes_fh} "$symbols[$i]\t@quotes\n";
}

# Buffered write errors only surface at close, so check it.
close $quotes_fh
    or die "Couldn't close file: $!";

####

#!/usr/bin/perl
use warnings;
use strict;
use PDL;
use PDL::NiceSlice;

# test_fcm: fuzzy c-means implementation in Perl
# usage: test_fcm quotes_filename [number_of_clusters] [fuzzification_factor]
#                 [max_iter] [tolerance]
# prints: prototypes, partition_matrix
#
# ================================
# initialize_partition_matrix
#
#    partition_matrix =
#        initialize_partition_matrix(
#            num_clusters, num_patterns )
#
# Builds a random starting partition matrix: a piddle with dims
# (num_patterns, num_clusters) in which the memberships of each
# pattern sum to 1.
# ================================
sub initialize_partition_matrix {
    my ( $num_clusters, $num_patterns ) = @_;

    my $partition_matrix = random( $num_patterns, $num_clusters );

    # Replace exact zeros with a tiny positive membership. where() selects
    # the matching elements wherever they sit in the 2-D piddle and the
    # assignment flows back into $partition_matrix.
    my $zero_memberships = $partition_matrix->where( $partition_matrix == 0 );
    $zero_memberships .= 1e-10;

    # Divide every membership by the total membership of its pattern.
    $partition_matrix /= sumover( $partition_matrix->xchg( 1, 0 ) );

    return $partition_matrix;
}

# ================================
# fcm
#    ( $performance_index, $prototypes, $current_partition_matrix ) =
#        fcm( $patterns, $partition_matrix, $fuzzification_factor,
#             $tolerance, $max_iter )
#
# Fuzzy c-means clustering.
#
#   $patterns              piddle, dims (num_dimensions, num_patterns)
#   $partition_matrix      piddle, dims (num_patterns, num_clusters)
#   $fuzzification_factor  must be greater than 1
#   $tolerance             stop once no membership changes by this much
#   $max_iter              upper bound on the number of iterations
#
# Prints "iter = N" after every iteration that does not stop the loop.
# Dies if $fuzzification_factor is not greater than 1.
# ================================
sub fcm {
    my ( $patterns, $current_partition_matrix, $fuzzification_factor,
        $tolerance, $max_iter ) = @_;

    # The membership update divides by ( factor - 1 ).
    die "Error fuzzification factor must be greater than 1!\n"
        unless $fuzzification_factor > 1;

    my ( $number_of_patterns, $number_of_clusters ) =
        $current_partition_matrix->dims();

    # Loop-invariant exponent of the membership update.
    my $membership_exponent = -2 / ( $fuzzification_factor - 1 );

    my ( $prototypes, $performance_index );
    my $iter = 0;
    while (1) {

        # computing each prototype: membership-weighted mean of the patterns
        my $weights = $current_partition_matrix ** $fuzzification_factor;
        my $weighted_sums = ( $weights x $patterns )->xchg( 1, 0 );
        $prototypes = ( $weighted_sums / sumover($weights) )->xchg( 1, 0 );

        # copying partition matrix
        my $previous_partition_matrix = $current_partition_matrix->copy;

        # Euclidean distance from every pattern to every prototype
        my $dist = zeroes( $number_of_patterns, $number_of_clusters );
        for my $j ( 0 .. $number_of_clusters - 1 ) {
            my $diff = $patterns - $prototypes(:,$j)->dummy( 1, $number_of_patterns );
            $dist(:,$j) .= ( sumover( $diff ** 2 ) ) ** 0.5;
        }

        # A pattern sitting exactly on a prototype has distance 0, which
        # would turn into inf and then NaN below; use a tiny distance instead.
        my $zero_distances = $dist->where( $dist == 0 );
        $zero_distances .= 1e-10;

        # updating the partition matrix
        my $inverse_distances = $dist ** $membership_exponent;
        $current_partition_matrix =
            $inverse_distances / sumover( $inverse_distances->xchg( 1, 0 ) );

        #
        # Performance Index calculation
        #
        $weights = $current_partition_matrix ** $fuzzification_factor;
        $performance_index = sum( $weights * ( $dist ** 2 ) );

        # checking stop conditions: largest absolute membership change,
        # so that large decreases count as much as large increases
        my $membership_change =
            abs( $current_partition_matrix - $previous_partition_matrix );
        my $largest_change = $membership_change->max;

        $iter++;
        last if ( $largest_change < $tolerance ) || ( $iter >= $max_iter );

        print "iter = $iter\n";
    }

    return ( $performance_index, $prototypes, $current_partition_matrix );
}

# Extracts the number in front of a '%' sign; 0 when there is none.
sub _parse_percent {
    my ($field) = @_;

    return 0 unless defined $field;
    return $field =~ /(\-?\d+\.?\d*)%/ ? $1 : 0;
}

# Converts a market capitalization such as "430.3M" or "29.392B" into a
# plain number; 0 when the field has neither suffix.
sub _parse_market_cap {
    my ($field) = @_;

    return 0 unless defined $field;
    return $1 * 1000000    if $field =~ /(\d+\.?\d*)M/;
    return $1 * 1000000000 if $field =~ /(\d+\.?\d*)B/;
    return 0;
}

# ================================
# read_data
#    ( $data, @symbols ) = read_data( $file_name )
#
# Reads a whitespace-separated quotes file, one symbol per line.
# Returns a piddle with one row of 17 numeric values per symbol,
# followed by the list of symbols in file order. Lines containing
# only whitespace are skipped. Dies if the file cannot be opened.
# ================================
sub read_data {
    my ($file_name) = @_;

    # Low-precedence 'or' so that die is attached to open() itself.
    open my $fh, '<', $file_name
        or die "Couldn't open file: $!";

    my @symbols;
    my @numerical_data;
    while ( my $line = <$fh> ) {
        chomp $line;
        next if $line !~ /\S/;

        # The undef slots are the trade date and the "-" separators
        # inside the two range columns.
        my (
            $symbol,      undef,           $last_trade,   $change_pct,
            $book_value,  $eps_current,    $eps_next,     $avg_volume,
            $day_min,     undef,           $day_max,
            $year_min,    undef,           $year_max,
            $ma_50,       $ma_200,         $market_cap,
            $pct_ma_50,   $pct_ma_200,     $pct_year_low, $pct_year_high,
        ) = split ' ', $line;

        push @symbols, $symbol;
        push @numerical_data, [
            $last_trade,                        #  0 Last Trade (Price Only)
            _parse_percent($change_pct),        #  1 Change in Percent
            $book_value,                        #  2 Book Value
            $eps_current,                       #  3 EPS Est. Current Yr
            $eps_next,                          #  4 EPS Est. Next Year
            $avg_volume,                        #  5 Average Daily Volume
            $day_min,                           #  6 Day's min
            $day_max,                           #  7 Day's max
            $year_min,                          #  8 52 weeks min
            $year_max,                          #  9 52 weeks max
            $ma_50,                             # 10 50-day Moving Avg
            $ma_200,                            # 11 200-day Moving Avg
            _parse_market_cap($market_cap),     # 12 Market Capitalization
            _parse_percent($pct_ma_50),         # 13 Pct Chg From 50-day Moving Avg
            _parse_percent($pct_ma_200),        # 14 Pct Chg From 200-day Moving Avg
            _parse_percent($pct_year_low),      # 15 Pct Chg From 52-wk Low
            _parse_percent($pct_year_high),     # 16 Pct Chg From 52-wk High
        ];
    }
    close $fh;

    my $data = pdl(@numerical_data);
    return ( $data, @symbols );
}

# ================================
# normalize
#    ( $output_data, $mean_of_input, $stdev_of_input ) =
#        normalize( $input_data )
#
# processes $input_data so that every dimension of $output_data
# has 0 mean and 1 stdev
#
#    $output_data = ( $input_data - $mean_of_input ) / $stdev_of_input
# ================================
sub normalize {
    my ($input_data) = @_;

    # statsover works along the first dim, so swap dims to get one
    # mean / deviation per data dimension. Only the first two of the
    # returned statistics are needed.
    my ( $mean, $stdev ) = $input_data->xchg( 0, 1 )->statsover();

    # A constant dimension has zero deviation; avoid dividing by zero.
    my $zero_stdev = $stdev->where( $stdev == 0 );
    $zero_stdev .= 1e-10;

    my $number_of_patterns = $input_data->getdim(1);
    my $output_data =
        ( $input_data - $mean->dummy( 1, $number_of_patterns ) )
        / $stdev->dummy( 1, $number_of_patterns );

    return ( $output_data, $mean, $stdev );
}

#
# reading data
#
my $quotes_filename = shift @ARGV;
unless ( defined $quotes_filename ) {
    die "Error you must provide a quotes filename!\n";
}

my ( $data, @symbols ) = read_data($quotes_filename);
die "Error no quotes found in $quotes_filename!\n" unless @symbols;

my $number_of_patterns = $data->getdim(1);

my ( $patterns, $mean_of_input, $stdev_of_input ) = normalize($data);

#
# assigning other variables (optional arguments and their defaults)
#
my $number_of_clusters   = shift @ARGV;
my $fuzzification_factor = shift @ARGV;
my $max_iter             = shift @ARGV;
my $tolerance            = shift @ARGV;

$number_of_clusters   = 3       unless defined $number_of_clusters;
$fuzzification_factor = 2.0     unless defined $fuzzification_factor;
$max_iter             = 2000    unless defined $max_iter;
$tolerance            = 0.00001 unless defined $tolerance;

$number_of_clusters   = int( abs($number_of_clusters) );
$fuzzification_factor = abs($fuzzification_factor);
$max_iter             = abs($max_iter);
$tolerance            = abs($tolerance);

die "Error number of clusters must be at least 1!\n"
    if $number_of_clusters < 1;

#
# initializing partition matrix
#
my $current_partition_matrix =
    initialize_partition_matrix( $number_of_clusters, $number_of_patterns );

#
# output variables
#
my ( $performance_index, $prototypes, $partition_matrix ) =
    fcm( $patterns, $current_partition_matrix,
         $fuzzification_factor, $tolerance, $max_iter );

print "=======================================\n";
print "clustering completed\n";
print "performance index = $performance_index\n";
print "prototypes = \n";
print $prototypes;
print "partition matrix = \n";
print transpose( $partition_matrix );

# Undo the normalization so the prototypes are in the input's units.
my $new_prototypes =
    ( $prototypes * $stdev_of_input->dummy( 1, $prototypes->getdim(1) ) )
    + $mean_of_input->dummy( 1, $prototypes->getdim(1) );
print "new prototypes = \n";
print $new_prototypes;
#print $data;

use PDL::Graphics::PGPLOT;
use POSIX( 'floor', 'ceil');

# Columns of $data shown on the two axes (see the column order built
# in read_data).
my $x_column = 13;    # Pct Chg From 50-day Moving Avg
my $y_column = 14;    # Pct Chg From 200-day Moving Avg

my $opt = {Device => '/xs',
           XTitle => "Pct Chg From 50-day Moving Avg",
           YTitle => "Pct Chg From 200-day Moving Avg"};
my $win = PDL::Graphics::PGPLOT::Window->new($opt);

$win->points($data($x_column, :), $data($y_column, :), {SYMBOL=>4, COLOR=>'BLACK'} );
$win->hold();

# One marker, one large circle and one "( x %, y % )" label per prototype.
# Colours repeat when there are more clusters than colours.
my @cluster_colors = qw( RED GREEN BLUE CYAN MAGENTA YELLOW );

for my $k ( 0 .. $number_of_clusters - 1 ) {
    my $color = $cluster_colors[ $k % @cluster_colors ];

    my $prototype_x = $new_prototypes($x_column, $k);
    my $prototype_y = $new_prototypes($y_column, $k);

    # Label values are rounded down to one decimal place.
    my ($x_value) = list($prototype_x);
    my ($y_value) = list($prototype_y);
    my $x = floor( $x_value * 10 ) / 10;
    my $y = floor( $y_value * 10 ) / 10;

    $win->points($prototype_x, $prototype_y, {SYMBOL=>3, COLOR=>$color, CHARSIZE=>3});
    $win->points($prototype_x, $prototype_y, {SYMBOL=>4, COLOR=>$color, CHARSIZE=>25});

    my $text = "( $x %, $y % )";
    $win->text($text, int( $x ), int( $y ) + 2, {CHARSIZE=>1, COLOR=>$color, Justification => 0.5} );
}

####

AMZN
BOBJ
COGN
DELL
EBAY
EHTH
ETFC
GOOG
IFUL
IMAX
INTC
INTU
MSFT
NATI
NDAQ
NFLX
NGPS
NOVL
ORCL
VRSN
YHOO
ZOOM
NGPS
NVDA
PDLI
BRCM
SGIC
SNDK
SPSS
SUNW
SWIR
SYMC
TIVO
TWTC
XLNX

####

Last Trade Date
Last Trade (Price Only)
Change in Percent
Book Value
EPS Est. Current Yr
EPS Est. Next Year
Average Daily Volume
Day's Range
52-week Range
50-day Moving Avg
200-day Moving Avg
Market Capitalization
Pct Chg From 50-day Moving Avg
Pct Chg From 200-day Moving Avg
Pct Chg From 52-wk Low
Pct Chg From 52-wk High

####

AMZN 7/23/2007 71.74 +0.15% 0.863 1.01 1.32 14824200 70.85 - 72.67 25.76 - 75.35 70.8386 52.0728 29.392B +1.27% +37.77% +178.49% -4.79% BOBJ 7/23/2007 42.86 +0.47% 19.161 2.05 2.41 1344020 42.40 - 43.05 19.75 - 43.32 40.4564 38.3114 4.102B +5.94% +11.87% +117.01% -1.06% COGN 7/23/2007 40.90 -0.27% 8.437 2.02 2.32 1458090 40.61 - 41.19 26.21 - 45.30 39.8072 41.1105 3.635B +2.75% -0.51% +56.05% -9.71% DELL 7/23/2007 29.34 +0.55% 0.00 1.34 1.61 18368500 29.13 - 29.60 18.95 - 29.61 28.0628 25.3937 65.546B +4.55% +15.54% +54.83% -0.91% EBAY 7/23/2007 33.09 -1.31% 8.463 1.37 1.60 14405900 32.941 - 33.65 22.83 - 35.41 32.3561 32.4922 45.129B +2.27% +1.84% +44.94% -6.55% EHTH 7/23/2007 19.10 -0.31% 4.517 0.43 0.60 189470 18.98 - 19.21 17.67 - 28.88 18.9175 21.4296 430.3M +0.96% -10.87% +8.09% -33.86% ETFC 7/23/2007 22.41 +0.40% 10.186 1.65 1.91 6877300 22.23 - 22.80 20.82 - 26.08 23.4208 23.068 9.566B -4.32% -2.85% +7.64% -14.07% GOOG 7/23/2007 512.51 -1.46% 59.206 15.31 19.47 5361690 512.15 - 520.00 363.36 - 558.58 525.365 484.958 159.7B -2.45% +5.68% +41.05% -8.25% IFUL 7/23/2007 2.30 +1.32% 0.829 0.00 0.00 14528 2.2852 - 2.33 2.09 - 3.18 2.4169 2.5521 29.6M -4.84% -9.88% +10.05% -27.67% IMAX 7/23/2007
4.98 +2.68% -0.814 -0.44 -0.22 342934 4.90 - 5.21 3.32 - 10.92 4.355 4.5293 200.6M +14.35% +9.95% +50.00% -54.40% INTC 7/23/2007 24.72 +0.69% 6.867 1.12 1.37 64725500 24.57 - 24.97 16.84 - 26.52 23.8286 21.6269 143.6B +3.74% +14.30% +46.79% -6.79% INTU 7/23/2007 30.26 +0.73% 5.802 1.40 1.60 3757790 30.0252 - 30.59 26.74 - 35.98 29.8644 29.5778 10.190B +1.32% +2.31% +13.16% -15.90% MSFT 7/23/2007 31.19 +0.10% 3.654 1.72 1.94 59226100 31.12 - 31.52 23.00 - 31.84 30.1494 29.5672 298.4B +3.45% +5.49% +35.61% -2.04% NATI 7/23/2007 35.05 +1.92% 7.501 1.16 1.35 410333 34.64 - 35.10 24.48 - 35.52 32.5322 28.9151 2.798B +7.74% +21.22% +43.18% -1.32% NDAQ 7/23/2007 33.50 -0.89% 12.808 1.27 1.73 2814900 33.17 - 33.88 25.80 - 42.37 31.6075 31.7301 3.775B +5.99% +5.58% +29.84% -20.93% NFLX 7/23/2007 17.27 -12.02% 6.286 0.76 0.95 1495580 17.17 - 18.09 18.12 - 30.00 20.4217 22.1396 1.179B -15.43% -21.99% -4.69% -42.43% NGPS 7/23/2007 39.09 -1.16% 9.75 2.39 2.78 57516 38.79 - 39.76 30.72 - 48.25 37.5439 38.7596 334.8M +4.12% +0.85% +27.25% -18.98% NOVL 7/23/2007 7.51 +1.08% 3.259 0.10 0.18 5249540 7.44 - 7.55 5.70 - 8.26 7.8095 7.279 2.616B -3.83% +3.17% +31.75% -9.08% ORCL 7/23/2007 20.78 +0.82% 3.313 1.17 1.34 28994100 20.64 - 20.91 14.49 - 20.94 19.7828 18.3754 106.2B +5.04% +13.09% +43.41% -0.76% VRSN 7/23/2007 32.12 -0.31% 10.248 1.07 1.38 4146770 32.04 - 32.59 15.95 - 34.68 30.8847 26.9547 7.832B +4.00% +19.16% +101.38% -7.38% YHOO 7/23/2007 24.99 -1.42% 6.657 0.43 0.57 28871200 24.98 - 25.46 22.65 - 33.61 27.2897 29.0362 33.583B -8.43% -13.94% +10.33% -25.65% ZOOM 7/23/2007 1.11 -7.50% 1.246 0.00 0.00 19500 1.08 - 1.24 0.91 - 2.80 1.2942 1.4052 10.4M -14.23% -21.01% +21.98% -60.36% NGPS 7/23/2007 39.09 -1.16% 9.75 2.39 2.78 57516 38.79 - 39.76 30.72 - 48.25 37.5439 38.7596 334.8M +4.12% +0.85% +27.25% -18.98% NVDA 7/23/2007 45.55 +1.00% 5.775 1.85 2.11 9899090 44.84 - 46.48 17.63 - 47.83 41.1581 34.4343 16.531B +10.67% +32.28% +158.37% -4.77% PDLI 7/23/2007 25.69 -0.93% 
4.043 0.53 0.78 2463490 25.54 - 26.08 16.39 - 27.98 24.7697 22.7361 2.996B +3.72% +12.99% +56.74% -8.18% BRCM 7/23/2007 33.94 -1.45% 7.285 1.18 1.45 12093900 33.91 - 34.50 21.98 - 37.50 30.9053 32.2809 18.362B +9.82% +5.14% +54.41% -9.49% SGIC 7/23/2007 24.55 -3.00% 10.831 0.00 -1.98 25991 24.48 - 26.30 17.50 - 30.66 27.1856 27.5898 273.1M -9.69% -11.02% +40.29% -19.93% SNDK 7/23/2007 56.94 -0.04% 21.015 1.34 2.43 8809480 56.82 - 58.05 35.82 - 62.24 48.4567 43.8729 12.994B +17.51% +29.78% +58.96% -8.52% SPSS 7/23/2007 46.32 +1.51% 9.712 1.51 1.77 303541 45.65 - 46.90 21.73 - 47.41 43.9433 37.921 850.3M +5.41% +22.15% +113.16% -2.30% SUNW 7/23/2007 5.29 -0.75% 1.962 0.09 0.19 76935904 5.26 - 5.39 3.81 - 6.78 5.1727 5.7094 18.887B +2.27% -7.35% +38.85% -21.98% SWIR 7/23/2007 27.76 +0.07% 6.183 0.89 1.18 711200 27.07 - 28.06 10.58 - 28.10 25.3156 18.2702 714.4M +9.66% +51.94% +162.38% -1.21% SYMC 7/23/2007 19.80 +0.20% 12.899 1.12 1.26 13626300 19.76 - 20.09 15.08 - 22.19 19.8222 18.5712 17.840B -0.11% +6.62% +31.30% -10.77% TIVO 7/23/2007 6.24 +3.65% 0.238 -0.27 -0.09 1427150 6.16 - 6.3764 5.05 - 8.37 5.9394 6.0122 608.6M +5.06% +3.79% +23.56% -25.45% TWTC 7/23/2007 20.23 -1.32% 3.866 -0.22 0.33 2309090 20.21 - 20.69 14.06 - 23.97 20.0219 20.7936 2.927B +1.04% -2.71% +43.88% -15.60% XLNX 7/23/2007 26.72 -1.98% 5.991 1.22 1.46 6240820 26.67 - 27.47 18.35 - 30.50 27.5378 26.6528 7.959B -2.97% +0.25% +45.61% -12.39%