#!/tools/bin/perl
use strict;
use warnings;
use List::Util qw(sum);    # sum() was called below but never imported

## Variables and Data Structures
my @probesArray;    # probe IDs in file order (original was missing ';' -> syntax error)
my %probes;         # probe ID => arrayref of its data values
my %calProbes;      # "idA-idB" => Pearson correlation (was never declared under strict)
my $size = 100;     # samples per probe; assumed to match each row's value count — TODO confirm

## Reading the file — 3-arg open with explicit read mode (2-arg open is unsafe)
open my $FILE, '<', 'data.txt' or die "ERROR: cannot read file $!\n";
while (my $line = <$FILE>) {
    chomp $line;
    my @fields = split /\t/, $line;
    ## value of the hash is the row's data (everything after the ID column)
    $probes{ $fields[0] } = [ @fields[ 1 .. $#fields ] ];
    ## ordered ID list so each unordered pair is computed only once
    push @probesArray, $fields[0];
}
close $FILE;

## Correlation Calculation
## Visit each unordered pair {i,j} exactly once: cor(a,b) == cor(b,a).
## The original used $count both as the probe total and as a running pair
## counter, incrementing it inside the inner loop — which corrupted the
## loop bound and indexed past the end of @probesArray (undef lookups).
for my $i (0 .. $#probesArray - 1) {
    for my $j ($i + 1 .. $#probesArray) {
        ## pass the stored arrayrefs directly; copying into temp arrays
        ## (as the original did) was pure overhead for 50,000^2/2 pairs
        my $cor = correlation($probes{ $probesArray[$i] },
                              $probes{ $probesArray[$j] },
                              \$size);
        $calProbes{ $probesArray[$i] . '-' . $probesArray[$j] } = $cor;
        # print $probesArray[$i] . '-' . $probesArray[$j], "\t", $cor, "\n";
    }
}

## Subroutines

## mean(\@x, \@y, \$size) -> ($mu_x, $mu_y)
## Arithmetic means of the two series, dividing by the shared sample size
## referenced by $size (assumed equal to both array lengths — TODO confirm).
sub mean {
    my ($arr1, $arr2, $size) = @_;
    my $mu_x = sum(@$arr1) / $$size;
    my $mu_y = sum(@$arr2) / $$size;
    return ($mu_x, $mu_y);
}

## ss(\@x, \@y, $mean_x, $mean_y) -> ($ssxx, $ssxy, $ssyy)
## Sums of squared deviations from the means, and the cross-deviation sum.
sub ss {
    my ($arr1, $arr2, $mean_x, $mean_y) = @_;
    my ($ssxx, $ssxy, $ssyy) = (0) x 3;
    ## loop over paired samples; assumes the two arrays are equal length
    for my $i (0 .. $#{$arr1}) {
        my $dx = $arr1->[$i] - $mean_x;
        my $dy = $arr2->[$i] - $mean_y;
        $ssxx += $dx**2;
        $ssxy += $dx * $dy;
        $ssyy += $dy**2;
    }
    return ($ssxx, $ssxy, $ssyy);
}

## correlation(\@x, \@y, \$size) -> Pearson correlation coefficient
## Returns 0 when either series has zero variance: the coefficient is
## mathematically undefined there, and the original code instead died
## with "Illegal division by zero".
sub correlation {
    my ($arr1, $arr2, $size) = @_;
    my ($mean_x, $mean_y) = mean($arr1, $arr2, $size);
    my ($ssxx, $ssxy, $ssyy) = ss($arr1, $arr2, $mean_x, $mean_y);
    return 0 if $ssxx == 0 || $ssyy == 0;
    return $ssxy / sqrt($ssxx * $ssyy);
}