#!/tools/bin/perl
use strict;
use warnings;
use List::Util qw(sum);    # sum() was called below but never imported

## Variables and Data Structures
my @probesArray;    # probe IDs in file order (original was missing ';' -> syntax error)
my %probes;         # probe ID => arrayref of its data values
my %calProbes;      # "idA-idB" => Pearson correlation (was never declared under strict)
my $size = 100;     # samples per probe; assumed to match each row's value count — TODO confirm

## Reading the file — 3-arg open with explicit read mode (2-arg open is unsafe)
open my $FILE, '<', 'data.txt' or die "ERROR: cannot read file $!\n";
while (my $line = <$FILE>) {
    chomp $line;
    my @fields = split /\t/, $line;
    ## value of the hash is the row's data (everything after the ID column)
    $probes{ $fields[0] } = [ @fields[ 1 .. $#fields ] ];
    ## ordered ID list so each unordered pair is computed only once
    push @probesArray, $fields[0];
}
close $FILE;

## Correlation Calculation
## Visit each unordered pair {i,j} exactly once: cor(a,b) == cor(b,a).
## The original used $count both as the probe total and as a running pair
## counter, incrementing it inside the inner loop — which corrupted the
## loop bound and indexed past the end of @probesArray (undef lookups).
for my $i (0 .. $#probesArray - 1) {
    for my $j ($i + 1 .. $#probesArray) {
        ## pass the stored arrayrefs directly; copying into temp arrays
        ## (as the original did) was pure overhead for 50,000^2/2 pairs
        my $cor = correlation($probes{ $probesArray[$i] },
                              $probes{ $probesArray[$j] },
                              \$size);
        $calProbes{ $probesArray[$i] . '-' . $probesArray[$j] } = $cor;
        # print $probesArray[$i] . '-' . $probesArray[$j], "\t", $cor, "\n";
    }
}

## Subroutines

## mean(\@x, \@y, \$size) -> ($mu_x, $mu_y)
## Arithmetic means of the two series, dividing by the shared sample size
## referenced by $size (assumed equal to both array lengths — TODO confirm).
sub mean {
    my ($arr1, $arr2, $size) = @_;
    my $mu_x = sum(@$arr1) / $$size;
    my $mu_y = sum(@$arr2) / $$size;
    return ($mu_x, $mu_y);
}

## ss(\@x, \@y, $mean_x, $mean_y) -> ($ssxx, $ssxy, $ssyy)
## Sums of squared deviations from the means, and the cross-deviation sum.
sub ss {
    my ($arr1, $arr2, $mean_x, $mean_y) = @_;
    my ($ssxx, $ssxy, $ssyy) = (0) x 3;
    ## loop over paired samples; assumes the two arrays are equal length
    for my $i (0 .. $#{$arr1}) {
        my $dx = $arr1->[$i] - $mean_x;
        my $dy = $arr2->[$i] - $mean_y;
        $ssxx += $dx**2;
        $ssxy += $dx * $dy;
        $ssyy += $dy**2;
    }
    return ($ssxx, $ssxy, $ssyy);
}

## correlation(\@x, \@y, \$size) -> Pearson correlation coefficient
## Returns 0 when either series has zero variance: the coefficient is
## mathematically undefined there, and the original code instead died
## with "Illegal division by zero".
sub correlation {
    my ($arr1, $arr2, $size) = @_;
    my ($mean_x, $mean_y) = mean($arr1, $arr2, $size);
    my ($ssxx, $ssxy, $ssyy) = ss($arr1, $arr2, $mean_x, $mean_y);
    return 0 if $ssxx == 0 || $ssyy == 0;
    return $ssxy / sqrt($ssxx * $ssyy);
}