You have some problems with your code. For example, you have created 55 variables, some of them duplicates, while I was able to write the same code with only 21 variables. You use a hash to collect the file list and then copy its keys to an array when you only need the array. You use lists of scalars when you could just use a single array. You use three separate hashes when you could use a hash of arrays.
Here is my version of your code:
#!/usr/bin/perl
use warnings;
use strict;
# Benchmark driver: for 200, 400, ..., 10000 input files, load every
# "<n>.txt" file from the current directory, compare every pair of loaded
# files with find_common_items(), and print how long the pairwise
# comparison took.
for my $step ( 1 .. 50 ) {

    # Derive the file count in its own variable instead of rewriting the
    # loop variable in place.
    my $files = $step * 200;

    my @file_list;    # names of the loaded files, in load order
    my %file_gene;    # file name => { line => undef, ... } (set of the file's lines)

    for my $i ( 1 .. $files ) {
        open my $INF, '<', "$i.txt"
            or die "Cannot open '$i.txt' because: $!";
        chomp( my @g = <$INF> );

        # Hash-slice assignment: every line becomes a key with an undef
        # value, so duplicate lines collapse into one entry.
        @{ $file_gene{$i} }{@g} = ();
        push @file_list, $i;
    }

    # For each file, the best-matching later file:
    #   "file1_file2" => [ intersection/union,
    #                      intersection/size of file1,
    #                      intersection/size of file2 ]
    # NOTE(review): nothing in this loop reads %file_pairs after it is
    # filled; it is kept so the benchmark does the same work as before.
    my %file_pairs;
    my $time  = time;    # only the comparison phase is timed, not the loading
    my $pairs = 0;       # number of file pairs compared

    for my $i ( 0 .. $#file_list ) {
        my $m1 = $file_list[$i];

        # Best result seen so far for $m1. $pair stays undef until some
        # partner scores above zero.
        my @values = ( 0, 0, 0 );
        my $pair;

        # Compare only against later files so each unordered pair is
        # visited exactly once.
        for my $m2 ( @file_list[ $i + 1 .. $#file_list ] ) {
            ++$pairs;
            my @Nvalues = find_common_items( @file_gene{ $m1, $m2 }, 0.1 );
            if ( $Nvalues[0] > $values[0] ) {
                $pair   = $m1 . '_' . $m2;    # file pair name
                @values = @Nvalues;
            }
        }

        # @values is declared fresh on every iteration, so each stored
        # reference points at its own array.
        if ($pair) {
            $file_pairs{$pair} = \@values;
        }
    }

    $time = time - $time;
    print "$files files\t$pairs pairs\t$time sec\n";
}
# find_common_items( $m1, $m2, $cutoff )
#
# Compares two sets, each passed as a hash reference whose keys are the
# set's items (the values are never looked at).
#
# Returns a three-element list:
#   [0] intersection size / union size
#   [1] intersection size / number of items in $m1
#   [2] intersection size / number of items in $m2
#
# Returns ( 0, 0, 0 ) when the intersection/union ratio is not strictly
# greater than $cutoff, and also when both sets are empty.
sub find_common_items {
    my ( $m1, $m2, $cutoff ) = @_;

    # keys in scalar context yields the number of keys.
    my $size1 = keys %$m1;
    my $size2 = keys %$m2;

    # Number of intersection items: keys of $m1 that also exist in $m2.
    my $isn = grep { exists $m2->{$_} } keys %$m1;

    # Number of union items, |A| + |B| - |A and B|, computed without
    # building a temporary merged hash.
    my $un = $size1 + $size2 - $isn;

    # Two empty sets have no overlap; returning here also avoids dividing
    # by zero below.
    return ( 0, 0, 0 ) if $un == 0;

    # Only qualified file pairs get real values.
    my $overlap = $isn / $un;
    return ( 0, 0, 0 ) unless $overlap > $cutoff;

    # An empty set can only get this far with a negative cutoff; report a
    # share of 0 for it rather than dividing by zero.
    return (
        $overlap,
        $size1 ? $isn / $size1 : 0,
        $size2 ? $isn / $size2 : 0,
    );
}