Hi, I have 10,000 input files and each of them contains a list of numbers. What I want to do is comparing those files pairwisely and find common items between any two files. At the end, I'll store my results in three hashes.
1. common items/total number of items in file 1 and file 2
2. common items/number of items in file 1
3. common items/number of items in file 2
However, i did a test and found that the time needed to calculate the overlapping between two files increases with the number of the total files.
Here is my time log:
300 files: 0.15 ms (calculation time for each file pair)
500 files: 0.19 ms
700 files: 0.24 ms
900 files: 0.28 ms
1100 files: 0.33 ms
1300 files: 0.37 ms
2500 files: 0.55 ms
4500 files: 0.9 ms
My computer has sufficient memory and I'm pretty sure it didn't do memory swapping when running the script. Can anyone kindly tell me why the unit time increases with the number of input files?
Thanks!
#!/usr/bin/perl
use strict;
for(my $r = 200; $r<=10000; $r = $r + 200){
my %file_list; # list of files
my %file_gene;
for(my $i = 1; $i <=$r; $i++){
my $file = "$i.txt";
open(INF, "$file");
while(my $g=<INF>){
chomp $g;
$file_gene{$i}{$g} = ();
}
close INF;
$file_list{$i} = ();
}
close IN;
my %hash3; # overlapping percentage of file pairs
my %hash4; # percentage of common items in file1
my %hash5; # percentage of common items in file2
my @file_list_array = keys %file_list; # list of file names
my $file_number = $#file_list_array; # number of files - 1
my @time = localtime(time);
my $hr1 = $time[2];
my $min1 = $time[1];
my $sec1 = $time[0];
my $x = 0;
for(my $i = 0; $i<= $file_number - 1; $i++){
my $m1 = $file_list_array[$i];
my $value3 = 0;
my $value4 = 0;
my $value5 = 0;
my $pair;
for(my $j = $i + 1; $j <= $file_number; $j++){
$x = $x + 1;
my $m2 = $file_list_array[$j];
my ($Nvalue3, $Nvalue4, $Nvalue5) = fi
+nd_common_items($m1, $m2, \%file_gene, 0.1);
if($Nvalue3 > $value3){
$pair = $m1."_".$m2; # file pa
+ir name
$value3 = $Nvalue3;
$value4 = $Nvalue4;
$value5 = $Nvalue5;
}
}
if($pair){
$hash3{"$pair"} = $value3;
$hash4{"$pair"} = $value4;
$hash5{"$pair"} = $value5;
}
}
my @time = localtime(time);
my $hr2 = $time[2];
my $min2 = $time[1];
my $sec2 = $time[0];
my $hr = $hr2 - $hr1;
my $min = $min2 - $min1;
my $sec = $sec2 - $sec1;
my $time = $hr*3600 + $min*60 + $sec;
my $unit_time = $time/$x;
print "$r files\t$unit_time sec\n";
}
sub find_common_items{
my ($m1, $m2, $file_gene_ref, $cutoff) = @_;
my %file_genes = %$file_gene_ref;
my %hash1;
my %hash2;
my %hash1 = %{$file_genes{$m1}}; # genes in file m1
my %hash2 = %{$file_genes{$m2}}; # genes in file m2
my %intersection; # intersection items
my %union; # union items
my $value3 = 0;
my $value4 = 0;
my $value5 = 0;
# find intersection items
foreach(keys %hash1){
$intersection{$_} = $hash1{$_} if exists $hash2{$_};
}
my $isn = scalar keys %intersection; # number of intersection
+items
# find union items
@union{keys %hash1, keys %hash2} = ();
my $un = scalar keys %union; # number of union items
# only store qualified file pairs
if($isn/$un > $cutoff){
my $s1 = scalar keys %hash1; # number of items in file
+ m1, size of file m1
my $s2 = scalar keys %hash2; # number of items in file
+ m2, size of file m2
$value3 = $isn/$un; # For file pair m1_m2, overlapping
+ percentage of file pairs = intersection/union
$value4 = $isn/$s1; # For file pair m1_m2, percentage
+ of common genes in file m1 = intersection/size of file m1
$value5 = $isn/$s2; # For file pair m1_m2, percentage
+ of common genes in file m2 = intersection/size of file m2
}
return($value3, $value4, $value5);
}