Dear Monks,
I am becoming a bit more daring in trying to understand and use Perl. I wrote the script below, which does exactly what I want, but at an alarmingly slow pace. Is there any way I could speed it up, and what should I learn from it so that I can do this better in the future? My script compares entities in lines between two files.
#!/usr/bin/perl
use Bio::Perl;
use IO::String;
use Bio::SeqIO;
use List::Util 'max';
use Text::CSV;
use Array::Utils qw(:all);
# Main script: for every line of the master (annotated) file, normalise the
# entries on that line into @a and hand them to read_query_file(), which
# writes the matches against the query file to compared_annotations.txt.
#
# NOTE(review): this file runs without `use strict; use warnings;`.  The code
# in this section is written so that it stays valid if they are enabled, but
# read_query_file() communicates through the package globals declared below,
# so that interface has to be kept in mind before turning strictures on.

if (scalar(@ARGV) != 2) {
    print "\n";
    print "Usage: compare.pl <master_file> <query_file>\n";
    print "The master file is the annotated one\n";
    print "The query file is the non-annoted subset file\n";
    print "\n";
    exit(1);    # a usage error is a failure, so do not report success
}

# $file1: master file (orthomcl results, gene-number and taxa-number)
# $file2: query file (the subset to look up)
my ($file1, $file2) = @ARGV;

open(my $master_fh, '<', $file1)
    or die "Cannot open file \"$file1\" to read from: $!\n";

# Remove any output file left over from a previous run, then create it afresh.
my $outputfile = "compared_annotations.txt";
if (unlink($outputfile) == 1) {
    print "Existing \"$outputfile\" file was removed\n";
}

# POS stays a bareword handle because read_query_file() prints to it by name.
open(POS, '>', $outputfile)
    or die "Cannot open file \"$outputfile\" to write to!!: $!\n";

# Package globals read by read_query_file():
#   $cluster - first whitespace-delimited field of the current master line
#   @a       - the normalised entries of the current master line
#   @match   - subset of @a, interpolated into every output row
our ($cluster, @a, @match);

# For each line in the input file (i.e. each ortholog group)...
while (my $line = <$master_fh>) {
    chomp $line;    # the original chomped $_ here, leaving $line untouched

    # First, get the cluster number; everything after it is the entry list.
    my $other;
    ($cluster, $other) = split(/\s/, $line, 2);
    next unless defined $other;    # no entries on this line: nothing to match

    # Collapse runs of blanks/tabs and trim both ends.
    my $entries = $other;
    $entries =~ s/[\t ]+/ /g;
    $entries =~ s/^ //mg;
    $entries =~ s/ $//mg;

    # Turn comma separators into blanks.  The ",3" rule presumably relies on
    # every identifier starting with the digit 3 - TODO confirm against data.
    $entries =~ s/,3/ 3/g;
    $entries =~ s/\),/) /g;

    # Mark the end of each "...)" entry with '>' so the line can be split on
    # it, and drop the "(unknown)" placeholders of the draft genomes.
    $entries =~ s/\)\s/)>/g;
    $entries =~ s/\(unknown\)//g;

    @a = split(/>/, $entries);

    # NOTE(review): the original declared a pattern with
    #     my $non_ommitted_pattern =~ /(\W+).+/;
    # which leaves the variable undef, and then filtered @a with
    # `$_ == $non_ommitted_pattern` once per element of @a.  That is a numeric
    # comparison against 0, so it keeps exactly the entries whose numeric
    # value is 0.  The result is preserved here (computed once instead of
    # scalar(@a) times) because it appears in every output row; confirm
    # whether a regex filter such as grep { /(\W+).+/ } was really intended.
    {
        no warnings 'numeric';
        @match = grep { $_ == 0 } @a;
    }

    # Called once per master line; it reads $cluster, @a and @match and
    # prints its rows to POS.  Its return value carries no data.
    read_query_file($file2);
}

close($master_fh);

# Buffered write errors only surface at close, so check it.
close(POS)
    or die "Cannot close file \"$outputfile\": $!\n";
###################################################
sub read_query_file
### Matches the current master entries against every line of the query file ###
{
    # read_query_file($filename)
    #
    # For every line of the query file $filename, takes the first
    # whitespace-delimited field as the subset cluster id and the remaining
    # fields as identifiers.  For each entry in the package array @a and each
    # identifier that occurs inside that entry, one row
    #     "$cluster,, <subset cluster>,, @match,,"
    # is printed to the already-open package filehandle POS.
    #
    # Reads the package globals @a, $cluster and @match, which the caller
    # sets before each call.  Returns nothing (empty list).  Dies if the
    # query file cannot be opened.
    #
    # The parsed query file is cached per filename, so the file is read from
    # disk only on the first call instead of on every call.

    # Lexically scoped to this sub, so the rest of the file is unaffected.
    use strict;
    use warnings;

    my ($filename) = @_;

    our (@a, $cluster, @match);    # inputs supplied by the caller
    our %query_rows_for;           # cache: filename => [ [cluster id, [ids]], ... ]

    if (!exists $query_rows_for{$filename}) {
        open(my $query_fh, '<', $filename)
            or die "\nCannot open file \"$filename\": $!\n";

        my @rows;
        while (my $line2 = <$query_fh>) {
            chomp $line2;    # the original chomped $_ here, not $line2

            # First, get the subset cluster number.
            my ($cluster1, $other1) = split(/\s/, $line2, 2);
            next unless defined $other1;    # line holds no identifiers

            # split ' ' skips leading and repeated whitespace, so no empty
            # identifiers are produced (an empty one used to become an empty
            # regex, which Perl replaces with the last successful pattern).
            my @ids = split(' ', $other1);
            push @rows, [$cluster1, \@ids] if @ids;
        }
        close($query_fh);

        $query_rows_for{$filename} = \@rows;
    }

    # Now do the matching, in the same order as before: query line, then
    # master entry, then identifier.
    foreach my $row (@{ $query_rows_for{$filename} }) {
        my ($cluster1, $ids) = @{$row};
        foreach my $entry (@a) {
            foreach my $id (@{$ids}) {
                # Literal substring test: identifiers are data, not regexes,
                # so characters such as '|' or '.' must not act as
                # metacharacters.
                next if index($entry, $id) < 0;
                print POS "$cluster,, $cluster1,, @match,,\n";
            }
        }
    }

    # The original left the sub with a loop-less `next`, which only worked by
    # jumping into the caller's loop; return explicitly instead.
    return;
}
###################################################
1;