Here is a part of the code that I have pasted followed by my question. This program uses, perl and one function below uses bioperl to obtain all the possible sequences from a given degenerate iupac string
For example, I will use the iupac string here as:
'NNYDBAVDVHVHNGGNR'
Here, A,C,G, or T are single characters, everything else either carries 2, 3 or 4 characters as follows:
N => ACGT
Y => CT
D => AGT
B => CGT
V => ACG
H => ACT
R => AG
So we can see how many possible combinations can come out of this iupac string probably >10000000 seqs with only A,C,G or T characters
#!/usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;
use Bio::SeqFeature::Primer;
use Bio::Tools::IUPAC;
use Bio::Tools::SeqPattern;
#some parameters for the equation to compute Tm
my $conc_primer = 0.25;
my $conc_salt = 50;
my $mgconc = 0;
my $iupac = 'NNYDBAVDVHVHNGGNR';
#this calls the function to generate all possible seqs from iupac stri
+ng
my @PotSeqs = find_degenerates($iupac);
my @Tm_Thermodynamic=();
foreach my $PotSeqs(@PotSeqs){
#this call the function to compute tm for each seq
my $Tm_Thermodynamic = tm_Base_Stacking($PotSeqs,$conc_primer,$conc_sa
+lt,$mgconc);
push(@Tm_Thermodynamic, $Tm_Thermodynamic);
#I have to add a function here to get the min and max tm values
+from the Tm_thermodynamic array filled above
# printf "%.4f", $Tm_Thermodynamic_min;
# print "\t";
# printf "%.4f", $Tm_Thermodynamic_max;
# print "\n";
}
#function to get all possible combination of an iupac string
sub find_degenerates
{
my $sequence = $_[0];
my $seq_obj = Bio::Seq->new(-seq => $sequence, -alphabet => 'dna')
+;
my $stream = Bio::Tools::IUPAC->new(-seq => $seq_obj);
my @oligomers;
while (my $uniqueseq = $stream->next_seq()) {
push @oligomers, $uniqueseq->seq; }
return @oligomers;
}
#Function to compute the Temperature for a given sequence (sequence va
+lid only if contains A,C,G or T)
sub tm_Base_Stacking
{
my ($c,$conc_primer,$conc_salt,$conc_mg) = @_;
my $c_len = length($c);
my $h=0;
my $s=0;
# enthalpy values
my %array_h = ( 'AA'=> '-7.9',
'AC'=> '-8.4',
'AG'=> '-7.8',
'AT'=> '-7.2',
'CA'=> '-8.5',
'CC'=> '-8.0',
'CG'=> '-10.6',
'CT'=> '-7.8',
'GA'=> '-8.2',
'GC'=> '-10.6',
'GG'=> '-8.0',
'GT'=> '-8.4',
'TA'=> '-7.2',
'TC'=> '-8.2',
'TG'=> '-8.5',
'TT'=> '-7.9'
);
# entropy values
my %array_s = ( 'AA'=> '-22.2',
'AC'=> '-22.4',
'AG'=> '-21.0',
'AT'=> '-20.4',
'CA'=> '-22.7',
'CC'=> '-19.9',
'CG'=> '-27.2',
'CT'=> '-21.0',
'GA'=> '-22.2',
'GC'=> '-27.2',
'GG'=> '-19.9',
'GT'=> '-22.4',
'TA'=> '-21.3',
'TC'=> '-22.2',
'TG'=> '-22.7',
'TT'=> '-22.2'
);
#correction for salt concentration
my $salt_effect= ($conc_salt/1000)+(($conc_mg/1000) * 140);
# effect on entropy
$s+=0.368 * ($c_len-1)* log($salt_effect);
# compute new H and s based on sequence.
for(my $i=0; $i<$c_len-1; $i++){
my $subc=substr($c,$i,2);
$h+=$array_h{$subc};
$s+=$array_s{$subc};
}
my $rlnk = ($conc_primer/2000000000);
my $r = log10($rlnk);
#equation to compute the Tm
my $tm=((1000*$h)/($s+(1.987*$r)))-273.15;
return $tm;
}
#function to take the log
sub log10 {
my $n = $_[0];
return log($n)/log(10);
}
my question is how can I make this efficient. I only am looking for the range that is Min and Max Tm or temperature values for a given iupac string. Here, what I am doing is I am computing Tm for every possible seqs of a given iupac to get the accurate range, but this is too slow and inefficient.. I am running totally out of memory and it becomes worse when iupac degeneracy increases. Is there a way I can make it efficient and still be able to get more or less an accurate range (min and max Tm values)?
I would appreciate any help/suggestions. Please let me know if I need to further explain anything.