yuvraj_ghaly has asked for the wisdom of the Perl Monks concerning the following question:
I have created this script however I am not getting any output
#!/usr/bin/perl -w
# calculate pI.
# based on example from, and using code from, Fred Lindberg (lindberg@
+id.wustl.edu)
# this script came from http://www.id.wustl.edu/~lindberg/docs/program
+s/, but see
# http://www-nmr.cabm.rutgers.edu/bioinformatics/ZebaView/help.html
use strict;
my $file = shift || die "pI.pl <fasta file of sequences>;\n";
my $allowed="ACDEFGHIKLMNPQRSTVWY";# Supported amino acids
my @acids=('C','D','E','Y'); # Acidic for pI
my @bases=('H','K','R'); # Basic for pI
my $water=18.0152; # mol wt of H2O (added decimals, since
# it's multiplied by no_aa-1).
# molecular weights
my %aawt=('A', 89.09, 'C', 121.16, 'D', 133.10, 'E', 147.13, 'F', 165.
+19,
'G', 75.07, 'H', 155.16, 'I', 131.18, 'K', 146.19, 'L', 131.18,
'M',149.21, 'N', 132.12, 'P', 115.13, 'Q', 146.15, 'R', 174.20,
'S',105.09, 'T', 119.12, 'V', 117.15, 'W', 204.23, 'Y', 181.19)
+;
# pKa/pKbs (< & > are default NT & CT)
my %pka= ('C', 9.0, 'D', 4.05, 'E', 4.45, 'Y', 10.0,
'H', 5.98, 'K', 10.0, 'R', 12.0);
# NT pKa
my %nt= ('A', 7.59, 'E', 7.70, 'M', 7.00,
'P', 8.36, 'S', 6.93, 'T', 6.82, 'V', 7.44);
# CT pKa
my %ct= ('D', 4.55, 'E', 4.75);
# A280s (half cysteine for C)
my %A280 = ('W', 5690., 'Y', 1280., 'C', 120.);
my %protein;
{
open (IN, $file) || die "Can't open $file\n";
my $tag; my $seq;
while (<IN>) {
chomp;
if (/^>/) {
if ($seq) {$protein{$tag} = uc($seq); undef $seq}
$tag = $_;
}
else {$seq .= $_}
}
close IN;
}
open OUT, "> $file.txt" || die "Can't open > $file.txt for writing";
foreach my $proteinid (keys %protein) {
my $protein = $protein{$proteinid};
my $first = substr($protein,0,1); # get NT aa
my $last = substr($protein, -1,1); # get CT aa
my %base; my %acid;
$base{'start'}=1; # One amino terminus
$acid{'end'}=1; # One carboxy terminus
# NT different for some aa, 7.50 default
#if($nt{$first}) {$pka{'start'} = $nt{$first}} else {$pka{'start'} =
+ 7.50}
# CT different for some aa, 3.55 default
#if($ct{$last}) {$pka{'end'} = $ct{$last}} else {$pka{'end'} = 3.55}
# count the amino acid composition
my %residue;
foreach my $aa (split(//, $allowed)) {$residue{$aa} = ($protein =~ s
+/$aa//g)}
foreach my $aa (@acids) {
if($residue{$aa}) {
$acid{$aa} = $residue{$aa}; # collect acids
}
}
foreach my $aa (@bases) {
if($residue{$aa}) {
$base{$aa} = $residue{$aa}; # collect bases
}
}
# binary search for pI
my $hi=14; # Theoretical max
my $lo=0; # Theoretical min
my $pI=7;
my $old_pI=1;
my $iterations = 0;
while(abs($pI-$old_pI)>0.001) { # Two correct decimals
if($iterations++ > 15) { # 14/0.001 > 2^14, so this shouldn'
+t happen
print STDERR "Excessive iterations. Something's badly wrong\n";
last;
}
my $result=0;
foreach my $aa (keys(%acid)) { # Left (acid) side
unless ($pka{$aa}) {print STDERR "pka for $aa NOT FOUND\n"}
$result += $acid{$aa}/(1+10**($pka{$aa}-$pI));
}
foreach my $aa (keys(%base)) { # Right (base) side
$result -= $base{$aa}/(1+10**($pI-$pka{$aa}));
}
$old_pI = $pI;
if($result > 0){
$pI=($pI+$lo)/2; # Go lower since charge is neg
$hi=$old_pI;
} else {
$pI=($hi+$pI)/2; # Go higher; charge is pos
$lo=$old_pI;
}
}
print OUT "$proteinid\t$pI\n";
}
Re: Iso electric point calculation using perl
by hippo (Bishop) on Jul 19, 2013 at 11:04 UTC
|
If %protein is empty by the time you get to line 58, you won't get any output. Without seeing your data file, that looks eminently possible.
| [reply] [d/l] |
Re: Iso electric point calculation using perl
by Cristoforo (Curate) on Jul 19, 2013 at 23:44 UTC
|
One error I found was the last sequence wasn't being added to the %protein hash. You need to add it at the end of the while loop.
{
open (IN, $file) || die "Can't open $file\n";
my $tag; my $seq;
while (<IN>) {
chomp;
if (/^>/) {
if ($seq) {$protein{$tag} = uc($seq); undef $seq}
$tag = $_;
}
else {$seq .= $_}
}
$protein{$tag} = uc($seq); # add this line to get the last sequence
close IN;
}
With a few other adjustments, I was able to get the script to run. However, I don't know what changes need to be made to get accurate results.
With an inputfile of:
>chr1 RecName: Full=Putative transcription factor 001R;
MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQVECPKAPVEWNNPPS
EKGLIVGHFSGIKYKGEKAQASEVDVNKMCCWVSKFKDAMRRYQGIQTCKIPGKVLSDLD
AKIKAYNLTVEGVEGFVRYSRVTKQHVAAFLKELRHSKQYENVNLIHYILTDKRVDIQHL
EKDLVKDFKALVESAHRMRQGHMINVKYILYQLLKKHGHGPDGPDILTVKTGSKGVLYDD
SFRKIYTDLGWKFTPL
>chr2 RecName: Full=Uncharacterized protein 002L;
MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQTCASGFCTSQPLCAR
IKKTQVCGLRYSSKGKDPLVSAEWDSRGAPYVRCTYDADLIDTQAQVDQFVSMFGESPSL
AERYCMRGVKNTAGELVSRVSSDADPAGGWCRKWYSAHRGPDQDAALGSFCIKNPGAADC
KCINRASDPVYQKVKTLHAYPDQCWYVPCAADVGELKMGTQRDTPTNCPTQVCQIVFNML
DDGSVTMDDVKNTINCDFSKYVPPPPPPKPTPPTPPTPPTPPTPPTPPTPPTPRPVHNRK
VMFFVAGAVLVAILISTVRW
I got results from your program below
>chr1 RecName: Full=Putative transcription factor 001R; 9.425903320
+3125
>chr2 RecName: Full=Uncharacterized protein 002L; 8.2193603515625
These results vary from a program I wrote using Bio::Tools::pICalculator. The results I got from it were:
chr1 9.793558
chr2 7.791410
That program code was:
#!/usr/bin/perl
use strict;
use warnings;
use Bio::Tools::pICalculator;
use Bio::SeqIO;
@ARGV == 1 or die "Usage: perl $0 fasta_file_name\n";
my $in = Bio::SeqIO->new( -file => $ARGV[0] ,-format => 'Fasta' );
my $calc = Bio::Tools::pICalculator->new(-places => 6);
while ( my $seq = $in->next_seq ) {
$calc->seq($seq);
my $iep = $calc->iep;
print join("\t", $seq->id, $iep), "\n";
}
I got the code from the documentation for that module.
Chris | [reply] [d/l] [select] |
|
Thank tou chris. I have two queries:
First, what command have you input on console screen?
Second, thoe code for correction is still giving problem...this time it says "use of uninitialsed value at 104 & 107 line"...
| [reply] |
|
The command at the console was perl your_prog_name.pl fasta_input_file.fasta. I don't know what lines 104 and 107 are. Please put them here.
| [reply] [d/l] |
|
|
|
Re: Iso electric point calculation using perl
by erix (Prior) on Jul 19, 2013 at 23:09 UTC
|
Looking at the original code from that page, I conclude that it is the body of a subroutine, and not a standalone program.
So, wrap it in a subroutine (say, calculate_protein_details and call that subroutine with 2 arguments: a protein sequence $protein (the input), and an empty hash %results (the output, which whill be filled in by the subroutine).
#!/usr/bin/perl
use strict;
use warnings;
main();
exit;
sub main {
# I took this example sequence from uniprot.org
# from entry >sp|P02754|LACB_BOVIN Beta-lactoglobulin
my $protein = "MKCLLLALALTCGAQALIVTQTMKGLDIQKVAGTWYSLAMAASDISLLDAQSA
+PLRVYVEELKPTPEGDLEILLQKWENGECAQKKIIAEKTKIPAVFKIDALNENKVLVLDTDYKKYLLFC
+MENSAEPEQSLACQCLVRTPEVDDEALEKFDKALKALPMHIRLSFNPTQLEEQCHI";
my %results = ();
calculate_protein_details( $protein, \%results);
while (my ($k,$v) = each %results)
{
print $k, " => ", $v, "\n";
}
}
sub calculate_protein_details {
my ($protein, $hash_ref) = @_;
# copy here the code (except the first line) from
# http://www-nmr.cabm.rutgers.edu/bioinformatics/ZebaView/help.html
# (not your mangled code...)
}
Output:
extinction => 16960
pI => 4.93
MW_N15 => 20105.52
MW => 19883.00
MW_SeMet => 20117.50
MW_N15_C13 => 20986.05
( Don't complicate matters and solve one problem at a time. Reading a fasta file to get its sequence is a entirely separate thing )
UPDATED: restructured the code a bit | [reply] [d/l] [select] |
|
| [reply] |
|
No. I didn't look at your code. It's obviously too long and you said it didn't do what you wanted and I was only interested in the code on the rutgers.edu website so I made a program to wrap that into.
Just take the code I gave you, save it as a file, and copy the code from the rutgers.edu website into the calculate_protein_details sub that I gave you (except the first line (of the rutgers.edu code)). Then it just works. I tried it again, just now. If you want it to read the sequence from a fasta file, add that functionality afterwards, but that's a completely different 'problem'.
Remember: when things don't work you have to simplify till you understand why it doesn't.
| [reply] [d/l] |
|
| [reply] |
Re: Iso electric point calculation using perl
by mtmcc (Hermit) on Jul 19, 2013 at 11:59 UTC
|
| [reply] [d/l] [select] |
|
| [reply] [d/l] |
|
My mistake! Thanks for pointing that out!
| [reply] |
Re: Iso electric point calculation using perl
by Laurent_R (Canon) on Jul 19, 2013 at 21:34 UTC
|
# calculate pI.
Hmmm, have you considered this:
print "pI = ", (4 * atan2 1, 1), "\n";
or, even simpler:
print "pI = ", 3.14159265, "\n"
?
OK, I've tried to make some fun, I am not sure that this was really funny, but, frankly, my point is that you are posting a 120-line script obviously highly specialized in biology without giving any explanation of what it is supposed to do, just saying that you don't get any output. I am actually almost surprised that you found some people able to give an answer.
I did some study in biology, but last time I had a course in biology and related sciences was in my first year of medecine study back in 1973-1974, after which I decided to give up medecine (because I found that I did not like it, I actually passed my first year exams, but did not see myself going through that for still many years) and to go for maths and physics, which led me a few years later to software engineering and IT science. This is just to say that I followed some university courses in biology, biochemistry, biophysics, genetics, etc., so that I am not a complete ignoramus on the subject, but sufficiently long ago to have only very vague remembrance (as far as genetics is concerned, the knowledge in this field has so much progressed over the last 40 years that, even if I remembered today everything that I was supposed to learn at the time, my knowledge would probably be far lower than what a high school student is supposed to know today).
In brief, I have no clue on what your program is doing, I don't understand anything, and I think most people here are in more or less the same situation. It would be nice to give some explanations on what you expect.
BTW, just a small slightly off-topic question. From reading through your code (you see... I made an effort), I got the understanding that pI is required to be between 0 and 14, and that sounds like min and max values of pH in chemistry. My gut feeling is that this is extremely unlikely to be a coincidence, so, what is the relation between pH and pI?
| [reply] [d/l] [select] |
|
See As the caption of my talk is to calculate Iso electric point of protein sequences which you havenot understood properly.
I have a Fasta file containing numerous protein sequences in Fasta format. You can check NCBI homepage for that. www.ncbi.nlm.nih.gov. My task is to calculate Isoelectric point of each protein sequences. The script I composed is not helping me. I'm not a Perl expert so I commited some mistake. I really dont know what. I keep on trying but I'm keep messing around. I need expert advice.
| [reply] |
|
| [reply] |
|
|
|
|
|
|
|
Re: Iso electric point calculation using perl
by mtmcc (Hermit) on Jul 19, 2013 at 12:56 UTC
|
If I were you, I'd email the author (lindberg@id.wustl.edu) and ask him about his script. I don't know a whole lot about isoelectric points, but this seems to work:
#!/usr/bin/perl -w
# calculate pI.
# based on example from, and using code from, Fred Lindberg (lindberg@
+id.wustl.edu)
# this script came from http://www.id.wustl.edu/~lindberg/docs/program
+s/, but see
# http://www-nmr.cabm.rutgers.edu/bioinformatics/ZebaView/help.html
use strict;
use warnings;
my $file = shift || die "pI.pl <fasta file of sequences>;\n";
print STDERR "start\n";
my $allowed="ACDEFGHIKLMNPQRSTVWY";# Supported amino acids
my @acids=('C','D','E','Y'); # Acidic for pI
my @bases=('H','K','R'); # Basic for pI
my $water=18.0152; # mol wt of H2O (added decimals, since
# it's multiplied by no_aa-1).
# molecular weights
my %aawt=('A', 89.09, 'C', 121.16, 'D', 133.10, 'E', 147.13, 'F', 165.
+19,
'G', 75.07, 'H', 155.16, 'I', 131.18, 'K', 146.19, 'L', 131.18,
'M',149.21, 'N', 132.12, 'P', 115.13, 'Q', 146.15, 'R', 174.20,
'S',105.09, 'T', 119.12, 'V', 117.15, 'W', 204.23, 'Y', 181.19)
+;
# pKa/pKbs (< & > are default NT & CT)
my %pka= ('C', 9.0, 'D', 4.05, 'E', 4.45, 'Y', 10.0,
'H', 5.98, 'K', 10.0, 'R', 12.0);
# NT pKa
my %nt= ('A', 7.59, 'E', 7.70, 'M', 7.00,
'P', 8.36, 'S', 6.93, 'T', 6.82, 'V', 7.44);
# CT pKa
my %ct= ('D', 4.55, 'E', 4.75);
# A280s (half cysteine for C)
my %A280 = ('W', 5690., 'Y', 1280., 'C', 120.);
my @currentProtein;
my %protein;
{
open (IN, $file) || die "Can't open $file\n";
my $tag; my $seq;
while (<IN>) {
#print STDERR "$_";
chomp;
if (($_ !~ /^>/) && ($_ =~ /\w/))
{
push (@currentProtein, $_);
}
if (/^>/)
{
$tag = $_;
}
if ((($_ !~ /\w/) || (eof)) && (@currentProtein > 0))
{
$seq = join("", @currentProtein);
$protein{$tag} = uc($seq);
@currentProtein = ();
}
}
close IN;
}
#open OUT, "> $file.txt" || die "Can't open > $file.txt for writing";
my $counter = keys %protein;
foreach my $proteinid (keys %protein) {
my $protein = $protein{$proteinid};
my $first = substr($protein,0,1); # get NT aa
my $last = substr($protein, -1,1); # get CT aa
my %base; my %acid;
$base{'start'}=1; # One amino terminus
$acid{'end'}=1; # One carboxy terminus
# NT different for some aa, 7.50 default
if($nt{$first}) {$pka{'start'} = $nt{$first}} else {$pka{'start'} =7
+.50}
# CT different for some aa, 3.55 default
if($ct{$last}) {$pka{'end'} = $ct{$last}} else {$pka{'end'} = 3.55}
# count the amino acid composition
my %residue;
foreach my $aa (split('', $allowed)) {$residue{$aa} = ($protein =~ s
+/$aa//g)}
foreach my $aa (@acids) {
if($residue{$aa}) {
$acid{$aa} = $residue{$aa}; # collect acids
}
}
foreach my $aa (@bases) {
if($residue{$aa}) {
$base{$aa} = $residue{$aa}; # collect bases
}
}
# binary search for pI
my $hi=14; # Theoretical max
my $lo=0; # Theoretical min
my $pI=7;
my $old_pI=1;
my $iterations = 0;
while(abs($pI-$old_pI)>0.001) { # Two correct decimals
if($iterations++ > 15) { # 14/0.001 > 2^14, so this shouldn'
+t happen
print STDERR "Excessive iterations. Something's badly wrong\n";
last;
}
my $result=0;
foreach my $aa (keys(%acid)) { # Left (acid) side
unless ($pka{$aa}) {print STDERR "pka for $aa NOT FOUND\n"}
$result += $acid{$aa}/(1+10**($pka{$aa}-$pI));
}
foreach my $aa (keys(%base)) { # Right (base) side
$result -= $base{$aa}/(1+10**($pI-$pka{$aa}));
}
$old_pI = $pI;
if($result > 0){
$pI=($pI+$lo)/2; # Go lower since charge is neg
$hi=$old_pI;
} else {
$pI=($hi+$pI)/2; # Go higher; charge is pos
$lo=$old_pI;
}
}
print STDERR "$proteinid\t$pI\n";
}
| [reply] [d/l] |
|
your code is running but here is a error i have found in your code. The output it is showing contains pI of first sequence in first place however it eliminated the pI of last sequence and after first sequence result it shows the pI of second last sequence then then showed pI result in descending manner.
this is my sample.fasta containing sequences
>gi|226694487|sp|Q1DF98.2|ATPF_MYXXD RecName: Full=ATP synthase subuni
+t b; AltName: Full=ATP synthase F(0) sector subunit b; AltName: Full=
+ATPase subunit I; AltName: Full=F-type ATPase subunit b; Short=F-ATPa
+se subunit b
MFLPSVLAASNLVKVQPGLIFWTLVTFVIAAVVLKWKAWGPILSLVEEREKQIASSIESAKRERAEAEKL
LADQKTAIAEARREAAEMMRRNTQEMEKFREELMAKSRKEAEELKLSARREIDEQKAKAIAEVRSMAVDL
AMEVAGKLISERMDDSKQRALAEQFVQGLPLNSTSATGAVRRTA<br><br>
>gi|172046103|sp|Q1D4N0.2|LIPA_MYXXD RecName: Full=Lipoyl synthase; Al
+tName: Full=Lip-syn; Short=LS; AltName: Full=Lipoate synthase; AltNam
+e: Full=Lipoic acid synthase; AltName: Full=Sulfur insertion protein
+LipA
MTETTRKPEWLKVRLPHGEGYERVKAIVKRTKLATVCEEARCPNIAECWGGGTATVMLMGEVCTRACRFC
HVKVGAPPPLDPMEPIHLAQAVKEMDLEYIVVTSVNRDDRPDGGASHFASAIRELRRESPRTIVEVLIPD
FKGVEKDLTTVAEAKPHVVAHNVETVERLTPTVRDRRAKYHQSLRVLEYLKNRPEGLYTKTSVMVGLGET
DAELEQTFKDLRDVGVDVLTLGQYLQPSQYHLRVERFVTPAQFEAYKTLAESYGFLYVASGPLVRSSYRA
AEFFMKGLMERERLERLG<br><br>
>gi|123374798|sp|Q1DDB3.1|KDSB_MYXXD RecName: Full=3-deoxy-manno-octul
+osonate cytidylyltransferase; AltName: Full=CMP-2-keto-3-deoxyoctulos
+onic acid synthase; Short=CKS; Short=CMP-KDO synthase
MQSCRTVAVIPARHASTRFPGKPLAIIAGRTMIEHVWRRCQEAQAFDEVWVATDDDRIRAAVEGFGGKAV
MTSPACATGTDRVAEVALGRPDIDIWVNVQGDEPLVDPATLQRLAGLFQDASVRMGTLVRPLEADEAASP
HVVKAVLALNGDALYFSRSLVPHVREPGTPVQRWGHIGLYGYRREVLLSLAKLAPTPLEDAEKLEQLRAL
EHGIPIRCAKVTSHTVAVDLPGDVEKVEALMRARGG<br><br>
>gi|123374766|sp|Q1DCG7.1|FMT_MYXXD RecName: Full=Methionyl-tRNA formy
+ltransferase
MSRPRIVFMGTPEFAVSSLAACFELGDVVAVVTQPDKPKGRGNTVTAPPVKELALSRGVPVLQPTKLRTP
PFAEELRQYAPDVCVVTAYGRILPKDLLELPTHGCVNVHGSLLPRFRGAAPIQWAIAHGDTETGVSLMVM
DEGLDTGPVLAMKRMAIAPDETSASLYPKLAALGGEVLREFLPAYLSGELKPVPQPSEGMVLAPIIEKDQ
GRLDFTKPAVELERRLRAFTPWPGAFTTLGGKLLKVHRAQARGGSGAPGTVLASGPDGIEVACGEGSLVL
LDLQPEGKRVMRAADFLQGHKLAPGSQPFVAG<br><br>
>gi|123374695|sp|Q1DAN7.1|SSRP_MYXXD RecName: Full=SsrA-binding protei
+n
MTSGGKSKGVGSEPGVRVIAENRRARFDYTVDEKVEAGLALTGSEVKSLRDGIANLSDAYALPKGDELFL
LNANIGSYKAASFFDHLPTRGRKLLMHRGEIDRWTAKVRERGYSIIPLVLYFRNGRAKVELGLCRGKTHE
DRRHDIKERETKREMDRAMRRR<br><br>
>gi|123374693|sp|Q1DAM1.1|PNP_MYXXD RecName: Full=Polyribonucleotide n
+ucleotidyltransferase; AltName: Full=Polynucleotide phosphorylase; Sh
+ort=PNPase
MLKKSVKIGESELSIEVGRLAKQADGSVVVRYGDTMLLVTAVSAREKKDIDFLPLTVEYQEKLYSAGRIP
GSYFKREGRLTEKETLASRLVDRSCRPLFPEGYAYETQIIASVISSDPENEGDIHGITGASAALWVSDIP
FDGPIAGIRVGRVGGQLVANPTAKQREQSDLDLVMAVSRKAIVMVEGGAEEVSEADMVAALDFGFTTAQP
ALDLQDELRRELNKQVRSFEKPAAVDEGLRAKVRELAMDGIKAGYGIKEKGARYEALGKTKKEALAKLKE
QLGDGYTPLVEKHAKAVVEDLKYEHMREMTVNGGRIGDRGHDVVRSITCEVGVLPRTHGSAVFTRGETQA
LVVTTLGTSDDEQRLEMLGGMAFKRFMLHYNFPPFSVNETKPLRGPGRREVGHGALAERALRNMVPKSES
FPYTVRLVSDILESNGSSSMASVCGGTLALMDAGVPLKAPVAGIAMGLVKEGDKIAILSDILGDEDHLGD
MDFKVCGTSKGITSIQMDIKITGLTTEIMSRALEQARQGRLHILGEMLKTLAESRKEISQYAPRITTIQI
RPEFIKNVIGPGGKVIKDIIARTGAAINIEDSGRVDIASANGEAVKAAIAMIQALTREAEIGKIYTGTVR
KIAEFGAFVELFPGTDGLIHISELSDKRVKSVSDVLNEGDEVLVKVVSIDKTGKIRLSRKEAMAERAAQQ
GAAAGEAAAQPAPAPTQPDAKA
this is the output it showed using your code
<code>>gi|226694487|sp|Q1DF98.2|ATPF_MYXXD RecName: Full=ATP synthase subunit b; AltName: Full=ATP synthase F(0) sector subunit b; AltName: Full=ATPase subunit I; AltName: Full=F-type ATPase subunit b; Short=F-ATPase subunit b 10.9674072265625
>gi|123374695|sp|Q1DAN7.1|SSRP_MYXXD RecName: Full=SsrA-binding protein 9.0909423828125
>gi|123374766|sp|Q1DCG7.1|FMT_MYXXD RecName: Full=Methionyl-tRNA formyltransferase 7.8262939453125
>gi|123374798|sp|Q1DDB3.1|KDSB_MYXXD RecName: Full=3-deoxy-manno-octulosonate cytidylyltransferase; AltName: Full=CMP-2-keto-3-deoxyoctulosonic acid synthase; Short=CKS; Short=CMP-KDO synthase 4.8543701171875
>gi|172046103|sp|Q1D4N0.2|LIPA_MYXXD RecName: Full=Lipoyl synthase; AltName: Full=Lip-syn; Short=LS; AltName: Full=Lipoate synthase; AltName: Full=Lipoic acid synthase; AltName: Full=Sulfur insertion protein LipA 8.4586181640625
| [reply] [d/l] |
|
#!/usr/bin/perl -w
# calculate pI.
# based on example from, and using code from, Fred Lindberg (lindberg@
+id.wustl.edu)
# this script came from http://www.id.wustl.edu/~lindberg/docs/program
+s/, but see
# http://www-nmr.cabm.rutgers.edu/bioinformatics/ZebaView/help.html
use strict;
use warnings;
my $file = shift || die "pI.pl <fasta file of sequences>;\n";
print STDERR "start\n";
my $allowed="ACDEFGHIKLMNPQRSTVWY";# Supported amino acids
my @acids=('C','D','E','Y'); # Acidic for pI
my @bases=('H','K','R'); # Basic for pI
my $water=18.0152; # mol wt of H2O (added decimals, since
# it's multiplied by no_aa-1).
# molecular weights
my %aawt=('A', 89.09, 'C', 121.16, 'D', 133.10, 'E', 147.13, 'F', 165.
+19,
'G', 75.07, 'H', 155.16, 'I', 131.18, 'K', 146.19, 'L', 131.18,
'M',149.21, 'N', 132.12, 'P', 115.13, 'Q', 146.15, 'R', 174.20,
'S',105.09, 'T', 119.12, 'V', 117.15, 'W', 204.23, 'Y', 181.19)
+;
# pKa/pKbs (< & > are default NT & CT)
my %pka= ('C', 9.0, 'D', 4.05, 'E', 4.45, 'Y', 10.0,
'H', 5.98, 'K', 10.0, 'R', 12.0);
# NT pKa
my %nt= ('A', 7.59, 'E', 7.70, 'M', 7.00,
'P', 8.36, 'S', 6.93, 'T', 6.82, 'V', 7.44);
# CT pKa
my %ct= ('D', 4.55, 'E', 4.75);
# A280s (half cysteine for C)
my %A280 = ('W', 5690., 'Y', 1280., 'C', 120.);
my @currentProtein;
my %protein;
my $fastaLine;
my $protSequence;
my @fastaTag;
my @tagOrder;
open (FASTA, $file) || die "Can't open $file\n";
my $tag; my $seq;
while (<FASTA>)
{
#print STDERR "CURRENT: $_";
chomp;
if (($_ !~ /^>/) && ($_ =~ /\w/))
{
push (@currentProtein, $_);
}
if ((/^>/) || (eof))
{
if ((@currentProtein > 0) || (eof))
{
$protSequence = join("", @currentProtein);
$protSequence =~ s/ //g;
@fastaTag = split(" ", $fastaLine);
$protein{$fastaTag[0]} = $protSequence;
push (@tagOrder, $fastaTag[0]);
@currentProtein = ();
}
$fastaLine = $_ if $_ =~ /\w/;
}
}
close FASTA;
#open OUT, "> $file.txt" || die "Can't open > $file.txt for writing";
my $counter = keys %protein;
for (@tagOrder)
{
#print STDERR "ID: $_\t SEQUENCE:$protein{$_}\n"; <--uncomment t
+o show tags/sequences;
my $protein = $protein{$_};
my $first = substr($protein,0,1); # get NT aa
my $last = substr($protein, -1,1); # get CT aa
my %base; my %acid;
$base{'start'}=1; # One amino terminus
$acid{'end'}=1; # One carboxy terminus
# NT different for some aa, 7.50 default
if($nt{$first}) {$pka{'start'} = $nt{$first}} else {$pka{'start'
+} =7.50}
# CT different for some aa, 3.55 default
if($ct{$last}) {$pka{'end'} = $ct{$last}} else {$pka{'end'} = 3.
+55}
# count the amino acid composition
my %residue;
foreach my $aa (split('', $allowed)) {$residue{$aa} = ($protein
+=~ s/$aa//g)}
foreach my $aa (@acids) {
if($residue{$aa}) {
$acid{$aa} = $residue{$aa}; # collect acids
}
}
foreach my $aa (@bases) {
if($residue{$aa}) {
$base{$aa} = $residue{$aa}; # collect bases
}
}
# binary search for pI
my $hi=14; # Theoretical max
my $lo=0; # Theoretical min
my $pI=7;
my $old_pI=1;
my $iterations = 0;
while(abs($pI-$old_pI)>0.001) { # Two correct decimals
if($iterations++ > 15) { # 14/0.001 > 2^14, so this shou
+ldn't happen
print STDERR "Excessive iterations. Something's badly wrong\
+n";
last;
}
my $result=0;
foreach my $aa (keys(%acid)) { # Left (acid) side
unless ($pka{$aa}) {print STDERR "pka for $aa NOT FOUND\n"}
$result += $acid{$aa}/(1+10**($pka{$aa}-$pI));
}
foreach my $aa (keys(%base)) { # Right (base) side
$result -= $base{$aa}/(1+10**($pI-$pka{$aa}));
}
$old_pI = $pI;
if($result > 0){
$pI=($pI+$lo)/2; # Go lower since charge is neg
$hi=$old_pI;
} else {
$pI=($hi+$pI)/2; # Go higher; charge is pos
$lo=$old_pI;
}
}
print STDERR "$_\t$pI\n";
}
| [reply] [d/l] |
|
|
|
|
Re: Iso electric point calculation using perl
by Anonymous Monk on Jul 19, 2013 at 12:12 UTC
|
I have created copy/pasted this script without learning how it's supposed to work, I am not getting any output
FTFY | [reply] |
|
|