#!/usr/bin/perl
use strict;
use warnings;
# Summarise the amino-acid composition of every FASTA record found in DATA.
#
# For each record the script prints the per-residue counts, the size of
# the amino-acid alphabet actually used and the sequence length.  It then
# lists the record(s) using the fewest distinct amino acids, all records
# in ascending order of alphabet size, and, for each standard amino acid,
# how many records do not contain it.

my @count;     # one [ alphabet size, record name ] pair per record
my %absent;    # amino acid => number of records that lack it

# Collect records line by line.  A record starts at every '>' header, so
# records are kept apart whether or not a blank line separates them (the
# paragraph-mode read would merge such records and count the second
# header's capital letters as residues).
my @records;    # [ name, raw sequence text ] in input order
while ( my $line = <DATA> ) {
    $line =~ s/\r?\n\z//;    # drop the line ending, tolerating CRLF input

    if ( $line =~ /^>(.*)/ ) {
        push @records, [ $1, '' ];
        next;
    }

    next if $line =~ /^\s*#/;    # comment line
    next unless @records;        # text before the first header

    $records[-1][1] .= $line;
}

for my $record (@records) {
    my ( $name, $seq ) = @$record;
    my ( $count_of, $len ) = tally_residues($seq);
    my $num = scalar keys %$count_of;

    push @count, [ $num, $name ];

    printf "Counted %d for %s ..\n", $num, substr( $name, 0, 50 );
    print "$name\n";
    print join( ' ', map { $_ . '=' . $count_of->{$_} } sort keys %$count_of ),
        "\n";
    printf "Amino acid alphabet = %d\n", $num;

    # The blank line closes the record, so it follows the last field.
    print "Sequence length = $len\n\n";

    # Only real amino-acid codes can be "absent"; letters such as B, J,
    # O, U, X and Z are never counted and must not be reported.
    for my $aa ( standard_amino_acids() ) {
        ++$absent{$aa} unless exists $count_of->{$aa};
    }
}

if (@count) {

    # Sort by alphabet size, ascending, so the smallest comes first.
    my @sorted = sort { $a->[0] <=> $b->[0] } @count;
    my $lowest = $sorted[0]->[0];

    # More than one record may share the lowest value.
    printf "Least number of amino acids is %d in these entries\n", $lowest;
    my @smallest = grep { $_->[0] == $lowest } @sorted;
    print "$_->[1]\n" for @smallest;

    print "\nAll results in ascending count\n";
    for my $entry (@sorted) {
        printf "%d %s\n", @$entry;
    }

    print "\nExclusion of various amino acids is as follows\n";
    for my $aa ( sort keys %absent ) {
        printf "%s=%d\n", $aa, $absent{$aa};
    }
}
else {
    # Without this guard the summary would index into an empty list.
    print "No FASTA records found\n";
}

# Return the 20 standard one-letter amino-acid codes.
sub standard_amino_acids {
    return qw(A C D E F G H I K L M N P Q R S T V W Y);
}

# Count the standard amino-acid letters in a sequence string.
#
# Takes the raw sequence text, which may contain whitespace, newlines or
# other characters; everything except the 20 upper-case standard codes is
# ignored.  Returns a two-element list: a reference to a hash mapping
# each residue present to its number of occurrences, and the sequence
# length, i.e. the total number of residues counted.
sub tally_residues {
    my ($seq) = @_;

    my $letters = join '', standard_amino_acids();
    my %count_of;
    my $length = 0;

    # Match instead of substituting: the text is left untouched and the
    # length is simply the number of residues seen.
    while ( $seq =~ /([$letters])/g ) {
        ++$count_of{$1};
        ++$length;
    }

    return ( \%count_of, $length );
}
__DATA__
>sp_0005_SySynthetic ConstructTumor protein p53 N-terminal transcription-activation domain
VQLQESGGGLVQAGGSLRLSCAASGRAVSMYNMGWFRQAPGQERELVAAISRGGSIYYA
DSVKGRFTISRDNAKNTLYLQMNNLKPEDTGVYQCRQGSTLGQGTQVTVSS
>sp_0017_CaCamelidSorghum bicolor multidrug and toxic compound extrusion sbmate
HVQLVESGGGSVQAGGSLRLTCAASGFTFSNYYMSWVRQAPGKGLEWVSSIYSVGSNGYY
ADSVKGRSTISRDNAKNTLYLQMNSLKPEDTAVYYCAAEPGGSWWDAYSYWGQGTQVTVS S
which prints:
Counted 19 for sp_0005_SySynthetic ConstructTumor protein p53 N-t ..
sp_0005_SySynthetic ConstructTumor protein p53 N-terminal transcriptio
+n-activation domain
A=9 C=2 D=3 E=4 F=2 G=15 I=3 K=3 L=9 M=3 N=5 P=2 Q=10 R=8 S=12 T=6 V=8
+ W=1 Y=5
Amino acid alphabet = 19
Sequence length = 124
Counted 20 for sp_0017_CaCamelidSorghum bicolor multidrug and tox ..
sp_0017_CaCamelidSorghum bicolor multidrug and toxic compound extrusio
+n sbmate
A=10 C=2 D=4 E=4 F=2 G=15 H=1 I=2 K=4 L=7 M=2 N=5 P=3 Q=6 R=4 S=18 T=7
+ V=10 W=5 Y=10
Amino acid alphabet = 20
Sequence length = 143
Least number of amino acids is 19 in these entries
sp_0005_SySynthetic ConstructTumor protein p53 N-terminal transcriptio
+n-activation domain
All results in ascending count
19 sp_0005_SySynthetic ConstructTumor protein p53 N-terminal transcri
+ption-activation domain
20 sp_0017_CaCamelidSorghum bicolor multidrug and toxic compound extr
+usion sbmate
Exclusion of various amino acids is as follows
B=2
H=1
J=2
O=2
U=2
X=2
Z=2
The output above includes "Sequence length = 124" and "Sequence length = 143". Is there a problem with your input data, such as line endings that aren't what you expect?
Optimising for fewest key strokes only makes sense transmitting to Pluto or beyond
|