This is OT, but here's an approch to doing the actual translation by lookup rather than by a large, nested if...elsif in a subroutine. Should be faster. Some attempts at data validation are made, but no guarantee this is bulletproof. Uses state feature and the // operator, both from 5.10+; if you don't have 5.10, let me know and I'll make work-arounds.
use warnings;
use strict;
use feature qw(state);
use Test::More
# tests => ?? + 1 # Test::NoWarnings add 1 test
'no_plan'
;
use Test::NoWarnings;
use Data::Dump;
my @codons = qw(/GC./ /TG[TC]/ /GA[TC]/ /GA[AG]/);
my @aminos = qw( A C D E);
push @codons, qw(/TT[TC]/ /GG./ /CA[TC]/ /AT[TCA]/);
push @aminos, qw( F G H I);
push @codons, qw(/AA[AG]/ /TT[AG]|CT./ /ATG/ /AA[TC]/);
push @aminos, qw( K L M N);
push @codons, qw(/CC./ /CA[AG]/ /CG.|AG[AG]/ /TC.|AG[TC]/);
push @aminos, qw( P Q R S);
push @codons, qw(/AC./ /GT./ /TGG/ /TA[TC]/ /TA[AG]|TGA/);
push @aminos, qw( T V W Y _);
die qq{lists of codons and aminos not the same length}
if @codons != @aminos;
my %xlate; # translate base triplets to amino
for (0 .. $#codons) {
add_to_table(\%xlate, codons_to_amino($codons[$_], $aminos[$_])),
}
# dd \%xlate; # FOR DEBUG - see what we got
for my $ar_vector (
# base sequence expected protein
[ qw(GTTACTGAGTGTGGTTGGCAGACTGCTGGTTGCCGTATGTAT VTECGWQTAGCRMY) ],
[ qw(CTTTCTCATTGTGATGTTAAGGATTGGATGTGCTGGCTTCTG LSHCDVKDWMCWLL) ],
[ qw(GCTTGGACTTGTGTTGAGATTGATGGTCATTTCTCTATGAAT AWTCVEIDGHFSMN) ],
) {
my ($bases, $protein) = @$ar_vector;
is encode(\%xlate, $bases), $protein, qq{'$bases' -> '$protein'};
}
# subroutines ######################################################
sub encode {
my ($hr_xlate,
$bases,
) = @_;
$bases = canonicalize($bases);
die qq{number of bases not multiple of 3} if length($bases) % 3;
# translate NON-overlapping base triplets to aminos.
$bases =~ s{ (...) }{ $hr_xlate->{$1} // '*' }xmsge;
return $bases; # bases now translated to protein sequence
}
# subroutines for building translation table.
sub canonicalize { return uc $_[0]; }
sub codons_to_amino {
my ($codon_pat, # pattern of codon(s)
$amino, # amino codon(s) map to
) = @_;
$codon_pat = canonicalize($codon_pat);
$amino = canonicalize($amino);
# translate regex to glob form.
state $bases = canonicalize('ATCG');
state $base = qr{ [$bases] }xms;
state $any = qr{ \. }xms;
state $some = qr{ \[ $base{2,3} \] }xms;
state $codon = qr{ $base $base (?: $base | $any | $some) }xms;
state $triplet = qr{ \A $base{3} \z }xms;
state $any_glob = '{' . join(q{,}, split '', $bases) . '}';
my @codon_groups = $codon_pat =~ m{ $codon }xmsg;
for (@codon_groups) {
s{ $any }{$any_glob}xmsg;
s{ ($some) }{ some_glob($1) }xmsge;
}
my @triplets = glob(qq{@codon_groups});
for (@triplets) {
die qq{'$_' not a triplet} if $_ !~ $triplet;
}
return $amino, @triplets;
}
sub some_glob {
my ($some_pat, # 2-3 bases in regex class: [ATC] [AT]
) = @_;
my $bases = join q{,}, $some_pat =~ m{ \w }xmsg;
return qq{{$bases}};
}
sub add_to_table {
my ($hr_table,
$amino,
@triplets,
) = @_;
for my $triplet (@triplets) {
die qq{'$triplet' already in table}
if exists $hr_table->{$triplet};
$hr_table->{$triplet} = $amino;
}
}