Re: [OT]: How do I stop appending data: Dispatch translation

This is OT, but here's an approch to doing the actual translation by lookup rather than by a large, nested if...elsif in a subroutine. Should be faster. Some attempts at data validation are made, but no guarantee this is bulletproof. Uses state feature and the // operator, both from 5.10+; if you don't have 5.10, let me know and I'll make work-arounds.

use warnings;
use strict;
use feature qw(state);


use Test::More
  # tests => ?? + 1  # Test::NoWarnings add 1 test
    'no_plan'
    ;
use Test::NoWarnings;

use Data::Dump;


my   @codons = qw(/GC./  /TG[TC]/  /GA[TC]/  /GA[AG]/);
my   @aminos = qw(   A        C        D          E);

push @codons,  qw(/TT[TC]/  /GG./  /CA[TC]/  /AT[TCA]/);
push @aminos,  qw(     F       G        H          I);

push @codons,  qw(/AA[AG]/  /TT[AG]|CT./  /ATG/  /AA[TC]/);
push @aminos,  qw(     K              L      M        N);

push @codons,  qw(/CC./  /CA[AG]/  /CG.|AG[AG]/  /TC.|AG[TC]/);
push @aminos,  qw(   P        Q             R             S);

push @codons,  qw(/AC./  /GT./  /TGG/  /TA[TC]/  /TA[AG]|TGA/);
push @aminos,  qw(   T      V      W        Y              _);

die qq{lists of codons and aminos not the same length}
    if @codons != @aminos;


my %xlate;  # translate base triplets to amino

for (0 .. $#codons) {
    add_to_table(\%xlate, codons_to_amino($codons[$_], $aminos[$_])),
    }

# dd \%xlate;  # FOR DEBUG - see what we got


for my $ar_vector (
  #    base sequence                              expected protein
  [ qw(GTTACTGAGTGTGGTTGGCAGACTGCTGGTTGCCGTATGTAT VTECGWQTAGCRMY) ],
  [ qw(CTTTCTCATTGTGATGTTAAGGATTGGATGTGCTGGCTTCTG LSHCDVKDWMCWLL) ],
  [ qw(GCTTGGACTTGTGTTGAGATTGATGGTCATTTCTCTATGAAT AWTCVEIDGHFSMN) ],
  ) {

  my ($bases, $protein) = @$ar_vector;

  is encode(\%xlate, $bases), $protein, qq{'$bases' -> '$protein'};

  }


# subroutines ######################################################

sub encode {

    my ($hr_xlate,
        $bases,
        ) = @_;

    $bases = canonicalize($bases);

    die qq{number of bases not multiple of 3} if length($bases) % 3;

    # translate NON-overlapping base triplets to aminos.
    $bases =~ s{ (...) }{ $hr_xlate->{$1} // '*' }xmsge;

    return $bases;  # bases now translated to protein sequence

    }


# subroutines for building translation table.

sub canonicalize { return uc $_[0]; }


sub codons_to_amino {

    my ($codon_pat,  # pattern of codon(s)
        $amino,      # amino codon(s) map to
        ) = @_;

    $codon_pat = canonicalize($codon_pat);
    $amino     = canonicalize($amino);

    # translate regex to glob form.

    state $bases = canonicalize('ATCG');

    state $base  = qr{ [$bases]         }xms;
    state $any   = qr{ \.               }xms;
    state $some  = qr{ \[ $base{2,3} \] }xms;

    state $codon   = qr{ $base $base (?: $base | $any | $some) }xms;
    state $triplet = qr{ \A $base{3} \z }xms;

    state $any_glob = '{' . join(q{,}, split '', $bases) . '}';

    my @codon_groups = $codon_pat =~ m{ $codon }xmsg;

    for (@codon_groups) {
        s{  $any   }{$any_glob}xmsg;
        s{ ($some) }{ some_glob($1) }xmsge;
        }

    my @triplets = glob(qq{@codon_groups});

    for (@triplets) {
        die qq{'$_' not a triplet} if $_ !~ $triplet;
        }

    return $amino, @triplets;

    }


sub some_glob {

    my ($some_pat,  # 2-3 bases in regex class: [ATC] [AT]
        ) = @_;

    my $bases = join q{,}, $some_pat =~ m{ \w }xmsg;

    return qq{{$bases}};

    }


sub add_to_table {

    my ($hr_table,
        $amino,
        @triplets,
        ) = @_;

    for my $triplet (@triplets) {
        die qq{'$triplet' already in table}
            if exists $hr_table->{$triplet};
        $hr_table->{$triplet} = $amino;
        }

    }
[download]

Comment on Re: [OT]: How do I stop appending data: Dispatch translation Select or Download Code


Keep It Simple, Stupid
	PerlMonks