Beefy Boxes and Bandwidth Generously Provided by pair Networks
XP is just a number
 
PerlMonks  

Comment on

( #3333=superdoc: print w/replies, xml ) Need Help??
#!/usr/bin/perl # Module and script to build a desired amino acid # sequence given a collection of nucleotide snippets # Author: Runrig (http://www.perlmonks.org) <dougw@cpan.org> # Copyright: This is free software. You can redistribute # and/or modify it under the same terms as perl itself. package DNA::Stitch; use strict; use warnings; # Table of nucleotide sequences and their corresponding # amino acid. my %amino = qw( GCT A GCC A GCA A GCG A TGT C TGC C GAT D GAC D GAA E GAG E TTT F TTC F GGT G GGC G GGA G GGG G CAT H CAC H ATT I ATC I ATA I AAA K AAG K TTA L TTG L CTT L CTA L CTG L CTC L ATG M AAT N AAC + N CCT P CCC P CCA P CCG P CAA Q CAG Q CGC R CGT R CGA R CGG R AGA R AGG R TCT S TCC S TCA S TCG S AGT S AGC S ACT T ACC T ACA T ACG T GTT V GTC V GTA V GTG V TGG W TAT Y TAC Y ); # For better performance, we could cache the following # globals (with Storable or by just Dumping them here # as hardcoded) rather than recalculating them every # time. # amino_st: For each one or two nucleotides which start # an amino acid, create a character class of which amino acids # those nucleotides can start. # first: For each amino acid, create an array of what # nucleotides can start the amino acid. # suffix: For each amino acid, create an array of what # the next nucleotide can be given the first one or two nucleotides. my (%amino_st, %first, %suffix); while ( my ($nuke, $amino) = each %amino) { my ($one, $two, $three) = split '', $nuke; undef $amino_st{$one}{$amino}; undef $amino_st{$one . $two}{$amino}; undef $first{$amino}{$one}; undef $suffix{$amino}{$one}{$two . $three}; undef $suffix{$amino}{$one . $two}{$three}; } # Regex to match invalid amino acid sequences my $invalid_amino_seq = '[^' . join('', keys %first) . ']'; $invalid_amino_seq = qr/$invalid_amino_seq/; # Collapse keys of hash ref to a simple array ref $first{$_} = [ keys %{$first{$_}} ] for keys %first; $amino_st{$_} = join '', keys %{$amino_st{$_}} for keys %amino_st; # Create a character class if there's more than one amino acid for (values %amino_st) { $_ = "[$_]" if length > 1; } # Given the first one one or two nucleotides of an amino acid, # create a regex to match the rest of the nucleotides. my %match; for my $amino (keys %suffix) { for my $prefix (keys %{$suffix{$amino}}) { my @suffix = keys %{$suffix{$amino}{$prefix}}; my %uniq; undef @uniq{map substr($_, 0, 1), @suffix}; $suffix{$amino}{$prefix} = [ keys %uniq ]; # Just join all clauses with 'or', because trying to use # character classes here only benefits the last few # clauses of a long 'or' string my $match = '^(?:' . join("|", @suffix) . ')$'; $match{$amino}{$prefix} = qr/$match/; } } # Find what amino acid sequences can match a given nucleotide # sequence for a given collection of nucleotide 'snippets'. # If you often use the same collection, you could safely cache # it with Storable. sub new { my $class = shift; my @library; for my $snip (@_) { die "Snippets must have 2 or more nucleotides" unless length($snip) >= 2; die "Bad nucleotide sequence '$snip'" if $snip =~ /[^ACGT]/; my $nuc_seq = $snip; my $prefix = ''; # First assume the amino acid starts with the first nucleotide # in a sequence. Then shift off the first nucleotide and assume # that the previous amino acid ends with the shifted nucleotide # and the amino acid sequence starts with the resulting # sequence. Repeat for the next nucleotide. SHIFT_SEQ: for my $i (0..2) { my $amino_seq = ''; while ($nuc_seq =~ /(...)/g) { next SHIFT_SEQ unless exists $amino{$1}; $amino_seq .= $amino{$1}; } my $suffix = ''; my $amino_len = length($amino_seq); if (my $suffix_len = length($nuc_seq) % 3) { $suffix = substr($nuc_seq, -$suffix_len); next SHIFT_SEQ unless exists $amino_st{$suffix}; $amino_seq .= $amino_st{$suffix}; } push @{ $library[$i] }, { SNIP=>$snip, PREFIX=>$prefix, $amino_seq ? (MATCH=>qr/^$amino_seq/) : (), SUFFIX=>$suffix, LENGTH=>$amino_len, }; } continue { $prefix .= substr($nuc_seq, 0, 1, '') if $i < 2} } bless \@library, $class; } # Build a target amino acid sequence given a collection of # nucleotide sequences, how many times we are allowed to 'glue' # them together, and how many extra nucleotides we are allowed # to just 'splice' on. sub build { my $lib = shift; die "Target is empty" if !defined($_[0]) or $_[0] eq ''; die "Bad amino acid in target" if $_[0] =~ $invalid_amino_seq; die "Glue must be integer" if defined($_[1]) and $_[1] =~ /\D/; die "Splices must be integer" if defined($_[2]) and $_[2] =~ /\D/; my $target = shift; my $glue = shift || 0; my $splices = shift || 0; $lib->_build($target, $glue, $splices, '', ''); } # Build the target amino acid, but accept additional parameters # which indicate what amino acid occurs before the target sequence # which we haven't completed yet, and what nucleotides we have # so far that begin that amino acid. sub _build { my ($lib, $target, $glue, $splices, $prefix_amino, $prefix_nuc) = @_; my $prefix_len = length $prefix_nuc; my $shift_num = (3 - $prefix_len) % 3; my @results; SEQ: for my $seq (@{ $lib->[$shift_num] }) { if ($prefix_amino) { next SEQ unless $seq->{PREFIX} =~ $match{$prefix_amino}{$prefix_nuc}; unless ($target) { push @results, [ $seq->{SNIP} ]; next SEQ; } } next SEQ if exists $seq->{MATCH} and $target !~ $seq->{MATCH}; my $nxt_target = substr($target, $seq->{LENGTH}); my $nxt_prefix_amino = $seq->{SUFFIX} ? substr($nxt_target, 0, 1, '') : ''; unless ( $nxt_target or $nxt_prefix_amino ) { push @results, [ $seq->{SNIP} ]; next SEQ; } next SEQ unless $glue; my $nxt = $lib->_build( $nxt_target, $glue-1, $splices, $nxt_prefix_amino, $seq->{SUFFIX}, ); next SEQ unless @$nxt; unshift @$_, $seq->{SNIP} for @$nxt; push @results, @$nxt; } return \@results unless $splices--; # Now allow for splicing on a single nucleotide instead of using # one of the snippets. if ($prefix_amino) { my $nxt_prefix_amino = ($prefix_len >= 2) ? '' : $prefix_amino; SPLICE1: for my $splice ( @{$suffix{$prefix_amino}{$prefix_nuc}} ) { push(@results, [ $splice ]), next SPLICE1 unless $target or $nxt_prefix_amino; my $nxt = $lib->_build( $target, $glue, $splices, $nxt_prefix_amino, $prefix_nuc.$splice ); next SPLICE1 unless @$nxt; unshift @$_, $splice for @$nxt; push @results, @$nxt; } return \@results; } $prefix_amino = substr($target, 0, 1, ''); SPLICE2: for my $splice ( @{$first{$prefix_amino}} ) { my $nxt = $lib->_build( $target, $glue, $splices, $prefix_amino, $splice ); next SPLICE2 unless @$nxt; unshift @$_, $splice for @$nxt; push @results, @$nxt; } return \@results; } package main; use strict; use warnings; my $glues = 7; my $splices = 2; my $target = "ETIWITWIKNGFMTR"; my @snips = qw( AACGGA AC ACCCGA CA CGATGTTGGATTCCAGATTATGCTA GAGACGATT GAGAGACCAGAGGACCCTCGA GAAACGATGAGGGGAATAA GCGGGTT GG GGTACTG TG TGTTTG TGGATA TTGGATTAAA ); my $lib = DNA::Stitch->new(@snips); print "@$_\n" for @{ $lib->build($target, $glues, $splices) };

In reply to Amino acid sequence builder by runrig

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Titles consisting of a single word are discouraged, and in most cases are disallowed outright.
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and all is quiet...

    How do I use this? | Other CB clients
    Other Users?
    Others rifling through the Monastery: (3)
    As of 2017-10-17 02:37 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?
      My fridge is mostly full of:

















      Results (218 votes). Check out past polls.

      Notices?