#!/bin/perl-5.16.2/perl
##' A script to read multiple FASTA entries from a single file
##'     and translate each DNA sequence into a protein sequence.
##'     Only a single reading frame is translated, starting at the first base of each sequence.

use strict;
use warnings;

#' Main program
#'
#' Reads every FASTA record from the file named by the first command-line
#' argument, translates each DNA sequence codon by codon (reading frame
#' starting at the first base) and prints the result as FASTA to STDOUT.

my $fasta_file = shift;
die "usage: $0 fasta_file\n" unless defined $fasta_file;

# Three-argument open: the file name is never interpreted as a mode or a pipe.
open(my $fh, '<', $fasta_file) or die "can't open $fasta_file: $!\n";

my %sequence_data;
while (read_fasta_sequence($fh, \%sequence_data)) {
    # A record that consists of a header only has an undefined sequence.
    my $dna = defined $sequence_data{seq} ? $sequence_data{seq} : '';

    # The protein must start empty for every record; otherwise each record
    # would be printed with the proteins of all previous records prepended.
    my $protein = '';

    # Translate each complete three-base codon into an amino acid; a trailing
    # partial codon (one or two bases) is ignored.
    for (my $i = 0; $i < length($dna) - 2; $i += 3) {
        $protein .= codon2aa(substr($dna, $i, 3));
    }

    print ">$sequence_data{header}\n$protein\n\n";
}

close($fh);

#' read_fasta_sequence
#'
#' Read the next FASTA record from a file handle.
#'
#' Parameters:
#'   $fh       - file handle opened for reading
#'   $seq_info - hash reference that carries the result and the parser state
#'               between calls:
#'                 header      - header of the current record, without the '>'
#'                 seq         - sequence lines joined together with all white
#'                               space removed (undef if the record has none)
#'                 next_header - header that was read ahead while finishing
#'                               the current record
#'
#' Returns $seq_info when a record was read.  Returns nothing (false) once the
#' input is exhausted; header, seq and next_header are then all undef.
sub read_fasta_sequence {
    my ($fh, $seq_info) = @_;

    $seq_info->{seq} = undef;    # clear out previous sequence

    # Promote the header that the previous call read ahead.  It is consumed
    # here so that it can never be reported a second time, and it is tested
    # with defined() so that headers such as "0" or "" are not lost.
    my $pending = $seq_info->{next_header};
    $seq_info->{next_header} = undef;
    $seq_info->{header}      = $pending;

    # True as soon as this call owns a header or any sequence data.
    my $have_record = defined $pending;

    # A lexical line variable keeps the caller's $_ untouched.
    while (my $line = <$fh>) {
        next if $line =~ /^\s*$/;    # skip blank lines
        $line =~ s/\r?\n\z//;        # strip LF as well as CRLF line endings

        if ($line =~ /^>/) {         # fasta header line
            (my $h = $line) =~ s/^>//;

            if (defined $seq_info->{header}) {
                # This header starts the following record: remember it and
                # hand back the record collected so far.
                $seq_info->{next_header} = $h;
                return $seq_info;
            }

            # first header seen for the current record
            $seq_info->{header} = $h;
            $have_record = 1;
        }
        else {
            $line =~ s/\s+//g;       # remove ALL white space, not just the first run
            $seq_info->{seq} .= $line;
            $have_record = 1;
        }
    }

    # End of input: the last record (even one with a header only) is still
    # returned.
    return $seq_info if $have_record;

    # nothing left to read: clean everything up
    $seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header} = undef;

    return;
}

#' codon2aa
#'
#' Map a DNA codon onto the one-letter code of its amino acid.
#'
#' Argument: the codon as a string.  Matching is case-insensitive; the
#'           patterns are tried from top to bottom and the first hit wins.
#'           Where a pattern ends in '.', any third character is accepted.
#' Returns:  the amino acid letter, '_' for a stop codon, or '*' when none of
#'           the patterns matches.

sub codon2aa {
    my $triplet = shift;

    return 'A' if $triplet =~ /GC./i;            # Alanine
    return 'C' if $triplet =~ /TG[TC]/i;         # Cysteine
    return 'D' if $triplet =~ /GA[TC]/i;         # Aspartic Acid
    return 'E' if $triplet =~ /GA[AG]/i;         # Glutamic Acid
    return 'F' if $triplet =~ /TT[TC]/i;         # Phenylalanine
    return 'G' if $triplet =~ /GG./i;            # Glycine
    return 'H' if $triplet =~ /CA[TC]/i;         # Histidine
    return 'I' if $triplet =~ /AT[TCA]/i;        # Isoleucine
    return 'K' if $triplet =~ /AA[AG]/i;         # Lysine
    return 'L' if $triplet =~ /TT[AG]|CT./i;     # Leucine
    return 'M' if $triplet =~ /ATG/i;            # Methionine
    return 'N' if $triplet =~ /AA[TC]/i;         # Asparagine
    return 'P' if $triplet =~ /CC./i;            # Proline
    return 'Q' if $triplet =~ /CA[AG]/i;         # Glutamine
    return 'R' if $triplet =~ /CG.|AG[AG]/i;     # Arginine
    return 'S' if $triplet =~ /TC.|AG[TC]/i;     # Serine
    return 'T' if $triplet =~ /AC./i;            # Threonine
    return 'V' if $triplet =~ /GT./i;            # Valine
    return 'W' if $triplet =~ /TGG/i;            # Tryptophan
    return 'Y' if $triplet =~ /TA[TC]/i;         # Tyrosine
    return '_' if $triplet =~ /TA[AG]|TGA/i;     # Stop

    # nothing matched: not a recognisable codon
    return '*';
}