#!/bin/perl-5.16.2/perl

##' A script to read in multiple FASTA entries in a single file,
##' and translate the DNA sequence into protein sequence.
##' Only a single reading frame is used, starting from the first base of
##' each sequence.
##'
##' Usage: script.pl sequences.fasta
##'        some_command | script.pl      (reads FASTA from STDIN)
##'
##' Output: one FASTA-style record per input record, each followed by a
##' blank line. Stop codons are written as '_' and codons that cannot be
##' translated as '*'.

use strict;
use warnings;

#' Ordered translation rules: [ pattern, one-letter amino acid code ].
#' Patterns are anchored so only an exact 3-character codon can match, and
#' are case-insensitive. A '.' in the third position accepts any character
#' there (the amino acid is the same for every third base, so ambiguity
#' codes such as N still translate).
my @CODON_RULES = (
    [ qr/\AGC.\z/i,              'A' ],    # Alanine
    [ qr/\ATG[TC]\z/i,           'C' ],    # Cysteine
    [ qr/\AGA[TC]\z/i,           'D' ],    # Aspartic Acid
    [ qr/\AGA[AG]\z/i,           'E' ],    # Glutamic Acid
    [ qr/\ATT[TC]\z/i,           'F' ],    # Phenylalanine
    [ qr/\AGG.\z/i,              'G' ],    # Glycine
    [ qr/\ACA[TC]\z/i,           'H' ],    # Histidine
    [ qr/\AAT[TCA]\z/i,          'I' ],    # Isoleucine
    [ qr/\AAA[AG]\z/i,           'K' ],    # Lysine
    [ qr/\A(?:TT[AG]|CT.)\z/i,   'L' ],    # Leucine
    [ qr/\AATG\z/i,              'M' ],    # Methionine
    [ qr/\AAA[TC]\z/i,           'N' ],    # Asparagine
    [ qr/\ACC.\z/i,              'P' ],    # Proline
    [ qr/\ACA[AG]\z/i,           'Q' ],    # Glutamine
    [ qr/\A(?:CG.|AG[AG])\z/i,   'R' ],    # Arginine
    [ qr/\A(?:TC.|AG[TC])\z/i,   'S' ],    # Serine
    [ qr/\AAC.\z/i,              'T' ],    # Threonine
    [ qr/\AGT.\z/i,              'V' ],    # Valine
    [ qr/\ATGG\z/i,              'W' ],    # Tryptophan
    [ qr/\ATA[TC]\z/i,           'Y' ],    # Tyrosine
    [ qr/\A(?:TA[AG]|TGA)\z/i,   '_' ],    # Stop
);

#' main
#'
#' Translate every record of a FASTA file and print the proteins to STDOUT.
#'
#' @param $fasta_file  path of the FASTA file; when omitted, FASTA is read
#'                     from STDIN (unless STDIN is a terminal, in which
#'                     case a usage message is raised instead of waiting
#'                     silently for keyboard input)
#' @return 0 on success; dies if the file cannot be opened
sub main {
    my ($fasta_file) = @_;

    my $fh;
    if (defined $fasta_file) {
        # Three-argument open: the file name is never interpreted as a mode.
        open($fh, '<', $fasta_file)
            or die "can't open $fasta_file: $!\n";
    }
    elsif (-t STDIN) {
        die "usage: $0 <fasta_file>\n";
    }
    else {
        $fh = \*STDIN;
    }

    my %sequence_data;
    while (read_fasta_sequence($fh, \%sequence_data)) {
        # The protein is built fresh for every record, so one record's
        # translation never leaks into the next one's output.
        my $protein = translate_dna($sequence_data{seq});
        my $header
            = defined $sequence_data{header} ? $sequence_data{header} : '';
        print ">$header\n$protein\n\n";
    }

    close($fh) if defined $fasta_file;
    return 0;
}

#' translate_dna
#'
#' Translate a DNA string into protein, reading frame 1 only.
#'
#' @param $dna  DNA sequence (any case); undef is treated as empty
#' @return string of one-letter amino acid codes; a trailing partial codon
#'         (one or two leftover bases) is ignored
sub translate_dna {
    my ($dna) = @_;
    return '' unless defined $dna;

    # A global match in list context yields each successive 3-base chunk.
    return join '', map { codon2aa($_) } $dna =~ /.{3}/gs;
}

#' read_fasta_sequence
#'
#' Read the next FASTA record from an open filehandle.
#'
#' A record can only be recognised as finished once the following header
#' line has been read, so that header is parked in $seq_info->{next_header}
#' and picked up again by the next call.
#'
#' @param $fh        filehandle open for reading
#' @param $seq_info  hash reference used for both state and result; on
#'                   success {header} holds the header line without the
#'                   leading '>' and {seq} holds the sequence with all
#'                   whitespace removed ('' when the record has no
#'                   sequence lines)
#' @return $seq_info when a record was read; an empty list / undef at end
#'         of input, after resetting header, seq and next_header to undef
sub read_fasta_sequence {
    my ($fh, $seq_info) = @_;

    # Pick up the header parked by the previous call, if there is one.
    my $pending_header = $seq_info->{next_header};
    $seq_info->{next_header} = undef;
    $seq_info->{header}      = $pending_header;
    $seq_info->{seq}         = '';    # clear out previous sequence

    # True once anything belonging to a record has been seen, so a final
    # header without sequence lines is still reported, while input made
    # only of blank lines is not.
    my $have_record = defined $pending_header;

    while (my $line = <$fh>) {
        chomp $line;
        next if $line =~ /^\s*$/;    # skip blank lines

        if ($line =~ s/^>//) {       # fasta header line
            # defined() rather than truth: '0' and '' are valid headers.
            if (defined $seq_info->{header}) {
                $seq_info->{next_header} = $line;
                return $seq_info;
            }
            $seq_info->{header} = $line;    # first time through only
        }
        else {
            $line =~ s/\s+//g;       # remove ALL white space in the line
            $seq_info->{seq} .= $line;
        }
        $have_record = 1;
    }

    return $seq_info if $have_record;

    # clean everything up
    $seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header} = undef;
    return;
}

#' codon2aa
#'
#' A subroutine to translate a DNA 3-character codon to an amino acid
#'
#' @param $codon  three-character DNA codon, upper or lower case
#' @return the one-letter amino acid code, '_' for a stop codon, or '*'
#'         when the input is not a translatable 3-character codon
sub codon2aa {
    my ($codon) = @_;
    return '*' unless defined $codon;

    for my $rule (@CODON_RULES) {
        my ($pattern, $amino_acid) = @{$rule};
        return $amino_acid if $codon =~ $pattern;
    }
    return '*';
}

# Run only when executed as a script, so the subroutines above can be
# loaded (e.g. by a test) without starting a translation.
main(@ARGV) unless caller;

1;