Greetings, I'm new here so please be gentle.
I've adapted code to deal with turning DNA sequence into amino acid sequence. I have to read through the original file and keep one line as a header and the next is the sequence itself. The sequence is read as each three nucleotides is a single amino acid codon. So I must append the next amino acid codon to the rest of the protein sequence. It works well up until it gets to the next entry. Then it appends the amino acid to the previous protein and continues building it out. I would like it to start over again for each entry sequence it reads.
The original file looks like this...
>M01096:4:000000000-A23M1:1:1101:15974:1529 1:N:0:16
GTTACTGAGTGTGGTTGGCAGACTGCTGGTTGCCGTATGTAT
>M01096:4:000000000-A23M1:1:1101:16525:1548 1:N:0:16
CTTTCTCATTGTGATGTTAAGGATTGGATGTGCTGGCTTCTG
>M01096:4:000000000-A23M1:1:1101:13838:1554 1:N:0:16
GCTTGGACTTGTGTTGAGATTGATGGTCATTTCTCTATGAAT
...
The output looks like this...
>M01096:4:000000000-A23M1:1:1101:15974:1529 1:N:0:16
VTECGWQTAGCRMY
>M01096:4:000000000-A23M1:1:1101:16525:1548 1:N:0:16
VTECGWQTAGCRMYLSHCDVKDWMCWLL
>M01096:4:000000000-A23M1:1:1101:13838:1554 1:N:0:16
VTECGWQTAGCRMYLSHCDVKDWMCWLLAWTCVEIDGHFSMN
...
And here's my code...
#!/bin/perl-5.16.2/perl
##' A script to read in multiple FASTA entries in a single file,
##' and translate the DNA sequence into protein sequence.
##' I only plan on using a single reading frame starting from the
+beginning.
use strict;
use warnings;
my $fasta_file=shift;
my $protein = '';
my $codon;
my $fh;
open($fh, $fasta_file) or die "can't open $fasta_file: $!\n";
my %sequence_data;
while (read_fasta_sequence($fh, \%sequence_data)) {
#' Translate each three-base codon into amino acid, and append to a
+ protein
for(my $i=0; $i < (length($sequence_data{seq}) - 2) ; $i += 3) {
$codon = substr($sequence_data{seq},$i,3);
$protein .= codon2aa($codon);
}
print ">$sequence_data{header}\n$protein\n\n";
}
sub read_fasta_sequence {
my ($fh, $seq_info) = @_;
$seq_info->{seq} = undef; # clear out previous sequence
# put the header into place
$seq_info->{header} = $seq_info->{next_header} if $seq_info->{next_
+header};
my $file_not_empty = 0;
while (<$fh>) {
$file_not_empty = 1;
next if /^\s*$/; # skip blank lines
chomp;
if (/^>/) { # fasta header line
my $h = $_;
$h =~ s/^>//;
if ($seq_info->{header}) {
$seq_info->{next_header} = $h;
return $seq_info;
}
else { # first time through only
$seq_info->{header} = $h;
}
}
else {
s/\s+//; # remove any white space
$seq_info->{seq} .= $_;
}
}
if ($file_not_empty) {
return $seq_info;
}
else {
# clean everything up
$seq_info->{header} = $seq_info->{seq} = $seq_info->{next_header
+} = undef;
return;
}
}
#' codon2aa
#'
#' A subroutine to translate a DNA 3-character codon to an amino acid
sub codon2aa {
my($codon) = @_;
if ( $codon =~ /GC./i) { return 'A' } # Alanine
elsif ( $codon =~ /TG[TC]/i) { return 'C' } # Cysteine
elsif ( $codon =~ /GA[TC]/i) { return 'D' } # Aspartic Acid
elsif ( $codon =~ /GA[AG]/i) { return 'E' } # Glutamic Acid
elsif ( $codon =~ /TT[TC]/i) { return 'F' } # Phenylalanine
elsif ( $codon =~ /GG./i) { return 'G' } # Glycine
elsif ( $codon =~ /CA[TC]/i) { return 'H' } # Histidine
elsif ( $codon =~ /AT[TCA]/i) { return 'I' } # Isoleucine
elsif ( $codon =~ /AA[AG]/i) { return 'K' } # Lysine
elsif ( $codon =~ /TT[AG]|CT./i) { return 'L' } # Leucine
elsif ( $codon =~ /ATG/i) { return 'M' } # Methionine
elsif ( $codon =~ /AA[TC]/i) { return 'N' } # Asparagine
elsif ( $codon =~ /CC./i) { return 'P' } # Proline
elsif ( $codon =~ /CA[AG]/i) { return 'Q' } # Glutamine
elsif ( $codon =~ /CG.|AG[AG]/i) { return 'R' } # Arginine
elsif ( $codon =~ /TC.|AG[TC]/i) { return 'S' } # Serine
elsif ( $codon =~ /AC./i) { return 'T' } # Threonine
elsif ( $codon =~ /GT./i) { return 'V' } # Valine
elsif ( $codon =~ /TGG/i) { return 'W' } # Tryptophan
elsif ( $codon =~ /TA[TC]/i) { return 'Y' } # Tyrosine
elsif ( $codon =~ /TA[AG]|TGA/i) { return '_' } # Stop
else {
return '*';
}
}
I think the fix should be easy, but it is eluding me. Thanks in advance for anyone's help.