There must be a sub somewhere in the BioPerl modules that does this, but the format is simple, so here is a parser for you. I didn't parse the older format that starts with ';', but you can add that if needed. See the
Wiki entry on the FASTA format. Blank lines between sequences are optional, as is ending a sequence with a '*'.
#!/usr/bin/perl -w
use strict;
use Data::Dump qw(pp);
# Record id => concatenated sequence text, filled in by the loop below.
# If the same id occurs more than once, the later sequence text is appended
# to the earlier one.
my %sequences;

# Current input line: either freshly read from DATA or handed back by
# finish_record().
my $line;

# True when finish_record() stopped on the next record's header line and
# returned it in $line, so this loop must process $line instead of reading
# a new one.
my $skip_read = 0;

while ($skip_read or defined($line = <DATA>)) {
    # Consume the hand-back flag immediately. If it were left set and the
    # handed-back header produced no id below, the loop would re-process
    # the same line forever.
    $skip_read = 0;

    chomp $line;

    # The id is the run of word characters directly after '>'.
    # NOTE: \w+ stops at the first non-word character, so a header such as
    # ">gi|5524211|..." is keyed as just "gi".
    my ($id) = $line =~ /^\>(\w+)/;

    # Not a usable header (sequence text outside a record, a blank line,
    # or '>' not followed by a word character): skip it.
    next unless defined $id;

    ($skip_read, $line) = finish_record($id, \%sequences);
}
# finish_record($id, $seqHashRef)
#
# Reads sequence lines from the DATA filehandle and appends them (with the
# line ending removed) to $seqHashRef->{$id} until the record ends.
# A record ends at:
#   * a line ending in '*' (the '*' is stripped before the line is stored),
#   * a blank or whitespace-only line,
#   * the next header line (one starting with '>'), or
#   * end of input.
#
# Returns the list (1, $header_line) when reading stopped on a header line;
# that line has already been consumed from DATA, so it is handed back
# (still un-chomped) for the caller to process. Returns 0 in every other case.
#
# Nothing is printed here; the record-terminating line is never echoed to
# STDOUT.
sub finish_record {
    my ($id, $seqHashRef) = @_;

    while (defined(my $line = <DATA>)) {
        # The next record's header was read ahead: give it back to the caller.
        return (1, $line) if $line =~ m/^\>/;

        # A blank line terminates the current record.
        return 0 if $line =~ m/^\s*$/;

        chomp $line;

        # A trailing '*' is an explicit end-of-sequence marker; drop it but
        # keep the residues that precede it on the same line.
        my $terminated = ($line =~ s/\*$//);
        $seqHashRef->{$id} .= $line;
        return 0 if $terminated;
    }

    # End of input.
    return 0;
}
# Render the parsed id => sequence hash with Data::Dump's pp() and print it.
my $dumped_sequences = pp(\%sequences);
print $dumped_sequences;
=prints
{
MCHU => "ADQLTEEQIAEFKEAFSLFDKDGDGT....",
gi => "LCLYTHIGRNIYYGSYLYSETWNTGI....",
}
=cut
__DATA__
>MCHU - Calmodulin - Human, rabbit, bovine, rat, and chicken
ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID
DIDGDGQVNYEEFVQMMTAK*
>gi|5524211|gb|AAD44166.1| cytochrome b [Elephas maximus maximus]
LCLYTHIGRNIYYGSYLYSETWNTGIMLLLITMATAFMGYVLPWGQMSFWGATVITNLFSAIPYIGTNLV
GLMPFLHTSKHRSMMLRPLSQALFWTLTMDLLTLTWIGSQPVEYPYTIIGQMASILYFSIILAFLPIAGX
IENY