#! /usr/bin/perl

use strict;
use warnings;

use Data::Dumper;

# Parses the text stored after __DATA__ into a list of records (one hash
# per record) and dumps the resulting structure with Data::Dumper.

# May an attribute appear more than once in a record?
# A true value means the attribute is stored as a plain scalar (a later
# occurrence overwrites an earlier one); a false value means every
# occurrence is collected into an array.
# Sorry this is not DRY, the attribute names are duplicated
# here and in the parser regex below (as named captures).
my %is_single = (
    'exon'          => 1,
    'gene_id'       => 1,
    'product_id'    => 1,
    'sno_rna'       => 1,
    'query_subject' => 0,
    'gene_name'     => 1,
    'link'          => 1,
    'other'         => 0,
);

# Slurp everything after __DATA__ and cut it into records.
# You can use split, just match the null string before the real match in
# a look-ahead: a new record starts in front of every line that consists
# of digits only, and that line stays part of the record it opens.
my @records = split /^(?=\d+$)/m, do { local $/; <DATA> };

# One alternation branch per attribute. Exactly one named capture takes
# part in any successful match, so %+ holds a single key/value pair that
# tells us which attribute was found.
# You probably want to eliminate those ugly trailing spaces first
# and then leave out the '\s*' parts just before '$'.
my $re = qr{
      (?:  ^         (?<exon>          \d+                     ) \s* $ )
    | (?:  ^ GI:\s*  (?<gene_id>       \d+                     ) \s* $ )
    | (?:  ^ NM_     (?<product_id>    \d+\.\d                 ) \s* $ )
    | (?:  ^ snoRNA\s+ (?<sno_rna>     .+                      ) \s* $ )
    | (?s: ^         (?<query_subject> Query .*? Sbjct .*?     ) \s* $ )
    | (?i: ^         (?<gene_name>     Homo \s sapiens .*      ) \s* $ )
    | (?:  ^         (?<link>          http://.*               ) \s* $ )
    | (?:  ^         (?<other>         .+                      ) \s* $ )
    # Order of branches matters, leave the catch-all "other" branch
    # at the very end.
}mx;

# An array of hashes, one item per record.
my @parsed_records;
#my %sno_records;

for my $record_text (@records) {
    my %record;

    # Walk through the record, consuming one attribute per match.
    while ( $record_text =~ m/$re/gc ) {
        # Only the capture of the branch that matched is present in %+.
        my ($key) = keys %+;
        my $val   = $+{$key};

        # If a key can appear only once then simply store it.
        if ( $is_single{$key} ) {
            $record{$key} = $val;
        }
        # Else put it into an array.
        else {
            push @{ $record{$key} }, $val;
        }
    }

    # This @parsed_records is _not_ keyed by sno_rna, as it
    # seemed unnatural for me with the provided sample data.
    push @parsed_records, \%record;

    # But you can easily transform it to a data structure keyed by sno_rna,
    # just uncomment the lines related to %sno_records.
    #push @{ $sno_records{ $record{sno_rna} } }, \%record;
    #delete $record{sno_rna};
}

print Dumper( \@parsed_records );
#print Dumper( \%sno_records );

__DATA__