Instead of creating separate arrays like this:
my @col1; ## column 1
my @col_ID; ## column 2
my @col3; ## column 3
you could use a single array of hashes (see perldsc):
# Array of hashes: each element is one record, stored as a hash reference
# keyed by column name.
my @AoH = (
    {
        col1   => "col1",
        col_ID => "col_ID",
        col3   => "col3",
    },
);
For example, something like this:
#!/usr/bin/perl
# Reads comma-separated rows, keeps those containing "splic", stores each as a
# hash record in @match, derives an offset from column 3 and the strand column,
# then prints a 20-character slice of column 1 and appends it to the output
# file.
use warnings;
use strict;
use Data::Dump 'pp';

# File name from the command line. It is not read yet: rows come from the
# __DATA__ section until the commented-out IN handle below is enabled.
my $inputfile1 = $ARGV[0];
my $outputfile = 'fasta';

# Starting position in column 1 before the signed distance is applied.
# NOTE(review): presumably the position of the intron/exon boundary within the
# sequence - confirm against the real data.
my $base_offset = 13;

# Number of characters taken from column 1 for each record.
my $window = 20;

#open IN, '<', $inputfile1
#  or die "Uh oh.. unable to find file $inputfile1 : $!";

# Opened in append mode, so repeated runs add to the existing file.
open my $out_fh, '>>', $outputfile
    or die "Could not open $outputfile : $!";

my @match;
while ( my $line = <DATA> ) {    # use IN
    chomp $line;
    next unless $line =~ m/splic/;

    my @colsplit = split /,/, $line;    # use \t
    my $record   = {
        'col1'                 => $colsplit[0],
        'col_ID'               => $colsplit[1],
        'col3'                 => $colsplit[2],
        'col_strand_direction' => $colsplit[5],
    };

    ## pulls out + or - and subsequent number and [base change]
    # The defined() guard keeps short rows from triggering an
    # uninitialized-value warning in the match.
    if ( defined $record->{'col3'}
        && $record->{'col3'} =~ m/([+-]\d+)\w+(\[[ACTG]])/ )
    {
        # Copy the captures at once: the strand match below resets $1 and $2.
        my ( $distance, $base_change ) = ( $1, $2 );
        $record->{'intron_from_boundary'} = $distance;
        $record->{'baseref'}              = $base_change;

        # A strand containing "+" adds the signed distance; anything else
        # (including a missing strand column) subtracts it.
        my $strand = $record->{'col_strand_direction'};
        if ( defined $strand && $strand =~ /\+/ ) {
            $record->{'offset'} = $base_offset + $distance;
        }
        else {
            $record->{'offset'} = $base_offset - $distance;
        }
    }

    # Records without a usable column 3 are still kept so they appear in the
    # dump; they simply carry no 'offset' key.
    push @match, $record;
}

# show data structure
pp @match;

# need to take each intronmatch value
# and work out its position relative
# to intron/exon boundary
foreach my $rec (@match) {
    my $id = defined $rec->{'col_ID'} ? $rec->{'col_ID'} : '(no ID)';

    # No offset means column 3 did not match above; there is nothing to slice.
    my $offset = $rec->{'offset'};
    unless ( defined $offset ) {
        warn "Skipping record $id : no offset could be derived from column 3\n";
        next;
    }

    # substr returns undef when the offset lies beyond the end of the string.
    my $string = substr( $rec->{'col1'}, $offset, $window );
    unless ( defined $string ) {
        warn "Skipping record $id : offset $offset is outside column 1\n";
        next;
    }

    print "offset = $offset : $string\n";

    # NOTE(review): the ID and the sequence are written on the same line.
    # FASTA readers normally expect the sequence on the line after the '>'
    # header - confirm the intended format before relying on this file.
    print {$out_fh} '>' . $rec->{'col_ID'} . $string . "\n";
}

# Buffered write errors only surface at close, so check it.
close $out_fh
    or die "Could not close $outputfile : $!";
__DATA__
1col1abcdefghijklmnopqrstuvwxyz0123456789,1col_ID,+1col3[A],1col4,splic,+
2col1abcdefghijklmnopqrstuvwxyz0123456789,2col_ID,-2col3[C],2col4,splic,-
3col1abcdefghijklmnopqrstuvwxyz0123456789,3col_ID,+3col3[T],3col4,splic,+
4col1abcdefghijklmnopqrstuvwxyz0123456789,4col_ID,-4col3[G],4col4,splic,-
poj