in reply to
find index of specific array value that occurs multiple times
Your problem is complicated by the use of spaces in the consensus lines. If we simply split on whitespace, then we would get an incorrect index/offset.
Working, tested code:
use strict;
use warnings;
push @ARGV, 'pm_962355_01.dat' if not @ARGV;
my @consensus_lines;
my $seq_start_column = 0;
while (<>) {
chomp;
die "This algorithm relies on spaces - no tabs allowed!" if /\t/;
if ( !$seq_start_column and /^(\d+\.\S+\s+)\S/ ) {
$seq_start_column = length $1;
}
next if /^CLUSTAL/; # Header row
next if !/\S/; # Blank rows
if ( /\*/ ) {
push @consensus_lines, $_;
}
}
if (!$seq_start_column) {
die "Failed to calculate start column for sequences";
}
my $consensus = join '', map { substr $_, $seq_start_column }
@consensus_lines;
# Just for debugging
use Data::Dumper; $Data::Dumper::Useqq = 1;
print Dumper $consensus;
my @indexes = 0..length($consensus);
my @index_c = grep { substr($consensus,$_,1) eq '*' } @indexes;
my @index_n = grep { substr($consensus,$_,1) ne '*' } @indexes;
# If positions are 0-based:
print " Conserved: ", join(',', @index_c), "\n";
print "Not conserved: ", join(',', @index_n), "\n";
# If positions are 1-based:
#my @offset_c = map { $_ + 1 } @index_c;
#my @offset_n = map { $_ + 1 } @index_n;
#print " Conserved: ", join(',', @offset_c), "\n";
#print "Not conserved: ", join(',', @offset_n), "\n";
Output:
$VAR1 = " :*** * .** *::***** **:::::**: ::*:*** :*****:*.**
+*******: .:* * **:.*.. *****.***:***:: ..*: **.* ****:*************
+";
Conserved: 9,10,11,14,19,20,22,25,26,27,28,29,31,32,38,39,44,46,47
+,48,51,52,53,54,55,57,59,60,61,62,63,64,65,66,67,72,74,78,79,82,86,87
+,88,89,90,92,93,94,96,97,98,104,107,108,110,112,113,114,115,117,118,1
+19,120,121,122,123,124,125,126,127,128,129
Not conserved: 0,1,2,3,4,5,6,7,8,12,13,15,16,17,18,21,23,24,30,33,34,3
+5,36,37,40,41,42,43,45,49,50,56,58,68,69,70,71,73,75,76,77,80,81,83,8
+4,85,91,95,99,100,101,102,103,105,106,109,111,116,130