It's not entirely clear to me what you're trying to do, but the
my @array=('A','T','C','G','AA','AT','AG', ..., 'CCCA','CCCT','CCCG','CCCC');
array and other code suggests you are iterating over every possible permutation of one through four of the A T C G bases and searching a string for and counting every occurrence of each of these substrings.
If so, there's an easier, probably faster, way. The following code can probably use more error checking, but as I don't know just what you're doing, I'll leave that to you. I also do not claim it's the fastest possible approach. HTH.
use warnings;
use strict;
use Test::More
# tests => ?? + 1 # Test::NoWarnings adds 1 test
'no_plan'
;
use Test::NoWarnings;
use Data::Dump;
use constant T => 'ACTGTACGCATAG';
VECTOR:
for my $ar_vector (
[ 1, T, { qw(A 4 T 3 C 3 G 3) }, ],
[ 2, T, { qw(A 4 T 3 C 3 G 3
AT 1 AC 2 AG 1 TA 2 TG 1 CA 1 CT 1 CG 1 GT 1 GC 1
) } ],
[ 3, T, { qw(A 4 T 3 C 3 G 3
AT 1 AC 2 AG 1 TA 2 TG 1 CA 1 CT 1 CG 1 GT 1 GC 1
ACT 1 CTG 1 TGT 1 GTA 1 TAC 1 ACG 1
CGC 1 GCA 1 CAT 1 ATA 1 TAG 1
) } ],
[ 4, T, { qw(A 4 T 3 C 3 G 3
AT 1 AC 2 AG 1 TA 2 TG 1 CA 1 CT 1 CG 1 GT 1 GC 1
ACT 1 CTG 1 TGT 1 GTA 1 TAC 1 ACG 1
CGC 1 GCA 1 CAT 1 ATA 1 TAG 1
ACTG 1 CTGT 1 TGTA 1 GTAC 1 TACG 1
ACGC 1 CGCA 1 GCAT 1 CATA 1 ATAG 1
) } ],
[ 5, T, { qw(A 4 T 3 C 3 G 3
AT 1 AC 2 AG 1 TA 2 TG 1 CA 1 CT 1 CG 1 GT 1 GC 1
ACT 1 CTG 1 TGT 1 GTA 1 TAC 1 ACG 1
CGC 1 GCA 1 CAT 1 ATA 1 TAG 1
ACTG 1 CTGT 1 TGTA 1 GTAC 1 TACG 1
ACGC 1 CGCA 1 GCAT 1 CATA 1 ATAG 1
ACTGT 1 CTGTA 1 TGTAC 1 GTACG 1 TACGC 1
ACGCA 1 CGCAT 1 GCATA 1 CATAG 1
) } ],
[ 12, T, { qw(A 4 T 3 C 3 G 3
AT 1 AC 2 AG 1 TA 2 TG 1 CA 1 CT 1 CG 1 GT 1 GC 1
ACT 1 CTG 1 TGT 1 GTA 1 TAC 1 ACG 1
CGC 1 GCA 1 CAT 1 ATA 1 TAG 1
ACTG 1 CTGT 1 TGTA 1 GTAC 1 TACG 1
ACGC 1 CGCA 1 GCAT 1 CATA 1 ATAG 1
ACTGT 1 CTGTA 1 TGTAC 1 GTACG 1 TACGC 1
ACGCA 1 CGCAT 1 GCATA 1 CATAG 1
ACTGTA 1 CTGTAC 1 TGTACG 1 GTACGC 1
TACGCA 1 ACGCAT 1 CGCATA 1 GCATAG 1
ACTGTAC 1 CTGTACG 1 TGTACGC 1 GTACGCA 1
TACGCAT 1 ACGCATA 1 CGCATAG 1
ACTGTACG 1 CTGTACGC 1 TGTACGCA 1
GTACGCAT 1 TACGCATA 1 ACGCATAG 1
ACTGTACGC 1 CTGTACGCA 1 TGTACGCAT 1 GTACGCATA 1 TACGCATAG 1
ACTGTACGCA 1 CTGTACGCAT 1 TGTACGCATA 1 GTACGCATAG 1
ACTGTACGCAT 1 CTGTACGCATA 1 TGTACGCATAG 1
ACTGTACGCATA 1 CTGTACGCATAG 1
) } ],
[ 13, T, { qw(A 4 T 3 C 3 G 3
AT 1 AC 2 AG 1 TA 2 TG 1 CA 1 CT 1 CG 1 GT 1 GC 1
ACT 1 CTG 1 TGT 1 GTA 1 TAC 1 ACG 1
CGC 1 GCA 1 CAT 1 ATA 1 TAG 1
ACTG 1 CTGT 1 TGTA 1 GTAC 1 TACG 1
ACGC 1 CGCA 1 GCAT 1 CATA 1 ATAG 1
ACTGT 1 CTGTA 1 TGTAC 1 GTACG 1 TACGC 1
ACGCA 1 CGCAT 1 GCATA 1 CATAG 1
ACTGTA 1 CTGTAC 1 TGTACG 1 GTACGC 1
TACGCA 1 ACGCAT 1 CGCATA 1 GCATAG 1
ACTGTAC 1 CTGTACG 1 TGTACGC 1 GTACGCA 1
TACGCAT 1 ACGCATA 1 CGCATAG 1
ACTGTACG 1 CTGTACGC 1 TGTACGCA 1
GTACGCAT 1 TACGCATA 1 ACGCATAG 1
ACTGTACGC 1 CTGTACGCA 1 TGTACGCAT 1 GTACGCATA 1 TACGCATAG 1
ACTGTACGCA 1 CTGTACGCAT 1 TGTACGCATA 1 GTACGCATAG 1
ACTGTACGCAT 1 CTGTACGCATA 1 TGTACGCATAG 1
ACTGTACGCATA 1 CTGTACGCATAG 1
ACTGTACGCATAG 1
) } ],
) {
if (not ref $ar_vector) { # embedded comment
note $ar_vector;
next VECTOR;
}
my ($max, $seq, $hr_expected) = @$ar_vector;
my %total;
process_nuc($max, $seq, \%total);
is_deeply \%total, $hr_expected,
qq{$max contiguous bases}
;
} # end for VECTOR
sub process_nuc {
my ($max, # max length of contiguous subsequences
$seq, # string: sequence in which to count subsequences
$hr_total, # hash ref.: returns subsequence totals
) = @_;
# data validation/sanity checking.
$max > 0 or die qq{max len ($max) <= 0};
# process main body of sequence.
for my $offset (0 .. (length($seq) - $max)) {
for my $len (1 .. $max) {
$hr_total->{ substr $seq, $offset, $len }++;
}
}
# process tail end (if any) of sequence not processed above.
--$max;
my $end = substr $seq, -$max;
for my $n (1 .. $max) {
for ($end =~ m{ (?= (.{$n})) }xmsg) {
$hr_total->{$_}++;
}
}
}