use warnings; use strict; use List::MoreUtils qw(uniq); use Test::More 'no_plan'; use Test::NoWarnings; MAIN: { # main loop VECTOR: for my $ar_vector ( # test input expected output [ [qw(AGCT AGGT GG AGCT) ], [qw(AGCT AGGT)], ], [ [qw(A AA AAA AAAA AAAAA)], [qw(AAAAA)], ], [ [qw(AAAAA AAAA AAA AA A)], [qw(AAAAA)], ], [ [qw(ACGT CGTA GTAC TACG)], [qw(ACGT CGTA GTAC TACG)], ], [ [qw(A C G T T G C A T G C A G T A C)], [qw(A C G T)], ], [ [qw(A G AG AGCT AGGT CTAG N AAAAA TT CCC GG AGCT AGC GCT C T CT)], [qw(AGCT AGGT CTAG N AAAAA TT CCC)], ], [ [qw(A AG AGC AGCT AGCTA AGCTAG AGCTAGC AGCTAGCT AGCTAGCTA)], [qw(AGCTAGCTA)], ], [ [qw(AGCTAGCTA AGCTAGCT AGCTAGC AGCTAG AGCTA AGCT AGC AG A)], [qw(AGCTAGCTA)], ], ) { my ($ar_test, $ar_expect) = @$ar_vector; is_deeply [ filter($ar_test) ], $ar_expect; } # end for VECTOR } # end MAIN loop # subroutines ###################################################### sub filter { my ($ar_sequences, # ref. to array of sequences to be filtered ) = @_; # eliminate duplicate sequences of equal length. my @from_uniq = uniq @$ar_sequences; my $delim = ':'; # prepare filter for substring elimination. my ($uniq_longest_first) = map { qq{$delim$_$delim} } join $delim, sort { length($b) <=> length($a) } # longest strings first @from_uniq ; # eliminate sequences that are substrings of ANY other sequence. return grep { my $len = length $_; $uniq_longest_first =~ m{ $delim [^$delim]{$len} $delim # stop checking if == len | $_ (*COMMIT)(*FAIL) # MIS-match if any substring }xms; } @from_uniq ; }