# filter4(\@sequences)
#
# Remove every sequence that is either a duplicate or a substring of some
# strictly LONGER sequence in the same list.
#
# Parameters:
#   $ar_sequences - reference to an array of sequence strings.
#                   Precondition: no sequence may contain the separator
#                   string ':' (see $sep below); results are unreliable
#                   otherwise.
#
# Returns:
#   In list context, the surviving sequences, in order of first appearance
#   in the input.  (In scalar context, grep yields the number of survivors.)
#
# Notes:
#   * Sequences of equal length are never filtered against each other;
#     after de-duplication, equal-length strings cannot contain one another.
#   * An empty-string sequence is dropped whenever any non-empty sequence
#     is present (it is a substring of everything) and kept only when
#     nothing longer exists.
sub filter4 {
    my ($ar_sequences) = @_;    # ref. to array of sequences to be filtered

    # Eliminate exact duplicates.  A sequence may still exist as a
    # substring of a longer sequence.
    my @from_uniq = uniq(@{$ar_sequences});

    # Separator string.  Must be distinct from anything in a sequence so
    # that a substring search can never match across two neighbours.
    my $sep = ':';

    # Unique sequences ordered shortest to longest.
    my @short_to_long = sort { length($a) <=> length($b) } @from_uniq;

    # One searchable string: every sequence followed by a separator.
    my $uniq_short_to_long = join($sep, @short_to_long) . $sep;

    # For each sequence length, record the offset in the joined string of
    # the first position beyond the last sequence of that length, i.e.
    # where the strictly longer sequences begin.  The list is sorted by
    # length, so the last assignment for a given length is the one kept.
    my %offset_beyond;
    my $cursor = 0;
    for my $sequence (@short_to_long) {
        $cursor += length($sequence) + length $sep;
        $offset_beyond{ length $sequence } = $cursor;
    }

    my $joined_length = length $uniq_short_to_long;

    # Keep all sequences NOT substrings of any LONGER sequence.
    return grep {
        my $search_from = $offset_beyond{ length $_ };

        # Nothing longer exists at all (this also covers the lone empty
        # string, which index() would otherwise "find" at end of string),
        # or the search among the longer sequences comes up empty.
        $search_from >= $joined_length
            || index($uniq_short_to_long, $_, $search_from) < 0;
    } @from_uniq;
}    # end sub filter4()