Some comments on the benchmark and results obtained.
The benchmark code. CLI parameters are: -N=nn numbers of strings to generate; -LENGTH=mm: length of common substrings to look for; Interesting challenge Limbic~Region, thanks.
#! perl -slw
use strict;
use Benchmark qw[ cmpthese ];
sub nCommonSubstrLenL {
my( $haystack, $needle, $len ) = @_;
( $haystack, $needle ) = ( $needle, $haystack ) if length( $haysta
+ck ) < length( $needle );
my $count = 0;
my %possibles;
for my $ni ( 0 .. length( $needle ) - $len ) {
next if ++$possibles{ substr( $needle, $ni, $len ) } > 1;
++$count if 1+index $haystack, substr( $needle, $ni, $len );
}
return $count;
}
{
my %seen;
sub LR_common_substr {
my ($str1, $str2, $len_subs) = @_;
my ( $len_str1, $len_str2 ) = map length, $str1, $str2;
my $temp1 = exists $seen{$len_str1}{$len_subs}
? $seen{$len_str1}{$len_subs}
: ($seen{$len_str1}{$len_subs} = ("a${len_subs}X" .
+($len_subs - 1)) x ($len_str1 - $len_subs + 1));
my %substr;
@substr{ unpack($temp1, $str1) } = ();
my $temp2 = exists $seen{$len_str2}{$len_subs}
? $seen{$len_str2}{$len_subs}
: ($seen{$len_str2}{$len_subs} = ("a${len_subs}X" .
+($len_subs - 1)) x ($len_str2 - $len_subs + 1));
my $count = keys %substr;
delete @substr{ unpack($temp1, $str2) };
return $count - keys %substr;
}
}
{
my @matches;
my $push = qr/(?{ push @matches, $1 })/;
sub match_all_ways1 {
my ($string, $regex) = @_;
@matches = ();
$string =~ m/($regex)$push(?!)/;
return @matches;
}
}
sub BH_common_substr1 {
my ($str1, $str2, $len) = @_;
my %substr = map { $_ => 1 } match_all_ways1($str1 => qr/.{$len}/)
+;
$substr{$_} |= 2 for match_all_ways1($str2 => qr/.{$len}/);
return grep { $substr{$_} == 3 } keys %substr;
}
{
my @matches;
my $push = qr/(?{ push @matches, $1 })/;
sub match_all_ways2 {
my ($string, $regex) = @_;
@matches = ();
$string =~ m/$regex$push(?!)/;
return @matches;
}
}
sub BH_common_substr2 {
my ($str1, $str2, $len) = @_;
my %subs;
@subs{ match_all_ways2("$str1\0$str2" => qr/(.{$len}).*\0.*\1/)
+} = ();
return keys %subs;
}
{
my $hits = 0;
sub TZ_common_substr1 {
my($s1, $s2) = @_;
# print qq(s1 = $s1, s2 = $s2\n);
($s1, $s2) = ($s2, $s1) if length($s2) < length($s1);
if ($s1 eq $s2) {
$hits++;
return if length($s1) == 1;
}
my %hash = map { $_ => 1 } split(//, $s1);
my $arr = [];
for my $s (split(//, $s2)) {
push(@$arr, $s) if ! exists($hash{$s});
}
my $splitters = join('|', @$arr);
for my $s (split(/$splitters/, $s2)) {
TZ_common_substr1($s, $s1);
}
}
sub TZ_common_substr{ &TZ_common_substr1; return $hits }
}
sub Eric_common_sub {
my ($s1,$s2,$len) = @_;
my $len_s1 = length($s1);
my $len_s2 = length($s2);
my $match_s1 = {};
my $match_s2 = {};
for my $start (0..length($s1)-1) {
for my $l (1..$len) {
next if $start+$l > $len_s1;
$match_s1->{substr($s1, $start, $l)} ||= 1;
}
}
for my $start (0..length($s2)-1) {
for my $l (1..$len) {
next if $start+$l > $len_s2;
$match_s2->{substr($s2, $start, $l)} ||= 1;
}
}
$match_s1->{$_}++ for keys %$match_s2;
return grep { $match_s1->{$_} == 2 } keys %$match_s1;
}
sub MN_CountSubstrings { # $string1, $string2, $substr_length
my $l = pop @_;
my %found = ();
my %match = ();
my $first = 2;
my $string1 = $_[0];
for my $string ( @_ ) {
$first --;
my $ls = length( $string );
my $limit = $ls - $l + 1;
for ( my $i = 0; $i < $limit; $i++ ) {
my $sbstr = substr( $string, $i, $l );
$first or defined ( $match{$sbstr} ) && next();
$found{ $sbstr }{ $string } ||= 1;
$first and next;;
defined ( $found{ $sbstr }{ $string1 } )
and $match{ $sbstr } = 1;
}
}
return scalar keys %match;
}
sub rndStr{ join'', @_[ map{ rand @_ } 1 .. shift ] }
our $N ||= 50;
our $LENGTH ||= 3;
our @strings = map{ rndStr int( 30 +rand 20 ), 'A' .. 'D' } 1 .. $N;
our %results;
my %tests = (
'L~R' => q[ $results{ 'L~R' } = LR_common_substr( @strings[ $_
+ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
Buk => q[ $results{ Buk } = nCommonSubstrLenL( @strings[ $_
+ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
BH1 => q[ $results{ BH1 } = BH_common_substr1( @strings[ $_
+ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
BH2 => q[ $results{ BH2 } = BH_common_substr2( @strings[ $_
+ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
TZ => q[ $results{ TZ } = TZ_common_substr( @strings[ $_
+ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
Eric => q[ $results{ Eric } = Eric_common_sub( @strings[ $_
+ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
MN => q[ $results{ MN } = MN_CountSubstrings( @strings[ $_
+ -1, $_ ], $LENGTH ) for 1 .. $#strings ],
);
cmpthese -1, \%tests;
print "\n----\n";
print "$_ => $results{ $_ }" for keys %results;