#! perl -slw use strict; use Benchmark qw[ cmpthese ]; sub nCommonSubstrLenL { my( $haystack, $needle, $len ) = @_; ( $haystack, $needle ) = ( $needle, $haystack ) if length( $haystack ) < length( $needle ); my $count = 0; my %possibles; for my $ni ( 0 .. length( $needle ) - $len ) { next if ++$possibles{ substr( $needle, $ni, $len ) } > 1; ++$count if 1+index $haystack, substr( $needle, $ni, $len ); } return $count; } { my %seen; sub LR_common_substr { my ($str1, $str2, $len_subs) = @_; my ( $len_str1, $len_str2 ) = map length, $str1, $str2; my $temp1 = exists $seen{$len_str1}{$len_subs} ? $seen{$len_str1}{$len_subs} : ($seen{$len_str1}{$len_subs} = ("a${len_subs}X" . ($len_subs - 1)) x ($len_str1 - $len_subs + 1)); my %substr; @substr{ unpack($temp1, $str1) } = (); my $temp2 = exists $seen{$len_str2}{$len_subs} ? $seen{$len_str2}{$len_subs} : ($seen{$len_str2}{$len_subs} = ("a${len_subs}X" . ($len_subs - 1)) x ($len_str2 - $len_subs + 1)); my $count = keys %substr; delete @substr{ unpack($temp1, $str2) }; return $count - keys %substr; } } { my @matches; my $push = qr/(?{ push @matches, $1 })/; sub match_all_ways1 { my ($string, $regex) = @_; @matches = (); $string =~ m/($regex)$push(?!)/; return @matches; } } sub BH_common_substr1 { my ($str1, $str2, $len) = @_; my %substr = map { $_ => 1 } match_all_ways1($str1 => qr/.{$len}/); $substr{$_} |= 2 for match_all_ways1($str2 => qr/.{$len}/); return grep { $substr{$_} == 3 } keys %substr; } { my @matches; my $push = qr/(?{ push @matches, $1 })/; sub match_all_ways2 { my ($string, $regex) = @_; @matches = (); $string =~ m/$regex$push(?!)/; return @matches; } } sub BH_common_substr2 { my ($str1, $str2, $len) = @_; my %subs; @subs{ match_all_ways2("$str1\0$str2" => qr/(.{$len}).*\0.*\1/) } = (); return keys %subs; } { my $hits = 0; sub TZ_common_substr1 { my($s1, $s2) = @_; # print qq(s1 = $s1, s2 = $s2\n); ($s1, $s2) = ($s2, $s1) if length($s2) < length($s1); if ($s1 eq $s2) { $hits++; return if length($s1) == 1; } my %hash = map { $_ => 1 } split(//, $s1); my $arr = []; for my $s (split(//, $s2)) { push(@$arr, $s) if ! exists($hash{$s}); } my $splitters = join('|', @$arr); for my $s (split(/$splitters/, $s2)) { TZ_common_substr1($s, $s1); } } sub TZ_common_substr{ &TZ_common_substr1; return $hits } } sub Eric_common_sub { my ($s1,$s2,$len) = @_; my $len_s1 = length($s1); my $len_s2 = length($s2); my $match_s1 = {}; my $match_s2 = {}; for my $start (0..length($s1)-1) { for my $l (1..$len) { next if $start+$l > $len_s1; $match_s1->{substr($s1, $start, $l)} ||= 1; } } for my $start (0..length($s2)-1) { for my $l (1..$len) { next if $start+$l > $len_s2; $match_s2->{substr($s2, $start, $l)} ||= 1; } } $match_s1->{$_}++ for keys %$match_s2; return grep { $match_s1->{$_} == 2 } keys %$match_s1; } sub MN_CountSubstrings { # $string1, $string2, $substr_length my $l = pop @_; my %found = (); my %match = (); my $first = 2; my $string1 = $_[0]; for my $string ( @_ ) { $first --; my $ls = length( $string ); my $limit = $ls - $l + 1; for ( my $i = 0; $i < $limit; $i++ ) { my $sbstr = substr( $string, $i, $l ); $first or defined ( $match{$sbstr} ) && next(); $found{ $sbstr }{ $string } ||= 1; $first and next;; defined ( $found{ $sbstr }{ $string1 } ) and $match{ $sbstr } = 1; } } return scalar keys %match; } sub rndStr{ join'', @_[ map{ rand @_ } 1 .. shift ] } our $N ||= 50; our $LENGTH ||= 3; our @strings = map{ rndStr int( 30 +rand 20 ), 'A' .. 'D' } 1 .. $N; our %results; my %tests = ( 'L~R' => q[ $results{ 'L~R' } = LR_common_substr( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], Buk => q[ $results{ Buk } = nCommonSubstrLenL( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], BH1 => q[ $results{ BH1 } = BH_common_substr1( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], BH2 => q[ $results{ BH2 } = BH_common_substr2( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], TZ => q[ $results{ TZ } = TZ_common_substr( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], Eric => q[ $results{ Eric } = Eric_common_sub( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], MN => q[ $results{ MN } = MN_CountSubstrings( @strings[ $_ -1, $_ ], $LENGTH ) for 1 .. $#strings ], ); cmpthese -1, \%tests; print "\n----\n"; print "$_ => $results{ $_ }" for keys %results;