use strict; use warnings; use Time::HiRes; use List::Util qw(min max); my $allLCS = 1; my $subStrSize = 8; # Determines minimum match length. Should be a power of 2 # and less than half the minimum interesting match length. The larger this value # the faster the search runs. if (@ARGV != 1) { print "Finds longest matching substring between any pair of test strings\n"; print "the given file. Pairs of lines are expected with the first of a\n"; print "pair being the string name and the second the test string."; exit (1); } # Read in the strings my @strings; while (<>) { chomp; my $strName = $_; $_ = <>; chomp; push @strings, [$strName, $_]; } my $lastStr = @strings - 1; my @bestMatches = [(0, 0, 0, 0, 0)]; # Best match details my $longest = 0; # Best match length so far (unexpanded) my $startTime = [Time::HiRes::gettimeofday ()]; # Do the search for (0..$lastStr) { my $curStr = $_; my @subStrs; my $source = $strings[$curStr][1]; my $sourceName = $strings[$curStr][0]; for (my $i = 0; $i < length $source; $i += $subStrSize) { push @subStrs, substr $source, $i, $subStrSize; } my $lastSub = @subStrs-1; for (($curStr+1)..$lastStr) { my $targetStr = $_; my $target = $strings[$_][1]; my $targetLen = length $target; my $targetName = $strings[$_][0]; my $localLongest = 0; my @localBests = [(0, 0, 0, 0, 0)]; for my $i (0..$lastSub) { my $offset = 0; while ($offset < $targetLen) { $offset = index $target, $subStrs[$i], $offset; last if $offset < 0; my $matchStr1 = substr $source, $i * $subStrSize; my $matchStr2 = substr $target, $offset; ($matchStr1 ^ $matchStr2) =~ /^\0*/; my $matchLen = $+[0]; next if $matchLen < $localLongest - $subStrSize + 1; $localLongest = $matchLen; my @test = ($curStr, $targetStr, $i * $subStrSize, $offset, $matchLen); @test = expandMatch (@test); my $dm = $test[4] - $localBests[-1][4]; @localBests = () if $dm > 0; push @localBests, [@test] if $dm >= 0; $offset = $test[3] + $test[4]; next if $test[4] < $longest; $longest = $test[4]; $dm = $longest - $bestMatches[-1][4]; next if $dm < 0; @bestMatches = () if $dm > 0; push @bestMatches, [@test]; } continue {++$offset;} } next if ! $allLCS; if (! @localBests) { print "Didn't find LCS for $sourceName and $targetName\n"; next; } for (@localBests) { my @curr = @$_; printf "%03d:%03d L[%4d] (%4d %4d)\n", $curr[0], $curr[1], $curr[4], $curr[2], $curr[3]; } } } print "Completed in " . Time::HiRes::tv_interval ($startTime) . "\n"; for (@bestMatches) { my @curr = @$_; printf "Best match: %s - %s. %d characters starting at %d and %d.\n", $strings[$curr[0]][0], $strings[$curr[1]][0], $curr[4], $curr[2], $curr[3]; } sub expandMatch { my ($index1, $index2, $str1Start, $str2Start, $matchLen) = @_; my $maxMatch = max (0, min ($str1Start, $subStrSize + 10, $str2Start)); my $matchStr1 = substr ($strings[$index1][1], $str1Start - $maxMatch, $maxMatch); my $matchStr2 = substr ($strings[$index2][1], $str2Start - $maxMatch, $maxMatch); ($matchStr1 ^ $matchStr2) =~ /\0*$/; my $adj = $+[0] - $-[0]; $matchLen += $adj; $str1Start -= $adj; $str2Start -= $adj; return ($index1, $index2, $str1Start, $str2Start, $matchLen); }