Beefy Boxes and Bandwidth Generously Provided by pair Networks
Think about Loose Coupling
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??

Another update fixing the last round of issues.

use strict; use warnings; use Time::HiRes; use List::Util qw(min max); use Math::Pari qw(divisors); =head Written =cut my $allLCS = 1; my $subStrSize = 2; # Determines minimum match length. Should be less +than half # the minimum interesting match length. The larger this value is the f +aster the # search runs. if (@ARGV == 0) { print "Finds longest matching substring between any pair of test s +trings\n"; print "in the given file. Pairs of lines are expected with the fir +st of a\n"; print "pair being the string name and the second the test string." +; exit (1); } print "Minimum match length is $subStrSize\n"; my @strings; # Outside the loop so subs see it while (@ARGV) {# process each file # Read in the strings my $filename = shift; print "\nProcessing: $filename\n"; @strings = (); open inFile, "< $filename"; while (<inFile>) { chomp; my $strName = $_; $_ = <inFile>; chomp; push @strings, [$strName, $_]; } close inFile; my $lastStr = @strings - 1; my %bestMatches = ('len' => 0); # Best match details my $longest = 0; # Best match length so far (unexpanded) my $startTime = [Time::HiRes::gettimeofday ()]; # Do the search for my $curStr (0..$lastStr) {# each string my ($sourceName, $source) = @{$strings[$curStr]}; my @subStrs = generatePatterns ($source); my $lastSub = @subStrs-1; for my $targetStr (($curStr+1)..$lastStr) {# each remaining string my ($targetName, $target) = @{$strings[$targetStr]}; my $targetLen = length $target; my $localLongest = 0; my @localBests = [(0, 0, 0, 0, 0)]; for my $i (0..$lastSub) { my $offset = 0; while ($offset < $targetLen) { $offset = index $target, $subStrs[$i][0], $offset; last if $offset < 0; my $matchStr2 = substr $target, $offset; my $slipage = 0; my $bestSlip = 0; my $matchLen = 0; my $first = 1; while ($first || $slipage < $subStrSize && $subStrs[ +$i][1] < $subStrSize) { my $matchStr1 = substr $source, $i * $subStrSize - + $slipage; ($matchStr1 ^ $matchStr2) =~ /^\0*/; if ($matchLen < $+[0]) { $bestSlip = $slipage; $matchLen = $+[0]; } $slipage += $subStrs[$i][1]; $first = 0; } next if $matchLen < $localLongest - $subStrSize + 1; $localLongest = $matchLen; my @test = ($curStr, $targetStr, $i * $subStrSize - +$bestSlip, $offset, $matchLen); @test = expandMatch (@test); my $dm = $test[4] - $localBests[-1][4]; @localBests = () if $dm > 0; push @localBests, [@test] if $dm >= 0; $offset = $test[3] + $test[4]; next if $test[4] < $longest; $longest = $test[4]; $dm = $longest - $bestMatches{'len'}; next if $dm < 0; %bestMatches = ('len' => $test[4]) if $dm > 0; $bestMatches{"$test[0],$test[1],$test[2],$test[3]"} += $test[4]; $bestMatches{'len'} = $test[4]; } continue {++$offset;} } next if ! $allLCS; if (! @localBests) { print "Didn't find LCS for $sourceName and $targetName\n +"; next; } for (@localBests) { my @curr = @$_; printf "%03d:%03d L[%4d] (%4d %4d)\n", $curr[0], $curr[1], $curr[4], $curr[2], $curr[3]; } } } print "Completed in " . Time::HiRes::tv_interval ($startTime) . "\n" +; my $len = $bestMatches{'len'}; for (keys %bestMatches) { next if $_ eq 'len'; my @curr = split ',', $_; printf "Best match: %s - %s. %d characters starting at %d and %d.\ +n", $strings[$curr[0]][0], $strings[$curr[1]][0], $len, $curr[2], $cur +r[3]; } } sub expandMatch { my ($index1, $index2, $str1Start, $str2Start, $matchLen) = @_; my $maxMatch = max (0, min ($str1Start, $subStrSize + 10, $str2Start)) +; my $matchStr1 = substr ($strings[$index1][1], $str1Start - $maxMatch, +$maxMatch); my $matchStr2 = substr ($strings[$index2][1], $str2Start - $maxMatch, +$maxMatch); ($matchStr1 ^ $matchStr2) =~ /\0*$/; my $adj = $+[0] - $-[0]; $matchLen += $adj; $str1Start -= $adj; $str2Start -= $adj; return ($index1, $index2, $str1Start, $str2Start, $matchLen); } sub generatePatterns { my @subStrs; my $source = shift; my %strs; for (my $i = 0; $i < (length $source) - $subStrSize + 1; $i += $subStr +Size) { my $substr = substr $source, $i, $subStrSize; my ($cycleLen, $str) = findCycle ($substr); push @subStrs, [$substr, $cycleLen]; } #push @subStrs, [$_, $strs{$_}] for keys %strs; return @subStrs; } sub findCycle { my $str = shift; my $copy = $str; my $cycleLen = 0; my $strLen = length ($copy); for (0..($strLen - 1)) { $copy .= substr $copy, 0, 1, ''; $cycleLen = $_ + 1; ($str ^ $copy) =~ /^\0*/; return wantarray ? ($cycleLen, substr $str, 0, $cycleLen) : $cycleLen if $+[0] == $strLen; } return wantarray ? ($strLen, $str) : $strLen; } sub findCycle_1 { my $str = shift; my $strLen = length $str; for ( @{ divisors( $strLen ) } ) { my $copy = $str; $copy .= substr( $copy, 0, $_, '' ); return wantarray ? ($_, substr $str, 0, $_) : $_ if $str eq $copy; } }

Perl is Huffman encoded by design.

In reply to Re^2: Fast common substring matching by GrandFather
in thread Fast common substring matching by GrandFather

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • Outside of code tags, you may need to use entities for some characters:
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others having an uproarious good time at the Monastery: (3)
    As of 2014-07-26 11:35 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      My favorite superfluous repetitious redundant duplicative phrase is:









      Results (175 votes), past polls