Beefy Boxes and Bandwidth Generously Provided by pair Networks
Perl-Sensitive Sunglasses
 
PerlMonks  

Comment on

( #3333=superdoc: print w/ replies, xml ) Need Help??

Another update fixing the last round of issues.

use strict; use warnings; use Time::HiRes; use List::Util qw(min max); use Math::Pari qw(divisors); =head Written =cut my $allLCS = 1; my $subStrSize = 2; # Determines minimum match length. Should be less +than half # the minimum interesting match length. The larger this value is the f +aster the # search runs. if (@ARGV == 0) { print "Finds longest matching substring between any pair of test s +trings\n"; print "in the given file. Pairs of lines are expected with the fir +st of a\n"; print "pair being the string name and the second the test string." +; exit (1); } print "Minimum match length is $subStrSize\n"; my @strings; # Outside the loop so subs see it while (@ARGV) {# process each file # Read in the strings my $filename = shift; print "\nProcessing: $filename\n"; @strings = (); open inFile, "< $filename"; while (<inFile>) { chomp; my $strName = $_; $_ = <inFile>; chomp; push @strings, [$strName, $_]; } close inFile; my $lastStr = @strings - 1; my %bestMatches = ('len' => 0); # Best match details my $longest = 0; # Best match length so far (unexpanded) my $startTime = [Time::HiRes::gettimeofday ()]; # Do the search for my $curStr (0..$lastStr) {# each string my ($sourceName, $source) = @{$strings[$curStr]}; my @subStrs = generatePatterns ($source); my $lastSub = @subStrs-1; for my $targetStr (($curStr+1)..$lastStr) {# each remaining string my ($targetName, $target) = @{$strings[$targetStr]}; my $targetLen = length $target; my $localLongest = 0; my @localBests = [(0, 0, 0, 0, 0)]; for my $i (0..$lastSub) { my $offset = 0; while ($offset < $targetLen) { $offset = index $target, $subStrs[$i][0], $offset; last if $offset < 0; my $matchStr2 = substr $target, $offset; my $slipage = 0; my $bestSlip = 0; my $matchLen = 0; my $first = 1; while ($first || $slipage < $subStrSize && $subStrs[ +$i][1] < $subStrSize) { my $matchStr1 = substr $source, $i * $subStrSize - + $slipage; ($matchStr1 ^ $matchStr2) =~ /^\0*/; if ($matchLen < $+[0]) { $bestSlip = $slipage; $matchLen = $+[0]; } $slipage += $subStrs[$i][1]; $first = 0; } next if $matchLen < $localLongest - $subStrSize + 1; $localLongest = $matchLen; my @test = ($curStr, $targetStr, $i * $subStrSize - +$bestSlip, $offset, $matchLen); @test = expandMatch (@test); my $dm = $test[4] - $localBests[-1][4]; @localBests = () if $dm > 0; push @localBests, [@test] if $dm >= 0; $offset = $test[3] + $test[4]; next if $test[4] < $longest; $longest = $test[4]; $dm = $longest - $bestMatches{'len'}; next if $dm < 0; %bestMatches = ('len' => $test[4]) if $dm > 0; $bestMatches{"$test[0],$test[1],$test[2],$test[3]"} += $test[4]; $bestMatches{'len'} = $test[4]; } continue {++$offset;} } next if ! $allLCS; if (! @localBests) { print "Didn't find LCS for $sourceName and $targetName\n +"; next; } for (@localBests) { my @curr = @$_; printf "%03d:%03d L[%4d] (%4d %4d)\n", $curr[0], $curr[1], $curr[4], $curr[2], $curr[3]; } } } print "Completed in " . Time::HiRes::tv_interval ($startTime) . "\n" +; my $len = $bestMatches{'len'}; for (keys %bestMatches) { next if $_ eq 'len'; my @curr = split ',', $_; printf "Best match: %s - %s. %d characters starting at %d and %d.\ +n", $strings[$curr[0]][0], $strings[$curr[1]][0], $len, $curr[2], $cur +r[3]; } } sub expandMatch { my ($index1, $index2, $str1Start, $str2Start, $matchLen) = @_; my $maxMatch = max (0, min ($str1Start, $subStrSize + 10, $str2Start)) +; my $matchStr1 = substr ($strings[$index1][1], $str1Start - $maxMatch, +$maxMatch); my $matchStr2 = substr ($strings[$index2][1], $str2Start - $maxMatch, +$maxMatch); ($matchStr1 ^ $matchStr2) =~ /\0*$/; my $adj = $+[0] - $-[0]; $matchLen += $adj; $str1Start -= $adj; $str2Start -= $adj; return ($index1, $index2, $str1Start, $str2Start, $matchLen); } sub generatePatterns { my @subStrs; my $source = shift; my %strs; for (my $i = 0; $i < (length $source) - $subStrSize + 1; $i += $subStr +Size) { my $substr = substr $source, $i, $subStrSize; my ($cycleLen, $str) = findCycle ($substr); push @subStrs, [$substr, $cycleLen]; } #push @subStrs, [$_, $strs{$_}] for keys %strs; return @subStrs; } sub findCycle { my $str = shift; my $copy = $str; my $cycleLen = 0; my $strLen = length ($copy); for (0..($strLen - 1)) { $copy .= substr $copy, 0, 1, ''; $cycleLen = $_ + 1; ($str ^ $copy) =~ /^\0*/; return wantarray ? ($cycleLen, substr $str, 0, $cycleLen) : $cycleLen if $+[0] == $strLen; } return wantarray ? ($strLen, $str) : $strLen; } sub findCycle_1 { my $str = shift; my $strLen = length $str; for ( @{ divisors( $strLen ) } ) { my $copy = $str; $copy .= substr( $copy, 0, $_, '' ); return wantarray ? ($_, substr $str, 0, $_) : $_ if $str eq $copy; } }

Perl is Huffman encoded by design.

In reply to Re^2: Fast common substring matching by GrandFather
in thread Fast common substring matching by GrandFather

Title:
Use:  <p> text here (a paragraph) </p>
and:  <code> code here </code>
to format your post; it's "PerlMonks-approved HTML":



  • Posts are HTML formatted. Put <p> </p> tags around your paragraphs. Put <code> </code> tags around your code and data!
  • Read Where should I post X? if you're not absolutely sure you're posting in the right place.
  • Please read these before you post! —
  • Posts may use any of the Perl Monks Approved HTML tags:
    a, abbr, b, big, blockquote, br, caption, center, col, colgroup, dd, del, div, dl, dt, em, font, h1, h2, h3, h4, h5, h6, hr, i, ins, li, ol, p, pre, readmore, small, span, spoiler, strike, strong, sub, sup, table, tbody, td, tfoot, th, thead, tr, tt, u, ul, wbr
  • You may need to use entities for some characters, as follows. (Exception: Within code tags, you can put the characters literally.)
            For:     Use:
    & &amp;
    < &lt;
    > &gt;
    [ &#91;
    ] &#93;
  • Link using PerlMonks shortcuts! What shortcuts can I use for linking?
  • See Writeup Formatting Tips and other pages linked from there for more info.
  • Log In?
    Username:
    Password:

    What's my password?
    Create A New User
    Chatterbox?
    and the web crawler heard nothing...

    How do I use this? | Other CB clients
    Other Users?
    Others lurking in the Monastery: (9)
    As of 2015-07-31 11:45 GMT
    Sections?
    Information?
    Find Nodes?
    Leftovers?
      Voting Booth?

      The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









      Results (276 votes), past polls