<?xml version="1.0" encoding="windows-1252"?>
<node id="613420" title="Krambambuli's scratchpad" created="2007-05-03 12:44:04" updated="2007-05-03 08:44:04">
<type id="182711">
scratchpad</type>
<author id="600591">
Krambambuli</author>
<data>
<field name="doctext">
Here's the benchmarking code - see below; I'm unsure how to handle it, as I would like to not let the response chain grow  too much, but there are still problems to solve till a final version should be published as a sum up (blazar - would you do that?).&lt;br&gt;&lt;br&gt;

Ikegami's new code is OK now, but Oha's, blazar's and lodin's  code not yet; when counting the strings found, I get &lt;br&gt;&lt;br&gt;
&lt;code&gt;
Number of substrings found:
   blazar       oha    kramba   ikegami     lodin
      416       326       737       737       416
&lt;/code&gt;

With these still flawed subs, benchmarking on my machine looks like&lt;br&gt;
&lt;code&gt;
Results for string:

"aacdabcabcecdecd aabcdabcabcecdecd aabcdabcabcecdecd aabcdabcabcecdecd  "

          Rate  blazar   lodin     oha ikegami  kramba
blazar  21.8/s      --    -84%    -86%    -88%    -89%
lodin    136/s    521%      --    -14%    -23%    -34%
oha      158/s    625%     17%      --    -10%    -23%
ikegami  175/s    704%     29%     11%      --    -15%
kramba   207/s    847%     52%     31%     18%      --
&lt;/code&gt;

I'd like to finish this up, but I'm unsure I would correct the buggy subs better than their authors would - so please, if possible, do help in getting this thread finished OK.&lt;br&gt;&lt;br&gt;
&lt;readmore&gt;
&lt;code&gt;
#!/usr/bin/perl
use strict;
use warnings;
use Data::Dumper;
use constant MIN_LENGTH =&gt; 2;
use constant MIN_REPEATS =&gt; 2;  # Must have at least this many repeats
use Benchmark qw/:all :hireswallclock/;

my $str='aacdabcabcecdecd aabcdabcabcecdecd aabcdabcabcecdecd aabcdabcabcecdecd ';
#my $str = &lt;&lt;EOT;
#Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt ut laoreet dolore magna aliquam erat volutpat. Ut wisi enim ad minim veniam, quis nostrud exerci tation ullamcorper suscipit lobortis nisl ut aliquip ex ea commodo consequat. Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi.
#EOT

sub blazar {
    local $_=shift;
    my $l=length;
    my %count;

    for my $off (0..$l-1) {
        for my $len (MIN_LENGTH .. $l-$off) {
            my $s = substr $_, $off, $len;
            $count{ $s } ||= ()= /$s/g;
        }
        $count{$_} &lt; MIN_REPEATS and
          delete $count{$_} for keys %count;
    }
    \%count;
}

sub oha {
    my $s=shift;
    my %saw;

    while($s =~ /(..+)(?=.*?\1)/g) {
        for my $x (0..length $1) {
            @saw{ map {substr $1, $x, $_} $x+2..length $1 } = ();
        }
    }
    $saw{$_} =()= $s =~ /\Q$_/g for keys %saw;
    \%saw;
}

sub ikegami {
    my ($str) = @_;

    local our %counts;

    $str =~ /(.{2,})(?{ ++$counts{$1} })(?!)/;
    delete @counts{ grep $counts{$_}&lt;MIN_REPEATS, keys %counts };

    return \%counts;
}

sub lodin {
    my ($str) = @_;
    #my $min_len = 2; # Substring is at least two chars long.
    #my $min_count = 3; # Substring occures at least three times.

    my $min_len   = MIN_LENGTH;
    my $min_count = MIN_REPEATS;
    local our %count;
    use re 'eval';
    $str =~ / (.{$min_len,})
              (?(?{ $count{$1} })
              (?!)
              )
              (?&gt;
                    .*?
                    \1
              ){@{[ $min_count - 2 ]}}
              .*
              \1
              (?{ ($count{$1} ||= $min_count-1)++ })
              (?!)
            /x;

    return \%count;
}

{
    my %count;

    sub kramba {
        my( $string) = @_;
        my $length = length( $string );

        if ($length &lt; MIN_LENGTH) {
            for (keys %count) {
                delete $count{$_}
                    if $count{$_} &lt; MIN_REPEATS;
            }
            return \%count;
        }

        for my $l (MIN_LENGTH..$length) {
            my $s = substr( $string, 0, $l );
            $count{ $s } += 1;
        }

        kramba( substr( $string, 1 ) );
    };
}

for my $multiplier (1) {
   my $work_str = "$str " x $multiplier;

my $x1 = blazar  $work_str;
my $x2 = oha     $work_str;
my $x3 = kramba  $work_str;
my $x4 = ikegami $work_str;
my $x5 = lodin   $work_str;

#print Dumper( [$x1, $x2, $x3, $x4, $x5] );

print "Number of substrings found: \n";
printf( "%9s ", $_ ) for ('blazar', 'oha', 'kramba', 'ikegami', 'lodin' );
print "\n";
printf( "%9s ", $_ ) for map { scalar keys %$_ } ($x1, $x2, $x3, $x4, $x5);
print "\n";

#exit;

for my $key (keys %$x3) {
    if (not exists $x4-&gt;{$key}) {
        print ("\"$key\": ", $x3-&gt;{$key}, "\n");

        my $index = index( $work_str, $key );
        if ($index &lt; 0) {
            print "\tNot found...!!\n";
        }
        else {
            print "\t$work_str\n",
                  "\t" . '.' x ($index) . $key . "\n";
        }
    }
}

print "Results for string:\n\n\"$work_str\"\n\n";

    cmpthese 500/$multiplier =&gt; {
        blazar  =&gt; sub { blazar $work_str },
        oha     =&gt; sub { oha $work_str },
        kramba  =&gt; sub { kramba $work_str },
        ikegami =&gt; sub { ikegami $work_str },
        lodin   =&gt; sub { lodin $work_str },
    }
}
                   
&lt;/code&gt;
&lt;/readmore&gt;

Thank you!&lt;br&gt;&lt;br&gt;






</field>
</data>
</node>
