<?xml version="1.0" encoding="windows-1252"?>
<node id="111636" title="Re: Patter Finding" created="2001-09-11 00:17:47" updated="2005-07-21 01:28:12">
<type id="11">
note</type>
<author id="36133">
lemming</author>
<data>
<field name="doctext">
&lt;p&gt;&lt;b&gt;update:&lt;/b&gt; Look at my [id://112296|second] offering, it's better&lt;/p&gt;

&lt;p&gt;Here's an inefficient subroutine that I've used in
the past. Probably time for an overhaul&lt;br&gt;
Output when called with $string, 2, 2:&lt;br&gt;
4 : (hello)&lt;br&gt;
2 : (world)&lt;br&gt;
2 : (hi)
&lt;/p&gt;
&lt;code&gt;
sub get_pattern {
  my ($string, $min_len, $min_num) = @_;

  my $str_len = length($string);
  my $srch_max = int($str_len/2);
  my %patterns;
# First we find all patterns that are up to 1/2 the length of the string
  foreach my $len ($min_len..$srch_max) {
    my $eol = $str_len - $len;
    foreach my $ind1 (0..$eol) {
      my $pat = substr($file, $ind1, $len);
      unless ( defined($patterns{$pat}) ) {
        $patterns{$pat} = 0;
        my $index = 0;
        do {
          $index = index($file, $pat, $index);
          unless ($index &lt; 0) {
            $index += length($pat);
            $patterns{$pat}++;
          }
        } while ($index &gt;= 0);
      }
    }
  }
# We then dump all patterns that do not occur min_num times
  foreach my $key (keys %patterns) {
    delete $patterns{$key} if ($patterns{$key} &lt; $min_num);
  }
#  We then go through the patterns by order and remove those
#  that are invalidated by better patterns
  foreach my $key
      (sort { $patterns{$b} * (length($b)-1) &lt;=&gt;
       $patterns{$a} * (length($a)-1)
              or length($b) &lt;=&gt; length($a)
              or $a cmp $b } keys %patterns) {
      my $check = 0;
      $patterns{$key} = 0;
      my $index;
      do {
        $index = index($file, $key, 0);
        unless ($index &lt; 0) {
          $check = 1;
          $patterns{$key}++;
          substr($file, $index, length($key)) = "\000";
        }
      } while ($index &gt;= 0);
      delete $patterns{$key} if ($patterns{$key} &lt; $min_num);
    }
    foreach my $key
        (sort { $patterns{$b} * (length($b)-1) &lt;=&gt;
                $patterns{$a} * (length($a)-1)
              or length($b) &lt;=&gt; length($a)
              or $a cmp $b } keys %patterns) {
    (my $pat = $key) =~ s/\n/\\n/g;
    printf("%3d : (%s)\n", $patterns{$key}, $pat);
  }
}
&lt;/code&gt;</field>
<field name="root_node">
111621</field>
<field name="parent_node">
111621</field>
</data>
</node>
