Re^3: Algorithm Showdown: Fuzzy Matching (DFA.pm)

This version is my implementation of ysths idea. Like him I split the input strings into Fuzz+1 parts and then use something like Xor to count the difference. But i use a DFA to find all of the exact matches. This is a hybrid Perl/Inline::C solution, with the storage and memory management done pretty much entirely in Perl, and the search routine done in Inline::C. It is slow to prepare() but searches are typically extremely quick. Like ysths solution on degenerate datasets it performs as well/badly as Xor.pm but these cases would seem to be unusual. The larger the ratio of WordChars/Fuzz is the faster this algorithm will perform (and the more memory it will use.)

Updated:Patches applied. Fix searching strings with non alphabet chars in them. Make providing the alphabet optional. Fix error where a UV was being assigned a negative number. Minor C cleanup.

package Fuzzy::Matcher::DFA;
use strict;
use warnings;
use vars qw/ $VERSION $Typemap @ISA /;
use constant PACKTYPE=>"l!";
use Data::Dump::Streamer;
use Fuzzy::Matcher;
use Carp;

@ISA=qw(Fuzzy::Matcher);
$VERSION=0.01;

# Underlying idea by [ysth], DFA/Inline::C implementation by [demerphq
+]

BEGIN {
    # this is required by certain Perl versions
    $|       = 1;
    $Typemap = __FILE__.".typemap";
    my $typemap=<<'TYPEMAP';
long *        T_OPAQUEPTR
float *        T_OPAQUEPTR
UV *        T_OPAQUEPTR
IV *        T_OPAQUEPTR
TYPEMAP
    unless ( -e $Typemap and -s $Typemap==length($typemap) ) {
        open my $tm, ">", $Typemap or die "$Typemap:$!";
        print $tm $typemap;
        close $tm;
    }
}

# use Inline qw(Info Noisy);

use Inline C => Config =>
#    ENABLE => 'FORCE_BUILD',
#    DISABLE   => 'CLEAN_AFTER_BUILD',
    TYPEMAPS  => $Typemap,
    VERSION   => $VERSION,
    NAME      => __PACKAGE__,
  ;
use Inline 'C';


# create a new Fuzzy Matcher. This mostly deals with
# alphabet/character type issues. The trie we build
# needs this info. Uppercase and lowercase letters
# are treated as the same character.

sub _init {
  my ($self,$chars)=@_;
  if ($chars) {
    $self->_set_chars($chars);
  }
  return $self;
}


sub _set_chars {
  my ($self,$chars)=@_;

  $chars='ACGT' unless defined $chars;
  my @achars;
  my %hchars;
  $self->{charcount}=0;
  $self->{charlist}=[];
  for my $c (split //,($chars||'ACGT')) {
    next if $hchars{$c};
    push @{$self->{charlist}},$c;
    $hchars{$c}          = $self->{charcount};
    $achars[ord $c]      = $self->{charcount};
    $self->{charcount}++
  }


  @{$self}{'achars','bchars','hchars'}=(
    \@achars,
    pack( "C*", map { defined $achars[$_] ? $achars[$_] : 255 } 0..255
+),
    \%hchars
  );

  my $str="\0" x (1024*1024);
  $self->{trie}=\$str;
  $self->{size}=$self->{charcount}+2;
  $self->{nodes}=1;
  return $self;
}

# Associates an integer to a string in the trie.
# trie is a flat string of packed longs.
# Space allocated for the trie increases by doubling
# and starts at 1k
sub _trie_store {
  my ($self,$str,$integer)=@_;

  croak "Can't _trie_store() after prepare or conversion."
    if $self->{fuzzed} or $self->{filled_in};

  my ($trie,$data,$hchars,$size,$charc)=@{$self}{qw(trie data hchars s
+ize charcount)};

  my $pos=1;
  my @chars=split //,$str;
  if (@chars) {
    foreach my $char ( @chars ) {
      my $ofs=$pos+$hchars->{$char};
      my $v=unpack PACKTYPE,substr($$trie,$ofs*4,4);

      unless ($v) {
        $pos=$size;
        $size+=$charc+1;
        $self->{nodes}++;
        $self->{size}=$size;
        if ($size*4>length($$trie)) {
          $$trie.="\0" x length($$trie);
        }
        substr($$trie,$ofs*4,4)=pack PACKTYPE,$pos;
      } else {
        $pos=$v;
      }
    }
  }
  $self->{size}=$size;

  if ( my $idx=unpack PACKTYPE,substr($$trie, ($pos + $charc)*4,4) ) {
        substr($$trie, ($pos + $charc)*4,4)= pack PACKTYPE,$integer;
        return $integer
  } else {
        substr($$trie, ($pos + $charc)*4,4)=pack PACKTYPE,$integer;
        return $integer;
  }
}


# Recursive routine which fills in any 0 slots in the trie with the
# appropriate state. Note that this relies on the fact that
# we will never store strings that can completely overlap each other.
# Ie if ABBA and BB are valid strings and we search ABBA, we will not
# record that we matched both, only ABBA. This isnt a problem for
# the purposes at hand as all of our words will be the same length.
# A more complicated storage structure could facilitate the issue of
# overlaps.
# The Idea here is we recurse through the trie keeping "cursors" that
# represent where we would be if we backtracked at each new letter
# encountered. When we find an empty slot we use the state of the
# oldest valid cursor as the new state. This is much faster than
# the older version I used.
sub _fill_in {
  my ($self,$node,$path,@csrs)=@_;

  my ($trie,$charc,$list,$bchars)=@{$self}{qw(trie charcount charlist 
+bchars)};

  my @defer;
  #print "$node> $path\n";
  my $added=0;
  foreach my $char (0..$charc-1) {
    my @csrs=@csrs;
    @csrs=grep {
      $_=unpack PACKTYPE,substr($$trie,($_+$char)*4,4);
      $_
    } @csrs;
    #print "try: '$path$list->[$char]' (@csrs)\n";
    my $v=unpack PACKTYPE,substr($$trie,($node+$char)*4,4);
    if ($v) {
      $added+=$self->_fill_in($v,$path.$list->[$char],@csrs,1);
    } else {
      my ($state)=(@csrs,1);
      push @defer,[$char,$state]
    }
  }
  foreach my $defer (@defer) {
    my ($char,$state)=@$defer;
    substr($$trie,($node+$char)*4,4)=pack PACKTYPE,$state;
    #print "$path$list->[$char] FAIL ($state)\n";
    $added++;
  }
  return $added;
}

#
# Just a wrapper to _fill_in()
#
sub convert_trie_to_dfa {
  my $self=shift;

  croak "Can't convert twice."
    if $self->{filled_in};

  $self->{filled_in}=1;
  my $added=$self->_fill_in(1,"");
  print "Filled in $added elements\n"
    unless defined wantarray;
  return $added;
}



#
# Takes a word to fuzzy match, if the word is new stores it in
# a two different places. First in a packed string form,
# as a null terminated part of a larger string containing all of the
# words to match. Also does the "split into fuzz+1 parts" trick and
# builds an index showing that if a given part is identified what
# words should be checked in full and at what offset (from the positio
+n
# of the last character required to match the string) to do the match
# at.
# Once the fuzzy word list has been built a prepare_for_search()
# converts the interim structures into something usable by the Inline:
+:C
# code.

sub fuzz_store {
  my ($self,$str)=@_;

  croak "Can't fuzz_store() after prepare or conversion."
    if $self->{fuzzed} or $self->{filled_in};

  return if $self->{str_hash}{$str}++;

  unless ($self->{charlist}) {
    $self->{fuzzed_chars}{$_}++
      for split //,$str
  }

  my $id=++$self->{strings_stored};

  my $fuzz=$self->{fuzz};
  my $l=($self->{strlen}||=length($str));
  my $bl=$l+1;

  #print "Storing '$str' as $id\n";

  my $strings=($self->{strings}||=do {
    \(my $buff="\0" x (1024*1024));
  });


  if($id * $bl > length($$strings)) {
    $$strings.="\0" x length($$strings);
  }

  substr($$strings,($id-1)*$bl,$l)=$str;

  #unpack is used to split the word appropriatly
  my $upk=($self->{fuzzpack}||=do{
    my $size=int($l/($fuzz+1));
    $self->{max_part_size}=$size;
    "A$size" x ($fuzz+1)
  });

  my @parts=unpack $upk,$str;
  my $partshash=($self->{parts_hash}||={});
  my $pos=0;

  # we have the list of parts, now insert them
  # into the the interim HOH
  foreach my $p (0..$#parts) {
    #print "$parts[$p] ".(-$pos-length($parts[$p]))." $str\n";
    $partshash->{$parts[$p]}{-$pos-length($parts[$p])}.=pack PACKTYPE,
+$id;
    $pos+=length $parts[$p];
  }

}

#
# Converts interim structures into the final Trie and
# and related XOR indexes. After this routine has been
# called no more strings may be added to the object,
# and it is ready for doing the search
#
sub prepare {
  my $self=shift;

  unless ($self->{charlist}) {
    $self->_set_chars(join "",sort keys %{$self->{fuzzed_chars}});
  }


  my $partshash=($self->{parts_hash}||={});
  my $parts_idx=pack PACKTYPE,0;

  #foreach my $part (sort keys %$partshash) {
  #  print "$part \n";
  #  foreach my $ofs (sort {$a<=>$b }keys %{$partshash->{$part}}) {
  #    print "$ofs : ",join " ",unpack(PACKTYPE."*",$partshash->{$part
+}{$ofs}),"\n";
  #  }
  #}

  while (my ($part,$hash)=each %$partshash) {
    my $long=length($parts_idx)/4;
    $parts_idx.=join "",(map { pack(PACKTYPE,$_).$hash->{$_} }
                        keys %$hash),
                       pack(PACKTYPE,0);
    $self->{parts_stored}++;
    $self->_trie_store($part,$long);
    delete $partshash->{$part};
  }
  $self->{partsidx}=\$parts_idx;

  my $filled=$self->convert_trie_to_dfa;

  $self->{veclen}=int($self->{strings_stored} / 8)+1;
  $self->{vector}="\0" x ($self->{veclen}*$self->{strlen}*2);
  # *2 for safe measure. Heck we can afford it.

  undef $self->{str_hash};
  $self->{fuzzed}=1; # XXX this should block other methods.
}

#
# This is a wrapper to the C search routine. It provides
# space for a bitvector based "check test" of the most
# recently tested words and offsets. The idea is that we
# only ever need to test a word once at a given offset.
# The problem is that we will get spurious dupes when
# there is little or no fuzz involved in the match. For
# fuzz=2 we will report the same string 3 times if there
# is an exact match. The bitvector allows is to keep
# track whether we have already check a strng at a given
# offset. This means we need a bitvector for each character
# in the searched words (not fragment length).

sub fuzz_search {
  my ($self,$str)=@_;

  croak "Can't search before prepare_for_search()"
    unless $self->{fuzzed};

  my $ary=_c_fuzz_search($str,
                         length($str),
                         $self->{strlen},
                         $self->{veclen},
                         $self->{vector},
                         ${$self->{trie}},
                         ${$self->{partsidx}},
                         ${$self->{strings}},
                         $self->{bchars},
                         $self->{charcount},
                         $self->{fuzz},1);

  $self->{NumFullChecks}=$Fuzzy::Matcher::DFA::NumFullChecks;

  return $ary;
}



#
# Some diagnostic stuff. Mostly for debugging
#

sub show {
  my $self=shift;
  my ($trie,$charc,$size,$strings_stored,$parts_stored,$nodes,$strings
+,$partsidx)
     = @{$self}{qw(trie charcount size strings_stored parts_stored nod
+es strings partsidx)};
  print "== Fuzzy Trie == [$trie] Chars: $charc\n".
        "Allocated Values: $size (".($size*4).
        " bytes) Uses:".length($$trie)." bytes\n".
        "Trie Nodes Allocated: $nodes\n".
        ($strings_stored||0)." Fuzzy Strings Stored.\n";

  print "Used ".length($$strings)." bytes to store strings\n";
  my $max=50;
  if ($partsidx) {
    print "Used ".length($$partsidx)." to store index\n";
    my @unpack=unpack PACKTYPE."*",$$partsidx;

    if (@unpack>$max) {
      my $count=splice @unpack,$max,-1;
      print join(" ",@unpack),"... [$count]\n";
    } else {
      print join(" ",@unpack),"\n";
    }
  }

  print $self->{fuzzed} ? "Prepared " : "Not Prepared ",
        $self->{filled_in} ? "Converted " : "Not Converted ",
        "\n";
  my $ofs=1;
  while ($ofs<$size and $ofs<$max) {
      my $pos=$ofs;
      printf "%6d=%s\n",$pos,
             join(":",map {
                sprintf "%6d",
                        unpack PACKTYPE, substr($$trie,($ofs++)*4,4)
             } 0 .. $charc);
  }
  if ($ofs>$max) {
    print "(Truncated trie dump due to size)\n";
  }

}

1;
__DATA__
__C__
#undef trie_DEBUG
#undef trie_PACKED_RETURN
#ifdef trie_PACKED_RETURN

struct match_return {
  UV ofs;
  UV diff;
  UV match;
};
typedef struct match_return Match_Return;

#endif

/*

   search_string  - string to be searched
   search_len     - length of string being searched
   strlen  - length of matched word
   veclen  - length of a single vector string.
   seenvec - string of strlen*veclen bytes for the bitvector
   trie    - array of long. Value elements are expected to point
             into stridx.
   stridx  - index of offsets and words. Ofsets are always negative
             OFS, IDXPTR,[ IDXPTR...] , OFS, IDXPTR,[ IDXPTR...], 0
   strings - array of char holding all of the strings. stridx
             points into this structure via IDXPTR elements.
   charmap - convert ASCII char codes into indexes for the trie.
             Ie A -> 0 C -> 1, etc.
   charcount - How many chars are there in each node of the trie
   fuzz    - how many characters fuzz we want
   start   - start state. should always be 1.

   Returns an arrayref containing tripples for each match:
      OFS, Matches String, Difference

*/

AV* _c_fuzz_search(char *search_string,
                    UV search_len,
                    UV strlen,
                    UV veclen,
                    char *seenvec,
                    UV *trie,
                    IV *stridx,
                    char *strings,
                    char *charmap,
                    UV charcount,
                    UV fuzz,
                    UV start)
{
  UV ofs;
  UV cpos = 0;
  UV upto = search_len-strlen;
  UV bmaxlen = strlen+1;
  const unsigned char mask[8]={128,64,32,16,8,4,2,1};
  UV num_fullchecks=0;
  AV* return_array=newAV();


  start = start ? start : 1;
  ofs = start;
  #ifdef trie_DEBUG
  printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string);
  #endif

  while( search_string[cpos] && ofs ) {
    UV x;
    UV idx;
    char curchar;
    unsigned char curcharidx;

    #ifdef trie_DEBUG
    printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,cpos,st
+rlen,veclen);
    #endif

    Zero(&seenvec[(cpos % strlen)*veclen],veclen,char);
    curchar=search_string[cpos++];
    curcharidx=charmap[ curchar ];

    #ifdef trie_DEBUG
    printf("Pos: %d -- State: %d -- Char: '%c' CIdx: '%d'\n",cpos-1,of
+s,curchar,curcharidx);
    #endif


    if (curcharidx==255) {
      #ifdef trie_DEBUG
      printf("Reset to start: %d (bad char)\n",ofs);
      #endif
      ofs = start;
      continue;
    }

    ofs = trie[ ofs + curcharidx ];

    #ifdef trie_DEBUG
    printf("New ofs: %d\n",ofs);
    #endif

    idx=trie[ofs+charcount];
    if (!idx)
      continue;

    #ifdef trie_DEBUG
    printf("Match idx: %d\n",idx);
    #endif

    while (stridx[idx]<0) {
      UV srch_ofs;
      IV sofs = stridx[idx++];

      if ((sofs + cpos<0)||(sofs + cpos > upto)) {
        while(stridx[idx]>0) idx++;
        continue;
      }

      srch_ofs = cpos + sofs;

      #ifdef trie_DEBUG
      printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs);
      #endif

      x= (srch_ofs % strlen)*veclen;

      while (stridx[idx]>0) {
        UV check_byte=x+(long)(stridx[idx]/8);
        unsigned char check_bit=stridx[idx] % 8;
        unsigned char seen=mask[check_bit] & seenvec[check_byte];

        #ifdef trie_DEBUG
        printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d\n",
                 stridx[idx],check_byte,check_bit,seen);
        #endif

        if (!seen) {
          char *c= &strings[ (stridx[idx]-1)*bmaxlen ];
          char *s= &search_string[ srch_ofs ];
          UV diff=0;
          num_fullchecks++;
          seenvec[check_byte]=seenvec[check_byte] | mask[check_bit];


          #ifdef trie_DEBUG
          printf("\tsearching at ofs %d / strings_of %d (cpos:%d)\n",s
+rch_ofs,stridx[idx],cpos);
          printf("\t> C '%s' | S %s = ",c,s);
          #endif

          while (*c && diff<=fuzz) {
            if (*c++ != *s++) diff++;
          }

          #ifdef trie_DEBUG
          printf("\tDiff: %d Fuzz:%d\n",diff,fuzz);
          #endif

          if (diff <= fuzz) {
#ifdef trie_PACKED_RETURN
            {
              Match_Return ret;
#ifdef HAS_HTONL
              ret.ofs=PerlSock_htonl(ofs);
              ret.diff=PerlSock_htonl(diff);
              ret.match=PerlSock_htonl((stridx[idx]-1));
#else
              ret.ofs=ofs;
              ret.diff=diff;
              ret.match=(stridx[idx]-1);
#endif
              {
                SV *sv=newSVpvn((char*)(&ret),sizeof(ret));
                av_push(return_array,sv);
              }

            }
#else
            {
              SV *sv=newSViv((UV)(srch_ofs));
              av_push(return_array,sv);
            }
            {
              SV *sv=newSViv((IV)diff);
              av_push(return_array,sv);
            }
            {
              SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmaxlen],0);
              av_push(return_array,sv);
            }
#endif
          }
        }
        idx++;
      }
    }
  }
  {
    SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullChecks",
+TRUE);
    sv_setuv(num_fullchecks_sv,num_fullchecks);
  }
  return return_array;
}
[download]

---
demerphq

Comment on Re^3: Algorithm Showdown: Fuzzy Matching (DFA.pm) Download Code

Replies are listed 'Best First'.

Re^4: Algorithm Showdown: Fuzzy Matching (DFA.pm)
by demerphq (Chancellor) on Dec 11, 2004 at 11:36 UTC

Patch applied to fix the bug BrowserUk identified and also a C cleanup. (Updated Patch)

--- PostedDFA.pm    2004-12-11 12:31:19.714617600 +0100
+++ DFA.pm    2004-12-11 15:37:03.829067200 +0100
@@ -48,6 +48,15 @@
 
 sub _init {
   my ($self,$chars)=@_;
+  if ($chars) {
+    $self->_set_chars($chars);
+  }
+  return $self;
+}
+
+
+sub _set_chars {
+  my ($self,$chars)=@_;
 
   $chars='ACGT' unless defined $chars;
   my @achars;
@@ -55,18 +64,17 @@
   $self->{charcount}=0;
   $self->{charlist}=[];
   for my $c (split //,($chars||'ACGT')) {
-    next if $hchars{$c} || $hchars{lc($c)};
+    next if $hchars{$c};
     push @{$self->{charlist}},$c;
     $hchars{$c}          = $self->{charcount};
     $achars[ord $c]      = $self->{charcount};
-    $hchars{lc($c)}      = $self->{charcount};
-    $achars[ord(lc($c))] = $self->{charcount};
     $self->{charcount}++
   }
 
+
   @{$self}{'achars','bchars','hchars'}=(
     \@achars,
-    pack( "C*", map { defined $_ ? $_ : 255 } @achars),
+    pack( "C*", map { defined $achars[$_] ? $achars[$_] : 255 } 0..25
+5),
     \%hchars
   );
 
@@ -77,7 +85,6 @@
   return $self;
 }
 
-
 # Associates an integer to a string in the trie.
 # trie is a flat string of packed longs.
 # Space allocated for the trie increases by doubling
@@ -128,8 +135,9 @@
 # we will never store strings that can completely overlap each other.
 # Ie if ABBA and BB are valid strings and we search ABBA, we will not
 # record that we matched both, only ABBA. This isnt a problem for
-# the purposes at hand as all of our words will be most 1 character
-# different in lengths.
+# the purposes at hand as all of our words will be the same length.
+# A more complicated storage structure could facilitate the issue of
+# overlaps.
 # The Idea here is we recurse through the trie keeping "cursors" that
 # represent where we would be if we backtracked at each new letter
 # encountered. When we find an empty slot we use the state of the
@@ -206,6 +214,11 @@
 
   return if $self->{str_hash}{$str}++;
 
+  unless ($self->{charlist}) {
+    $self->{fuzzed_chars}{$_}++
+      for split //,$str
+  }
+
   my $id=++$self->{strings_stored};
 
   my $fuzz=$self->{fuzz};
@@ -254,6 +267,12 @@
 #
 sub prepare {
   my $self=shift;
+
+  unless ($self->{charlist}) {
+    $self->_set_chars(join "",sort keys %{$self->{fuzzed_chars}});
+  }
+
+
   my $partshash=($self->{parts_hash}||={});
   my $parts_idx=pack PACKTYPE,0;
 
@@ -336,19 +355,26 @@
         " bytes) Uses:".length($$trie)." bytes\n".
         "Trie Nodes Allocated: $nodes\n".
         ($strings_stored||0)." Fuzzy Strings Stored.\n";
-  #print Dump($strings);
+
   print "Used ".length($$strings)." bytes to store strings\n";
+  my $max=50;
   if ($partsidx) {
     print "Used ".length($$partsidx)." to store index\n";
-    #my @unpack=unpack PACKTYPE."*",$$partsidx;
-    #print "@unpack\n";
+    my @unpack=unpack PACKTYPE."*",$$partsidx;
+
+    if (@unpack>$max) {
+      my $count=splice @unpack,$max,-1;
+      print join(" ",@unpack),"... [$count]\n";
+    } else {
+      print join(" ",@unpack),"\n";
+    }
   }
 
   print $self->{fuzzed} ? "Prepared " : "Not Prepared ",
         $self->{filled_in} ? "Converted " : "Not Converted ",
         "\n";
   my $ofs=1;
-  while ($ofs<$size and $ofs<1000) {
+  while ($ofs<$size and $ofs<$max) {
       my $pos=$ofs;
       printf "%6d=%s\n",$pos,
              join(":",map {
@@ -356,7 +382,7 @@
                         unpack PACKTYPE, substr($$trie,($ofs++)*4,4)
              } 0 .. $charc);
   }
-  if ($ofs>1000) {
+  if ($ofs>$max) {
     print "(Truncated trie dump due to size)\n";
   }
 
@@ -413,147 +439,155 @@
                     char *charmap,
                     UV charcount,
                     UV fuzz,
-                    UV start) {
-    UV ofs;
-    UV cpos = 0;
-    UV upto = search_len-strlen;
-    UV bmaxlen = strlen+1;
-    const unsigned char mask[8]={128,64,32,16,8,4,2,1};
-    UV num_fullchecks=0;
-    AV* return_array=newAV();
+                    UV start)
+{
+  UV ofs;
+  UV cpos = 0;
+  UV upto = search_len-strlen;
+  UV bmaxlen = strlen+1;
+  const unsigned char mask[8]={128,64,32,16,8,4,2,1};
+  UV num_fullchecks=0;
+  AV* return_array=newAV();
 
 
-    start = start ? start : 1;
-    ofs = start;
+  start = start ? start : 1;
+  ofs = start;
+  #ifdef trie_DEBUG
+  printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string);
+  #endif
+
+  while( search_string[cpos] && ofs ) {
+    UV x;
+    UV idx;
+    char curchar;
+    unsigned char curcharidx;
+
     #ifdef trie_DEBUG
-      printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string);
+    printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,cpos,s
+trlen,veclen);
     #endif
 
-    while( search_string[cpos] && ofs ) {
-        UV x;
-        char c;
-
-        #ifdef trie_DEBUG
-          printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,
+cpos,strlen,veclen);
-        #endif
+    Zero(&seenvec[(cpos % strlen)*veclen],veclen,char);
+    curchar=search_string[cpos++];
+    curcharidx=charmap[ curchar ];
 
-        Zero(&seenvec[(cpos % strlen)*veclen],veclen,char);
-        /*
-        for (x=0;x<veclen;x++)
-          seenvec[(cpos % strlen)*veclen+x]=0;
-        */
+    #ifdef trie_DEBUG
+    printf("Pos: %d -- State: %d -- Char: '%c' CIdx: '%d'\n",cpos-1,o
+fs,curchar,curcharidx);
+    #endif
 
 
-        #ifdef trie_DEBUG
-          printf("Pos: %d -- State: %d -- Char: %c\n",cpos,ofs,search
+_string[cpos]);
-        #endif
+    if (curcharidx==255) {
+      #ifdef trie_DEBUG
+      printf("Reset to start: %d (bad char)\n",ofs);
+      #endif
+      ofs = start;
+      continue;
+    }
 
-        c=charmap[ search_string[cpos++] ];
-        if (c==255) {
-          ofs = start;
-        } else  {
-          ofs = trie[ ofs + c ];
+    ofs = trie[ ofs + curcharidx ];
 
-          #ifdef trie_DEBUG
-            printf("ofs: %d\n",ofs);
-          #endif
+    #ifdef trie_DEBUG
+    printf("New ofs: %d\n",ofs);
+    #endif
 
-          if (trie[ofs+charcount]) {
+    idx=trie[ofs+charcount];
+    if (!idx)
+      continue;
 
-            UV idx=trie[ofs+charcount];
+    #ifdef trie_DEBUG
+    printf("Match idx: %d\n",idx);
+    #endif
 
-            #ifdef trie_DEBUG
-              printf("Match idx: %d\n",idx);
-            #endif
+    while (stridx[idx]<0) {
+      UV srch_ofs;
+      IV sofs = stridx[idx++];
 
-            while (stridx[idx]) {
-              IV sofs = stridx[idx++];
-              UV srch_ofs = cpos + sofs;
+      if ((sofs + cpos<0)||(sofs + cpos > upto)) {
+        while(stridx[idx]>0) idx++;
+        continue;
+      }
 
-              #ifdef trie_DEBUG
-                printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs);
-              #endif
+      srch_ofs = cpos + sofs;
 
-              x= (srch_ofs % strlen)*veclen;
+      #ifdef trie_DEBUG
+      printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs);
+      #endif
 
-              while (stridx[idx]>0) {
+      x= (srch_ofs % strlen)*veclen;
 
-                if ( srch_ofs>=0 && srch_ofs<=upto ) {
-                  UV check_byte=x+(long)(stridx[idx]/8);
-                  unsigned char check_bit=stridx[idx] % 8;
-                  unsigned char seen=mask[check_bit] & seenvec[check_
+byte];
+      while (stridx[idx]>0) {
+        UV check_byte=x+(long)(stridx[idx]/8);
+        unsigned char check_bit=stridx[idx] % 8;
+        unsigned char seen=mask[check_bit] & seenvec[check_byte];
 
-                  #ifdef trie_DEBUG
-                    printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d
+\n",
-                           stridx[idx],check_byte,check_bit,seen);
-                  #endif
+        #ifdef trie_DEBUG
+        printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d\n",
+                 stridx[idx],check_byte,check_bit,seen);
+        #endif
 
-                  if (!seen) {
-                    char *c= &strings[ (stridx[idx]-1)*bmaxlen ];
-                    char *s= &search_string[ srch_ofs ];
-                    UV diff=0;
-                    num_fullchecks++;
-                    seenvec[check_byte]=seenvec[check_byte] | mask[ch
+eck_bit];
+        if (!seen) {
+          char *c= &strings[ (stridx[idx]-1)*bmaxlen ];
+          char *s= &search_string[ srch_ofs ];
+          UV diff=0;
+          num_fullchecks++;
+          seenvec[check_byte]=seenvec[check_byte] | mask[check_bit];
 
 
-                    #ifdef trie_DEBUG
-                      printf("\tsearching at ofs %d / strings_of %d (
+cpos:%d)\n",srch_ofs,stridx[idx],cpos);
-                      printf("\t> C '%s' | S %s = ",c,s);
-                    #endif
+          #ifdef trie_DEBUG
+          printf("\tsearching at ofs %d / strings_of %d (cpos:%d)\n",
+srch_ofs,stridx[idx],cpos);
+          printf("\t> C '%s' | S %s = ",c,s);
+          #endif
 
-                    while (*c && diff<=fuzz) {
-                      if (*c++ != *s++) diff++;
-                    }
+          while (*c && diff<=fuzz) {
+            if (*c++ != *s++) diff++;
+          }
 
-                    #ifdef trie_DEBUG
-                      printf("\tDiff: %d Fuzz:%d\n",diff,fuzz);
-                    #endif
+          #ifdef trie_DEBUG
+          printf("\tDiff: %d Fuzz:%d\n",diff,fuzz);
+          #endif
 
-                    if (diff <= fuzz) {
-  #ifdef trie_PACKED_RETURN
-                      {
-                        Match_Return ret;
-  #ifdef HAS_HTONL
-                        ret.ofs=PerlSock_htonl(ofs);
-                        ret.diff=PerlSock_htonl(diff);
-                        ret.match=PerlSock_htonl((stridx[idx]-1));
-  #else
-                        ret.ofs=ofs;
-                        ret.diff=diff;
-                        ret.match=(stridx[idx]-1);
-  #endif
-                        {
-                          SV *sv=newSVpvn((char*)(&ret),sizeof(ret));
-                          av_push(return_array,sv);
-                        }
+          if (diff <= fuzz) {
+#ifdef trie_PACKED_RETURN
+            {
+              Match_Return ret;
+#ifdef HAS_HTONL
+              ret.ofs=PerlSock_htonl(ofs);
+              ret.diff=PerlSock_htonl(diff);
+              ret.match=PerlSock_htonl((stridx[idx]-1));
+#else
+              ret.ofs=ofs;
+              ret.diff=diff;
+              ret.match=(stridx[idx]-1);
+#endif
+              {
+                SV *sv=newSVpvn((char*)(&ret),sizeof(ret));
+                av_push(return_array,sv);
+              }
 
-                      }
-  #else
-                      {
-                        SV *sv=newSViv((UV)(srch_ofs));
-                        av_push(return_array,sv);
-                      }
-                      {
-                        SV *sv=newSViv((IV)diff);
-                        av_push(return_array,sv);
-                      }
-                      {
-                        SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmax
+len],0);
-                        av_push(return_array,sv);
-                      }
-  #endif
-                    }
-                  }
-                }
-                idx++;
-              } /* each string */
-            } /* each ofs */
-          } /* do we have a hit */
-       } /* legal char? */
-    }   /* walk the string */
-    {
-      SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullCheck
+s",TRUE);
-      sv_setuv(num_fullchecks_sv,num_fullchecks);
+            }
+#else
+            {
+              SV *sv=newSViv((UV)(srch_ofs));
+              av_push(return_array,sv);
+            }
+            {
+              SV *sv=newSViv((IV)diff);
+              av_push(return_array,sv);
+            }
+            {
+              SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmaxlen],0);
+              av_push(return_array,sv);
+            }
+#endif
+          }
+        }
+        idx++;
+      }
     }
-    return return_array;
+  }
+  {
+    SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullChecks"
+,TRUE);
+    sv_setuv(num_fullchecks_sv,num_fullchecks);
+  }
+  return return_array;
 }
+
[download]

---
demerphq

[reply]
[d/l]


Pathologically Eclectic Rubbish Lister
	PerlMonks