in reply to Re^2: Algorithm Showdown: Fuzzy Matching (Matcher.pm) in thread Algorithm Showdown: Fuzzy Matching
This version is my implementation of ysths idea. Like him I split the input strings into Fuzz+1 parts and then use something like Xor to count the difference. But i use a DFA to find all of the exact matches. This is a hybrid Perl/Inline::C solution, with the storage and memory management done pretty much entirely in Perl, and the search routine done in Inline::C. It is slow to prepare() but searches are typically extremely quick. Like ysths solution on degenerate datasets it performs as well/badly as Xor.pm but these cases would seem to be unusual. The larger the ratio of WordChars/Fuzz is the faster this algorithm will perform (and the more memory it will use.)
Updated:Patches applied. Fix searching strings with non alphabet chars in them. Make providing the alphabet optional. Fix error where a UV was being assigned a negative number. Minor C cleanup.
package Fuzzy::Matcher::DFA;
use strict;
use warnings;
use vars qw/ $VERSION $Typemap @ISA /;
use constant PACKTYPE=>"l!";
use Data::Dump::Streamer;
use Fuzzy::Matcher;
use Carp;
@ISA=qw(Fuzzy::Matcher);
$VERSION=0.01;
# Underlying idea by [ysth], DFA/Inline::C implementation by [demerphq
+]
BEGIN {
# this is required by certain Perl versions
$| = 1;
$Typemap = __FILE__.".typemap";
my $typemap=<<'TYPEMAP';
long * T_OPAQUEPTR
float * T_OPAQUEPTR
UV * T_OPAQUEPTR
IV * T_OPAQUEPTR
TYPEMAP
unless ( -e $Typemap and -s $Typemap==length($typemap) ) {
open my $tm, ">", $Typemap or die "$Typemap:$!";
print $tm $typemap;
close $tm;
}
}
# use Inline qw(Info Noisy);
use Inline C => Config =>
# ENABLE => 'FORCE_BUILD',
# DISABLE => 'CLEAN_AFTER_BUILD',
TYPEMAPS => $Typemap,
VERSION => $VERSION,
NAME => __PACKAGE__,
;
use Inline 'C';
# create a new Fuzzy Matcher. This mostly deals with
# alphabet/character type issues. The trie we build
# needs this info. Uppercase and lowercase letters
# are treated as the same character.
sub _init {
my ($self,$chars)=@_;
if ($chars) {
$self->_set_chars($chars);
}
return $self;
}
sub _set_chars {
my ($self,$chars)=@_;
$chars='ACGT' unless defined $chars;
my @achars;
my %hchars;
$self->{charcount}=0;
$self->{charlist}=[];
for my $c (split //,($chars||'ACGT')) {
next if $hchars{$c};
push @{$self->{charlist}},$c;
$hchars{$c} = $self->{charcount};
$achars[ord $c] = $self->{charcount};
$self->{charcount}++
}
@{$self}{'achars','bchars','hchars'}=(
\@achars,
pack( "C*", map { defined $achars[$_] ? $achars[$_] : 255 } 0..255
+),
\%hchars
);
my $str="\0" x (1024*1024);
$self->{trie}=\$str;
$self->{size}=$self->{charcount}+2;
$self->{nodes}=1;
return $self;
}
# Associates an integer to a string in the trie.
# trie is a flat string of packed longs.
# Space allocated for the trie increases by doubling
# and starts at 1k
sub _trie_store {
my ($self,$str,$integer)=@_;
croak "Can't _trie_store() after prepare or conversion."
if $self->{fuzzed} or $self->{filled_in};
my ($trie,$data,$hchars,$size,$charc)=@{$self}{qw(trie data hchars s
+ize charcount)};
my $pos=1;
my @chars=split //,$str;
if (@chars) {
foreach my $char ( @chars ) {
my $ofs=$pos+$hchars->{$char};
my $v=unpack PACKTYPE,substr($$trie,$ofs*4,4);
unless ($v) {
$pos=$size;
$size+=$charc+1;
$self->{nodes}++;
$self->{size}=$size;
if ($size*4>length($$trie)) {
$$trie.="\0" x length($$trie);
}
substr($$trie,$ofs*4,4)=pack PACKTYPE,$pos;
} else {
$pos=$v;
}
}
}
$self->{size}=$size;
if ( my $idx=unpack PACKTYPE,substr($$trie, ($pos + $charc)*4,4) ) {
substr($$trie, ($pos + $charc)*4,4)= pack PACKTYPE,$integer;
return $integer
} else {
substr($$trie, ($pos + $charc)*4,4)=pack PACKTYPE,$integer;
return $integer;
}
}
# Recursive routine which fills in any 0 slots in the trie with the
# appropriate state. Note that this relies on the fact that
# we will never store strings that can completely overlap each other.
# Ie if ABBA and BB are valid strings and we search ABBA, we will not
# record that we matched both, only ABBA. This isnt a problem for
# the purposes at hand as all of our words will be the same length.
# A more complicated storage structure could facilitate the issue of
# overlaps.
# The Idea here is we recurse through the trie keeping "cursors" that
# represent where we would be if we backtracked at each new letter
# encountered. When we find an empty slot we use the state of the
# oldest valid cursor as the new state. This is much faster than
# the older version I used.
sub _fill_in {
my ($self,$node,$path,@csrs)=@_;
my ($trie,$charc,$list,$bchars)=@{$self}{qw(trie charcount charlist
+bchars)};
my @defer;
#print "$node> $path\n";
my $added=0;
foreach my $char (0..$charc-1) {
my @csrs=@csrs;
@csrs=grep {
$_=unpack PACKTYPE,substr($$trie,($_+$char)*4,4);
$_
} @csrs;
#print "try: '$path$list->[$char]' (@csrs)\n";
my $v=unpack PACKTYPE,substr($$trie,($node+$char)*4,4);
if ($v) {
$added+=$self->_fill_in($v,$path.$list->[$char],@csrs,1);
} else {
my ($state)=(@csrs,1);
push @defer,[$char,$state]
}
}
foreach my $defer (@defer) {
my ($char,$state)=@$defer;
substr($$trie,($node+$char)*4,4)=pack PACKTYPE,$state;
#print "$path$list->[$char] FAIL ($state)\n";
$added++;
}
return $added;
}
#
# Just a wrapper to _fill_in()
#
sub convert_trie_to_dfa {
my $self=shift;
croak "Can't convert twice."
if $self->{filled_in};
$self->{filled_in}=1;
my $added=$self->_fill_in(1,"");
print "Filled in $added elements\n"
unless defined wantarray;
return $added;
}
#
# Takes a word to fuzzy match, if the word is new stores it in
# a two different places. First in a packed string form,
# as a null terminated part of a larger string containing all of the
# words to match. Also does the "split into fuzz+1 parts" trick and
# builds an index showing that if a given part is identified what
# words should be checked in full and at what offset (from the positio
+n
# of the last character required to match the string) to do the match
# at.
# Once the fuzzy word list has been built a prepare_for_search()
# converts the interim structures into something usable by the Inline:
+:C
# code.
sub fuzz_store {
my ($self,$str)=@_;
croak "Can't fuzz_store() after prepare or conversion."
if $self->{fuzzed} or $self->{filled_in};
return if $self->{str_hash}{$str}++;
unless ($self->{charlist}) {
$self->{fuzzed_chars}{$_}++
for split //,$str
}
my $id=++$self->{strings_stored};
my $fuzz=$self->{fuzz};
my $l=($self->{strlen}||=length($str));
my $bl=$l+1;
#print "Storing '$str' as $id\n";
my $strings=($self->{strings}||=do {
\(my $buff="\0" x (1024*1024));
});
if($id * $bl > length($$strings)) {
$$strings.="\0" x length($$strings);
}
substr($$strings,($id-1)*$bl,$l)=$str;
#unpack is used to split the word appropriatly
my $upk=($self->{fuzzpack}||=do{
my $size=int($l/($fuzz+1));
$self->{max_part_size}=$size;
"A$size" x ($fuzz+1)
});
my @parts=unpack $upk,$str;
my $partshash=($self->{parts_hash}||={});
my $pos=0;
# we have the list of parts, now insert them
# into the the interim HOH
foreach my $p (0..$#parts) {
#print "$parts[$p] ".(-$pos-length($parts[$p]))." $str\n";
$partshash->{$parts[$p]}{-$pos-length($parts[$p])}.=pack PACKTYPE,
+$id;
$pos+=length $parts[$p];
}
}
#
# Converts interim structures into the final Trie and
# and related XOR indexes. After this routine has been
# called no more strings may be added to the object,
# and it is ready for doing the search
#
sub prepare {
my $self=shift;
unless ($self->{charlist}) {
$self->_set_chars(join "",sort keys %{$self->{fuzzed_chars}});
}
my $partshash=($self->{parts_hash}||={});
my $parts_idx=pack PACKTYPE,0;
#foreach my $part (sort keys %$partshash) {
# print "$part \n";
# foreach my $ofs (sort {$a<=>$b }keys %{$partshash->{$part}}) {
# print "$ofs : ",join " ",unpack(PACKTYPE."*",$partshash->{$part
+}{$ofs}),"\n";
# }
#}
while (my ($part,$hash)=each %$partshash) {
my $long=length($parts_idx)/4;
$parts_idx.=join "",(map { pack(PACKTYPE,$_).$hash->{$_} }
keys %$hash),
pack(PACKTYPE,0);
$self->{parts_stored}++;
$self->_trie_store($part,$long);
delete $partshash->{$part};
}
$self->{partsidx}=\$parts_idx;
my $filled=$self->convert_trie_to_dfa;
$self->{veclen}=int($self->{strings_stored} / 8)+1;
$self->{vector}="\0" x ($self->{veclen}*$self->{strlen}*2);
# *2 for safe measure. Heck we can afford it.
undef $self->{str_hash};
$self->{fuzzed}=1; # XXX this should block other methods.
}
#
# This is a wrapper to the C search routine. It provides
# space for a bitvector based "check test" of the most
# recently tested words and offsets. The idea is that we
# only ever need to test a word once at a given offset.
# The problem is that we will get spurious dupes when
# there is little or no fuzz involved in the match. For
# fuzz=2 we will report the same string 3 times if there
# is an exact match. The bitvector allows is to keep
# track whether we have already check a strng at a given
# offset. This means we need a bitvector for each character
# in the searched words (not fragment length).
sub fuzz_search {
my ($self,$str)=@_;
croak "Can't search before prepare_for_search()"
unless $self->{fuzzed};
my $ary=_c_fuzz_search($str,
length($str),
$self->{strlen},
$self->{veclen},
$self->{vector},
${$self->{trie}},
${$self->{partsidx}},
${$self->{strings}},
$self->{bchars},
$self->{charcount},
$self->{fuzz},1);
$self->{NumFullChecks}=$Fuzzy::Matcher::DFA::NumFullChecks;
return $ary;
}
#
# Some diagnostic stuff. Mostly for debugging
#
sub show {
my $self=shift;
my ($trie,$charc,$size,$strings_stored,$parts_stored,$nodes,$strings
+,$partsidx)
= @{$self}{qw(trie charcount size strings_stored parts_stored nod
+es strings partsidx)};
print "== Fuzzy Trie == [$trie] Chars: $charc\n".
"Allocated Values: $size (".($size*4).
" bytes) Uses:".length($$trie)." bytes\n".
"Trie Nodes Allocated: $nodes\n".
($strings_stored||0)." Fuzzy Strings Stored.\n";
print "Used ".length($$strings)." bytes to store strings\n";
my $max=50;
if ($partsidx) {
print "Used ".length($$partsidx)." to store index\n";
my @unpack=unpack PACKTYPE."*",$$partsidx;
if (@unpack>$max) {
my $count=splice @unpack,$max,-1;
print join(" ",@unpack),"... [$count]\n";
} else {
print join(" ",@unpack),"\n";
}
}
print $self->{fuzzed} ? "Prepared " : "Not Prepared ",
$self->{filled_in} ? "Converted " : "Not Converted ",
"\n";
my $ofs=1;
while ($ofs<$size and $ofs<$max) {
my $pos=$ofs;
printf "%6d=%s\n",$pos,
join(":",map {
sprintf "%6d",
unpack PACKTYPE, substr($$trie,($ofs++)*4,4)
} 0 .. $charc);
}
if ($ofs>$max) {
print "(Truncated trie dump due to size)\n";
}
}
1;
__DATA__
__C__
#undef trie_DEBUG
#undef trie_PACKED_RETURN
#ifdef trie_PACKED_RETURN
struct match_return {
UV ofs;
UV diff;
UV match;
};
typedef struct match_return Match_Return;
#endif
/*
search_string - string to be searched
search_len - length of string being searched
strlen - length of matched word
veclen - length of a single vector string.
seenvec - string of strlen*veclen bytes for the bitvector
trie - array of long. Value elements are expected to point
into stridx.
stridx - index of offsets and words. Ofsets are always negative
OFS, IDXPTR,[ IDXPTR...] , OFS, IDXPTR,[ IDXPTR...], 0
strings - array of char holding all of the strings. stridx
points into this structure via IDXPTR elements.
charmap - convert ASCII char codes into indexes for the trie.
Ie A -> 0 C -> 1, etc.
charcount - How many chars are there in each node of the trie
fuzz - how many characters fuzz we want
start - start state. should always be 1.
Returns an arrayref containing tripples for each match:
OFS, Matches String, Difference
*/
AV* _c_fuzz_search(char *search_string,
UV search_len,
UV strlen,
UV veclen,
char *seenvec,
UV *trie,
IV *stridx,
char *strings,
char *charmap,
UV charcount,
UV fuzz,
UV start)
{
UV ofs;
UV cpos = 0;
UV upto = search_len-strlen;
UV bmaxlen = strlen+1;
const unsigned char mask[8]={128,64,32,16,8,4,2,1};
UV num_fullchecks=0;
AV* return_array=newAV();
start = start ? start : 1;
ofs = start;
#ifdef trie_DEBUG
printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string);
#endif
while( search_string[cpos] && ofs ) {
UV x;
UV idx;
char curchar;
unsigned char curcharidx;
#ifdef trie_DEBUG
printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,cpos,st
+rlen,veclen);
#endif
Zero(&seenvec[(cpos % strlen)*veclen],veclen,char);
curchar=search_string[cpos++];
curcharidx=charmap[ curchar ];
#ifdef trie_DEBUG
printf("Pos: %d -- State: %d -- Char: '%c' CIdx: '%d'\n",cpos-1,of
+s,curchar,curcharidx);
#endif
if (curcharidx==255) {
#ifdef trie_DEBUG
printf("Reset to start: %d (bad char)\n",ofs);
#endif
ofs = start;
continue;
}
ofs = trie[ ofs + curcharidx ];
#ifdef trie_DEBUG
printf("New ofs: %d\n",ofs);
#endif
idx=trie[ofs+charcount];
if (!idx)
continue;
#ifdef trie_DEBUG
printf("Match idx: %d\n",idx);
#endif
while (stridx[idx]<0) {
UV srch_ofs;
IV sofs = stridx[idx++];
if ((sofs + cpos<0)||(sofs + cpos > upto)) {
while(stridx[idx]>0) idx++;
continue;
}
srch_ofs = cpos + sofs;
#ifdef trie_DEBUG
printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs);
#endif
x= (srch_ofs % strlen)*veclen;
while (stridx[idx]>0) {
UV check_byte=x+(long)(stridx[idx]/8);
unsigned char check_bit=stridx[idx] % 8;
unsigned char seen=mask[check_bit] & seenvec[check_byte];
#ifdef trie_DEBUG
printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d\n",
stridx[idx],check_byte,check_bit,seen);
#endif
if (!seen) {
char *c= &strings[ (stridx[idx]-1)*bmaxlen ];
char *s= &search_string[ srch_ofs ];
UV diff=0;
num_fullchecks++;
seenvec[check_byte]=seenvec[check_byte] | mask[check_bit];
#ifdef trie_DEBUG
printf("\tsearching at ofs %d / strings_of %d (cpos:%d)\n",s
+rch_ofs,stridx[idx],cpos);
printf("\t> C '%s' | S %s = ",c,s);
#endif
while (*c && diff<=fuzz) {
if (*c++ != *s++) diff++;
}
#ifdef trie_DEBUG
printf("\tDiff: %d Fuzz:%d\n",diff,fuzz);
#endif
if (diff <= fuzz) {
#ifdef trie_PACKED_RETURN
{
Match_Return ret;
#ifdef HAS_HTONL
ret.ofs=PerlSock_htonl(ofs);
ret.diff=PerlSock_htonl(diff);
ret.match=PerlSock_htonl((stridx[idx]-1));
#else
ret.ofs=ofs;
ret.diff=diff;
ret.match=(stridx[idx]-1);
#endif
{
SV *sv=newSVpvn((char*)(&ret),sizeof(ret));
av_push(return_array,sv);
}
}
#else
{
SV *sv=newSViv((UV)(srch_ofs));
av_push(return_array,sv);
}
{
SV *sv=newSViv((IV)diff);
av_push(return_array,sv);
}
{
SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmaxlen],0);
av_push(return_array,sv);
}
#endif
}
}
idx++;
}
}
}
{
SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullChecks",
+TRUE);
sv_setuv(num_fullchecks_sv,num_fullchecks);
}
return return_array;
}
Re^4: Algorithm Showdown: Fuzzy Matching (DFA.pm)
by demerphq (Chancellor) on Dec 11, 2004 at 11:36 UTC
|
--- PostedDFA.pm 2004-12-11 12:31:19.714617600 +0100
+++ DFA.pm 2004-12-11 15:37:03.829067200 +0100
@@ -48,6 +48,15 @@
sub _init {
my ($self,$chars)=@_;
+ if ($chars) {
+ $self->_set_chars($chars);
+ }
+ return $self;
+}
+
+
+sub _set_chars {
+ my ($self,$chars)=@_;
$chars='ACGT' unless defined $chars;
my @achars;
@@ -55,18 +64,17 @@
$self->{charcount}=0;
$self->{charlist}=[];
for my $c (split //,($chars||'ACGT')) {
- next if $hchars{$c} || $hchars{lc($c)};
+ next if $hchars{$c};
push @{$self->{charlist}},$c;
$hchars{$c} = $self->{charcount};
$achars[ord $c] = $self->{charcount};
- $hchars{lc($c)} = $self->{charcount};
- $achars[ord(lc($c))] = $self->{charcount};
$self->{charcount}++
}
+
@{$self}{'achars','bchars','hchars'}=(
\@achars,
- pack( "C*", map { defined $_ ? $_ : 255 } @achars),
+ pack( "C*", map { defined $achars[$_] ? $achars[$_] : 255 } 0..25
+5),
\%hchars
);
@@ -77,7 +85,6 @@
return $self;
}
-
# Associates an integer to a string in the trie.
# trie is a flat string of packed longs.
# Space allocated for the trie increases by doubling
@@ -128,8 +135,9 @@
# we will never store strings that can completely overlap each other.
# Ie if ABBA and BB are valid strings and we search ABBA, we will not
# record that we matched both, only ABBA. This isnt a problem for
-# the purposes at hand as all of our words will be most 1 character
-# different in lengths.
+# the purposes at hand as all of our words will be the same length.
+# A more complicated storage structure could facilitate the issue of
+# overlaps.
# The Idea here is we recurse through the trie keeping "cursors" that
# represent where we would be if we backtracked at each new letter
# encountered. When we find an empty slot we use the state of the
@@ -206,6 +214,11 @@
return if $self->{str_hash}{$str}++;
+ unless ($self->{charlist}) {
+ $self->{fuzzed_chars}{$_}++
+ for split //,$str
+ }
+
my $id=++$self->{strings_stored};
my $fuzz=$self->{fuzz};
@@ -254,6 +267,12 @@
#
sub prepare {
my $self=shift;
+
+ unless ($self->{charlist}) {
+ $self->_set_chars(join "",sort keys %{$self->{fuzzed_chars}});
+ }
+
+
my $partshash=($self->{parts_hash}||={});
my $parts_idx=pack PACKTYPE,0;
@@ -336,19 +355,26 @@
" bytes) Uses:".length($$trie)." bytes\n".
"Trie Nodes Allocated: $nodes\n".
($strings_stored||0)." Fuzzy Strings Stored.\n";
- #print Dump($strings);
+
print "Used ".length($$strings)." bytes to store strings\n";
+ my $max=50;
if ($partsidx) {
print "Used ".length($$partsidx)." to store index\n";
- #my @unpack=unpack PACKTYPE."*",$$partsidx;
- #print "@unpack\n";
+ my @unpack=unpack PACKTYPE."*",$$partsidx;
+
+ if (@unpack>$max) {
+ my $count=splice @unpack,$max,-1;
+ print join(" ",@unpack),"... [$count]\n";
+ } else {
+ print join(" ",@unpack),"\n";
+ }
}
print $self->{fuzzed} ? "Prepared " : "Not Prepared ",
$self->{filled_in} ? "Converted " : "Not Converted ",
"\n";
my $ofs=1;
- while ($ofs<$size and $ofs<1000) {
+ while ($ofs<$size and $ofs<$max) {
my $pos=$ofs;
printf "%6d=%s\n",$pos,
join(":",map {
@@ -356,7 +382,7 @@
unpack PACKTYPE, substr($$trie,($ofs++)*4,4)
} 0 .. $charc);
}
- if ($ofs>1000) {
+ if ($ofs>$max) {
print "(Truncated trie dump due to size)\n";
}
@@ -413,147 +439,155 @@
char *charmap,
UV charcount,
UV fuzz,
- UV start) {
- UV ofs;
- UV cpos = 0;
- UV upto = search_len-strlen;
- UV bmaxlen = strlen+1;
- const unsigned char mask[8]={128,64,32,16,8,4,2,1};
- UV num_fullchecks=0;
- AV* return_array=newAV();
+ UV start)
+{
+ UV ofs;
+ UV cpos = 0;
+ UV upto = search_len-strlen;
+ UV bmaxlen = strlen+1;
+ const unsigned char mask[8]={128,64,32,16,8,4,2,1};
+ UV num_fullchecks=0;
+ AV* return_array=newAV();
- start = start ? start : 1;
- ofs = start;
+ start = start ? start : 1;
+ ofs = start;
+ #ifdef trie_DEBUG
+ printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string);
+ #endif
+
+ while( search_string[cpos] && ofs ) {
+ UV x;
+ UV idx;
+ char curchar;
+ unsigned char curcharidx;
+
#ifdef trie_DEBUG
- printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string);
+ printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,cpos,s
+trlen,veclen);
#endif
- while( search_string[cpos] && ofs ) {
- UV x;
- char c;
-
- #ifdef trie_DEBUG
- printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,
+cpos,strlen,veclen);
- #endif
+ Zero(&seenvec[(cpos % strlen)*veclen],veclen,char);
+ curchar=search_string[cpos++];
+ curcharidx=charmap[ curchar ];
- Zero(&seenvec[(cpos % strlen)*veclen],veclen,char);
- /*
- for (x=0;x<veclen;x++)
- seenvec[(cpos % strlen)*veclen+x]=0;
- */
+ #ifdef trie_DEBUG
+ printf("Pos: %d -- State: %d -- Char: '%c' CIdx: '%d'\n",cpos-1,o
+fs,curchar,curcharidx);
+ #endif
- #ifdef trie_DEBUG
- printf("Pos: %d -- State: %d -- Char: %c\n",cpos,ofs,search
+_string[cpos]);
- #endif
+ if (curcharidx==255) {
+ #ifdef trie_DEBUG
+ printf("Reset to start: %d (bad char)\n",ofs);
+ #endif
+ ofs = start;
+ continue;
+ }
- c=charmap[ search_string[cpos++] ];
- if (c==255) {
- ofs = start;
- } else {
- ofs = trie[ ofs + c ];
+ ofs = trie[ ofs + curcharidx ];
- #ifdef trie_DEBUG
- printf("ofs: %d\n",ofs);
- #endif
+ #ifdef trie_DEBUG
+ printf("New ofs: %d\n",ofs);
+ #endif
- if (trie[ofs+charcount]) {
+ idx=trie[ofs+charcount];
+ if (!idx)
+ continue;
- UV idx=trie[ofs+charcount];
+ #ifdef trie_DEBUG
+ printf("Match idx: %d\n",idx);
+ #endif
- #ifdef trie_DEBUG
- printf("Match idx: %d\n",idx);
- #endif
+ while (stridx[idx]<0) {
+ UV srch_ofs;
+ IV sofs = stridx[idx++];
- while (stridx[idx]) {
- IV sofs = stridx[idx++];
- UV srch_ofs = cpos + sofs;
+ if ((sofs + cpos<0)||(sofs + cpos > upto)) {
+ while(stridx[idx]>0) idx++;
+ continue;
+ }
- #ifdef trie_DEBUG
- printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs);
- #endif
+ srch_ofs = cpos + sofs;
- x= (srch_ofs % strlen)*veclen;
+ #ifdef trie_DEBUG
+ printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs);
+ #endif
- while (stridx[idx]>0) {
+ x= (srch_ofs % strlen)*veclen;
- if ( srch_ofs>=0 && srch_ofs<=upto ) {
- UV check_byte=x+(long)(stridx[idx]/8);
- unsigned char check_bit=stridx[idx] % 8;
- unsigned char seen=mask[check_bit] & seenvec[check_
+byte];
+ while (stridx[idx]>0) {
+ UV check_byte=x+(long)(stridx[idx]/8);
+ unsigned char check_bit=stridx[idx] % 8;
+ unsigned char seen=mask[check_bit] & seenvec[check_byte];
- #ifdef trie_DEBUG
- printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d
+\n",
- stridx[idx],check_byte,check_bit,seen);
- #endif
+ #ifdef trie_DEBUG
+ printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d\n",
+ stridx[idx],check_byte,check_bit,seen);
+ #endif
- if (!seen) {
- char *c= &strings[ (stridx[idx]-1)*bmaxlen ];
- char *s= &search_string[ srch_ofs ];
- UV diff=0;
- num_fullchecks++;
- seenvec[check_byte]=seenvec[check_byte] | mask[ch
+eck_bit];
+ if (!seen) {
+ char *c= &strings[ (stridx[idx]-1)*bmaxlen ];
+ char *s= &search_string[ srch_ofs ];
+ UV diff=0;
+ num_fullchecks++;
+ seenvec[check_byte]=seenvec[check_byte] | mask[check_bit];
- #ifdef trie_DEBUG
- printf("\tsearching at ofs %d / strings_of %d (
+cpos:%d)\n",srch_ofs,stridx[idx],cpos);
- printf("\t> C '%s' | S %s = ",c,s);
- #endif
+ #ifdef trie_DEBUG
+ printf("\tsearching at ofs %d / strings_of %d (cpos:%d)\n",
+srch_ofs,stridx[idx],cpos);
+ printf("\t> C '%s' | S %s = ",c,s);
+ #endif
- while (*c && diff<=fuzz) {
- if (*c++ != *s++) diff++;
- }
+ while (*c && diff<=fuzz) {
+ if (*c++ != *s++) diff++;
+ }
- #ifdef trie_DEBUG
- printf("\tDiff: %d Fuzz:%d\n",diff,fuzz);
- #endif
+ #ifdef trie_DEBUG
+ printf("\tDiff: %d Fuzz:%d\n",diff,fuzz);
+ #endif
- if (diff <= fuzz) {
- #ifdef trie_PACKED_RETURN
- {
- Match_Return ret;
- #ifdef HAS_HTONL
- ret.ofs=PerlSock_htonl(ofs);
- ret.diff=PerlSock_htonl(diff);
- ret.match=PerlSock_htonl((stridx[idx]-1));
- #else
- ret.ofs=ofs;
- ret.diff=diff;
- ret.match=(stridx[idx]-1);
- #endif
- {
- SV *sv=newSVpvn((char*)(&ret),sizeof(ret));
- av_push(return_array,sv);
- }
+ if (diff <= fuzz) {
+#ifdef trie_PACKED_RETURN
+ {
+ Match_Return ret;
+#ifdef HAS_HTONL
+ ret.ofs=PerlSock_htonl(ofs);
+ ret.diff=PerlSock_htonl(diff);
+ ret.match=PerlSock_htonl((stridx[idx]-1));
+#else
+ ret.ofs=ofs;
+ ret.diff=diff;
+ ret.match=(stridx[idx]-1);
+#endif
+ {
+ SV *sv=newSVpvn((char*)(&ret),sizeof(ret));
+ av_push(return_array,sv);
+ }
- }
- #else
- {
- SV *sv=newSViv((UV)(srch_ofs));
- av_push(return_array,sv);
- }
- {
- SV *sv=newSViv((IV)diff);
- av_push(return_array,sv);
- }
- {
- SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmax
+len],0);
- av_push(return_array,sv);
- }
- #endif
- }
- }
- }
- idx++;
- } /* each string */
- } /* each ofs */
- } /* do we have a hit */
- } /* legal char? */
- } /* walk the string */
- {
- SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullCheck
+s",TRUE);
- sv_setuv(num_fullchecks_sv,num_fullchecks);
+ }
+#else
+ {
+ SV *sv=newSViv((UV)(srch_ofs));
+ av_push(return_array,sv);
+ }
+ {
+ SV *sv=newSViv((IV)diff);
+ av_push(return_array,sv);
+ }
+ {
+ SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmaxlen],0);
+ av_push(return_array,sv);
+ }
+#endif
+ }
+ }
+ idx++;
+ }
}
- return return_array;
+ }
+ {
+ SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullChecks"
+,TRUE);
+ sv_setuv(num_fullchecks_sv,num_fullchecks);
+ }
+ return return_array;
}
+
| [reply] [d/l] |
|
|