Beefy Boxes and Bandwidth Generously Provided by pair Networks
Pathologically Eclectic Rubbish Lister
 
PerlMonks  

Re^3: Algorithm Showdown: Fuzzy Matching (DFA.pm)

by demerphq (Chancellor)
on Dec 09, 2004 at 22:28 UTC ( [id://413705]=note: print w/replies, xml ) Need Help??


in reply to Re^2: Algorithm Showdown: Fuzzy Matching (Matcher.pm)
in thread Algorithm Showdown: Fuzzy Matching

This version is my implementation of ysths idea. Like him I split the input strings into Fuzz+1 parts and then use something like Xor to count the difference. But i use a DFA to find all of the exact matches. This is a hybrid Perl/Inline::C solution, with the storage and memory management done pretty much entirely in Perl, and the search routine done in Inline::C. It is slow to prepare() but searches are typically extremely quick. Like ysths solution on degenerate datasets it performs as well/badly as Xor.pm but these cases would seem to be unusual. The larger the ratio of WordChars/Fuzz is the faster this algorithm will perform (and the more memory it will use.)

Updated:Patches applied. Fix searching strings with non alphabet chars in them. Make providing the alphabet optional. Fix error where a UV was being assigned a negative number. Minor C cleanup.

package Fuzzy::Matcher::DFA; use strict; use warnings; use vars qw/ $VERSION $Typemap @ISA /; use constant PACKTYPE=>"l!"; use Data::Dump::Streamer; use Fuzzy::Matcher; use Carp; @ISA=qw(Fuzzy::Matcher); $VERSION=0.01; # Underlying idea by [ysth], DFA/Inline::C implementation by [demerphq +] BEGIN { # this is required by certain Perl versions $| = 1; $Typemap = __FILE__.".typemap"; my $typemap=<<'TYPEMAP'; long * T_OPAQUEPTR float * T_OPAQUEPTR UV * T_OPAQUEPTR IV * T_OPAQUEPTR TYPEMAP unless ( -e $Typemap and -s $Typemap==length($typemap) ) { open my $tm, ">", $Typemap or die "$Typemap:$!"; print $tm $typemap; close $tm; } } # use Inline qw(Info Noisy); use Inline C => Config => # ENABLE => 'FORCE_BUILD', # DISABLE => 'CLEAN_AFTER_BUILD', TYPEMAPS => $Typemap, VERSION => $VERSION, NAME => __PACKAGE__, ; use Inline 'C'; # create a new Fuzzy Matcher. This mostly deals with # alphabet/character type issues. The trie we build # needs this info. Uppercase and lowercase letters # are treated as the same character. sub _init { my ($self,$chars)=@_; if ($chars) { $self->_set_chars($chars); } return $self; } sub _set_chars { my ($self,$chars)=@_; $chars='ACGT' unless defined $chars; my @achars; my %hchars; $self->{charcount}=0; $self->{charlist}=[]; for my $c (split //,($chars||'ACGT')) { next if $hchars{$c}; push @{$self->{charlist}},$c; $hchars{$c} = $self->{charcount}; $achars[ord $c] = $self->{charcount}; $self->{charcount}++ } @{$self}{'achars','bchars','hchars'}=( \@achars, pack( "C*", map { defined $achars[$_] ? $achars[$_] : 255 } 0..255 +), \%hchars ); my $str="\0" x (1024*1024); $self->{trie}=\$str; $self->{size}=$self->{charcount}+2; $self->{nodes}=1; return $self; } # Associates an integer to a string in the trie. # trie is a flat string of packed longs. # Space allocated for the trie increases by doubling # and starts at 1k sub _trie_store { my ($self,$str,$integer)=@_; croak "Can't _trie_store() after prepare or conversion." if $self->{fuzzed} or $self->{filled_in}; my ($trie,$data,$hchars,$size,$charc)=@{$self}{qw(trie data hchars s +ize charcount)}; my $pos=1; my @chars=split //,$str; if (@chars) { foreach my $char ( @chars ) { my $ofs=$pos+$hchars->{$char}; my $v=unpack PACKTYPE,substr($$trie,$ofs*4,4); unless ($v) { $pos=$size; $size+=$charc+1; $self->{nodes}++; $self->{size}=$size; if ($size*4>length($$trie)) { $$trie.="\0" x length($$trie); } substr($$trie,$ofs*4,4)=pack PACKTYPE,$pos; } else { $pos=$v; } } } $self->{size}=$size; if ( my $idx=unpack PACKTYPE,substr($$trie, ($pos + $charc)*4,4) ) { substr($$trie, ($pos + $charc)*4,4)= pack PACKTYPE,$integer; return $integer } else { substr($$trie, ($pos + $charc)*4,4)=pack PACKTYPE,$integer; return $integer; } } # Recursive routine which fills in any 0 slots in the trie with the # appropriate state. Note that this relies on the fact that # we will never store strings that can completely overlap each other. # Ie if ABBA and BB are valid strings and we search ABBA, we will not # record that we matched both, only ABBA. This isnt a problem for # the purposes at hand as all of our words will be the same length. # A more complicated storage structure could facilitate the issue of # overlaps. # The Idea here is we recurse through the trie keeping "cursors" that # represent where we would be if we backtracked at each new letter # encountered. When we find an empty slot we use the state of the # oldest valid cursor as the new state. This is much faster than # the older version I used. sub _fill_in { my ($self,$node,$path,@csrs)=@_; my ($trie,$charc,$list,$bchars)=@{$self}{qw(trie charcount charlist +bchars)}; my @defer; #print "$node> $path\n"; my $added=0; foreach my $char (0..$charc-1) { my @csrs=@csrs; @csrs=grep { $_=unpack PACKTYPE,substr($$trie,($_+$char)*4,4); $_ } @csrs; #print "try: '$path$list->[$char]' (@csrs)\n"; my $v=unpack PACKTYPE,substr($$trie,($node+$char)*4,4); if ($v) { $added+=$self->_fill_in($v,$path.$list->[$char],@csrs,1); } else { my ($state)=(@csrs,1); push @defer,[$char,$state] } } foreach my $defer (@defer) { my ($char,$state)=@$defer; substr($$trie,($node+$char)*4,4)=pack PACKTYPE,$state; #print "$path$list->[$char] FAIL ($state)\n"; $added++; } return $added; } # # Just a wrapper to _fill_in() # sub convert_trie_to_dfa { my $self=shift; croak "Can't convert twice." if $self->{filled_in}; $self->{filled_in}=1; my $added=$self->_fill_in(1,""); print "Filled in $added elements\n" unless defined wantarray; return $added; } # # Takes a word to fuzzy match, if the word is new stores it in # a two different places. First in a packed string form, # as a null terminated part of a larger string containing all of the # words to match. Also does the "split into fuzz+1 parts" trick and # builds an index showing that if a given part is identified what # words should be checked in full and at what offset (from the positio +n # of the last character required to match the string) to do the match # at. # Once the fuzzy word list has been built a prepare_for_search() # converts the interim structures into something usable by the Inline: +:C # code. sub fuzz_store { my ($self,$str)=@_; croak "Can't fuzz_store() after prepare or conversion." if $self->{fuzzed} or $self->{filled_in}; return if $self->{str_hash}{$str}++; unless ($self->{charlist}) { $self->{fuzzed_chars}{$_}++ for split //,$str } my $id=++$self->{strings_stored}; my $fuzz=$self->{fuzz}; my $l=($self->{strlen}||=length($str)); my $bl=$l+1; #print "Storing '$str' as $id\n"; my $strings=($self->{strings}||=do { \(my $buff="\0" x (1024*1024)); }); if($id * $bl > length($$strings)) { $$strings.="\0" x length($$strings); } substr($$strings,($id-1)*$bl,$l)=$str; #unpack is used to split the word appropriatly my $upk=($self->{fuzzpack}||=do{ my $size=int($l/($fuzz+1)); $self->{max_part_size}=$size; "A$size" x ($fuzz+1) }); my @parts=unpack $upk,$str; my $partshash=($self->{parts_hash}||={}); my $pos=0; # we have the list of parts, now insert them # into the the interim HOH foreach my $p (0..$#parts) { #print "$parts[$p] ".(-$pos-length($parts[$p]))." $str\n"; $partshash->{$parts[$p]}{-$pos-length($parts[$p])}.=pack PACKTYPE, +$id; $pos+=length $parts[$p]; } } # # Converts interim structures into the final Trie and # and related XOR indexes. After this routine has been # called no more strings may be added to the object, # and it is ready for doing the search # sub prepare { my $self=shift; unless ($self->{charlist}) { $self->_set_chars(join "",sort keys %{$self->{fuzzed_chars}}); } my $partshash=($self->{parts_hash}||={}); my $parts_idx=pack PACKTYPE,0; #foreach my $part (sort keys %$partshash) { # print "$part \n"; # foreach my $ofs (sort {$a<=>$b }keys %{$partshash->{$part}}) { # print "$ofs : ",join " ",unpack(PACKTYPE."*",$partshash->{$part +}{$ofs}),"\n"; # } #} while (my ($part,$hash)=each %$partshash) { my $long=length($parts_idx)/4; $parts_idx.=join "",(map { pack(PACKTYPE,$_).$hash->{$_} } keys %$hash), pack(PACKTYPE,0); $self->{parts_stored}++; $self->_trie_store($part,$long); delete $partshash->{$part}; } $self->{partsidx}=\$parts_idx; my $filled=$self->convert_trie_to_dfa; $self->{veclen}=int($self->{strings_stored} / 8)+1; $self->{vector}="\0" x ($self->{veclen}*$self->{strlen}*2); # *2 for safe measure. Heck we can afford it. undef $self->{str_hash}; $self->{fuzzed}=1; # XXX this should block other methods. } # # This is a wrapper to the C search routine. It provides # space for a bitvector based "check test" of the most # recently tested words and offsets. The idea is that we # only ever need to test a word once at a given offset. # The problem is that we will get spurious dupes when # there is little or no fuzz involved in the match. For # fuzz=2 we will report the same string 3 times if there # is an exact match. The bitvector allows is to keep # track whether we have already check a strng at a given # offset. This means we need a bitvector for each character # in the searched words (not fragment length). sub fuzz_search { my ($self,$str)=@_; croak "Can't search before prepare_for_search()" unless $self->{fuzzed}; my $ary=_c_fuzz_search($str, length($str), $self->{strlen}, $self->{veclen}, $self->{vector}, ${$self->{trie}}, ${$self->{partsidx}}, ${$self->{strings}}, $self->{bchars}, $self->{charcount}, $self->{fuzz},1); $self->{NumFullChecks}=$Fuzzy::Matcher::DFA::NumFullChecks; return $ary; } # # Some diagnostic stuff. Mostly for debugging # sub show { my $self=shift; my ($trie,$charc,$size,$strings_stored,$parts_stored,$nodes,$strings +,$partsidx) = @{$self}{qw(trie charcount size strings_stored parts_stored nod +es strings partsidx)}; print "== Fuzzy Trie == [$trie] Chars: $charc\n". "Allocated Values: $size (".($size*4). " bytes) Uses:".length($$trie)." bytes\n". "Trie Nodes Allocated: $nodes\n". ($strings_stored||0)." Fuzzy Strings Stored.\n"; print "Used ".length($$strings)." bytes to store strings\n"; my $max=50; if ($partsidx) { print "Used ".length($$partsidx)." to store index\n"; my @unpack=unpack PACKTYPE."*",$$partsidx; if (@unpack>$max) { my $count=splice @unpack,$max,-1; print join(" ",@unpack),"... [$count]\n"; } else { print join(" ",@unpack),"\n"; } } print $self->{fuzzed} ? "Prepared " : "Not Prepared ", $self->{filled_in} ? "Converted " : "Not Converted ", "\n"; my $ofs=1; while ($ofs<$size and $ofs<$max) { my $pos=$ofs; printf "%6d=%s\n",$pos, join(":",map { sprintf "%6d", unpack PACKTYPE, substr($$trie,($ofs++)*4,4) } 0 .. $charc); } if ($ofs>$max) { print "(Truncated trie dump due to size)\n"; } } 1; __DATA__ __C__ #undef trie_DEBUG #undef trie_PACKED_RETURN #ifdef trie_PACKED_RETURN struct match_return { UV ofs; UV diff; UV match; }; typedef struct match_return Match_Return; #endif /* search_string - string to be searched search_len - length of string being searched strlen - length of matched word veclen - length of a single vector string. seenvec - string of strlen*veclen bytes for the bitvector trie - array of long. Value elements are expected to point into stridx. stridx - index of offsets and words. Ofsets are always negative OFS, IDXPTR,[ IDXPTR...] , OFS, IDXPTR,[ IDXPTR...], 0 strings - array of char holding all of the strings. stridx points into this structure via IDXPTR elements. charmap - convert ASCII char codes into indexes for the trie. Ie A -> 0 C -> 1, etc. charcount - How many chars are there in each node of the trie fuzz - how many characters fuzz we want start - start state. should always be 1. Returns an arrayref containing tripples for each match: OFS, Matches String, Difference */ AV* _c_fuzz_search(char *search_string, UV search_len, UV strlen, UV veclen, char *seenvec, UV *trie, IV *stridx, char *strings, char *charmap, UV charcount, UV fuzz, UV start) { UV ofs; UV cpos = 0; UV upto = search_len-strlen; UV bmaxlen = strlen+1; const unsigned char mask[8]={128,64,32,16,8,4,2,1}; UV num_fullchecks=0; AV* return_array=newAV(); start = start ? start : 1; ofs = start; #ifdef trie_DEBUG printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string); #endif while( search_string[cpos] && ofs ) { UV x; UV idx; char curchar; unsigned char curcharidx; #ifdef trie_DEBUG printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,cpos,st +rlen,veclen); #endif Zero(&seenvec[(cpos % strlen)*veclen],veclen,char); curchar=search_string[cpos++]; curcharidx=charmap[ curchar ]; #ifdef trie_DEBUG printf("Pos: %d -- State: %d -- Char: '%c' CIdx: '%d'\n",cpos-1,of +s,curchar,curcharidx); #endif if (curcharidx==255) { #ifdef trie_DEBUG printf("Reset to start: %d (bad char)\n",ofs); #endif ofs = start; continue; } ofs = trie[ ofs + curcharidx ]; #ifdef trie_DEBUG printf("New ofs: %d\n",ofs); #endif idx=trie[ofs+charcount]; if (!idx) continue; #ifdef trie_DEBUG printf("Match idx: %d\n",idx); #endif while (stridx[idx]<0) { UV srch_ofs; IV sofs = stridx[idx++]; if ((sofs + cpos<0)||(sofs + cpos > upto)) { while(stridx[idx]>0) idx++; continue; } srch_ofs = cpos + sofs; #ifdef trie_DEBUG printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs); #endif x= (srch_ofs % strlen)*veclen; while (stridx[idx]>0) { UV check_byte=x+(long)(stridx[idx]/8); unsigned char check_bit=stridx[idx] % 8; unsigned char seen=mask[check_bit] & seenvec[check_byte]; #ifdef trie_DEBUG printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d\n", stridx[idx],check_byte,check_bit,seen); #endif if (!seen) { char *c= &strings[ (stridx[idx]-1)*bmaxlen ]; char *s= &search_string[ srch_ofs ]; UV diff=0; num_fullchecks++; seenvec[check_byte]=seenvec[check_byte] | mask[check_bit]; #ifdef trie_DEBUG printf("\tsearching at ofs %d / strings_of %d (cpos:%d)\n",s +rch_ofs,stridx[idx],cpos); printf("\t> C '%s' | S %s = ",c,s); #endif while (*c && diff<=fuzz) { if (*c++ != *s++) diff++; } #ifdef trie_DEBUG printf("\tDiff: %d Fuzz:%d\n",diff,fuzz); #endif if (diff <= fuzz) { #ifdef trie_PACKED_RETURN { Match_Return ret; #ifdef HAS_HTONL ret.ofs=PerlSock_htonl(ofs); ret.diff=PerlSock_htonl(diff); ret.match=PerlSock_htonl((stridx[idx]-1)); #else ret.ofs=ofs; ret.diff=diff; ret.match=(stridx[idx]-1); #endif { SV *sv=newSVpvn((char*)(&ret),sizeof(ret)); av_push(return_array,sv); } } #else { SV *sv=newSViv((UV)(srch_ofs)); av_push(return_array,sv); } { SV *sv=newSViv((IV)diff); av_push(return_array,sv); } { SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmaxlen],0); av_push(return_array,sv); } #endif } } idx++; } } } { SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullChecks", +TRUE); sv_setuv(num_fullchecks_sv,num_fullchecks); } return return_array; }
---
demerphq

Replies are listed 'Best First'.
Re^4: Algorithm Showdown: Fuzzy Matching (DFA.pm)
by demerphq (Chancellor) on Dec 11, 2004 at 11:36 UTC

    Patch applied to fix the bug BrowserUk identified and also a C cleanup. (Updated Patch)

    --- PostedDFA.pm 2004-12-11 12:31:19.714617600 +0100 +++ DFA.pm 2004-12-11 15:37:03.829067200 +0100 @@ -48,6 +48,15 @@ sub _init { my ($self,$chars)=@_; + if ($chars) { + $self->_set_chars($chars); + } + return $self; +} + + +sub _set_chars { + my ($self,$chars)=@_; $chars='ACGT' unless defined $chars; my @achars; @@ -55,18 +64,17 @@ $self->{charcount}=0; $self->{charlist}=[]; for my $c (split //,($chars||'ACGT')) { - next if $hchars{$c} || $hchars{lc($c)}; + next if $hchars{$c}; push @{$self->{charlist}},$c; $hchars{$c} = $self->{charcount}; $achars[ord $c] = $self->{charcount}; - $hchars{lc($c)} = $self->{charcount}; - $achars[ord(lc($c))] = $self->{charcount}; $self->{charcount}++ } + @{$self}{'achars','bchars','hchars'}=( \@achars, - pack( "C*", map { defined $_ ? $_ : 255 } @achars), + pack( "C*", map { defined $achars[$_] ? $achars[$_] : 255 } 0..25 +5), \%hchars ); @@ -77,7 +85,6 @@ return $self; } - # Associates an integer to a string in the trie. # trie is a flat string of packed longs. # Space allocated for the trie increases by doubling @@ -128,8 +135,9 @@ # we will never store strings that can completely overlap each other. # Ie if ABBA and BB are valid strings and we search ABBA, we will not # record that we matched both, only ABBA. This isnt a problem for -# the purposes at hand as all of our words will be most 1 character -# different in lengths. +# the purposes at hand as all of our words will be the same length. +# A more complicated storage structure could facilitate the issue of +# overlaps. # The Idea here is we recurse through the trie keeping "cursors" that # represent where we would be if we backtracked at each new letter # encountered. When we find an empty slot we use the state of the @@ -206,6 +214,11 @@ return if $self->{str_hash}{$str}++; + unless ($self->{charlist}) { + $self->{fuzzed_chars}{$_}++ + for split //,$str + } + my $id=++$self->{strings_stored}; my $fuzz=$self->{fuzz}; @@ -254,6 +267,12 @@ # sub prepare { my $self=shift; + + unless ($self->{charlist}) { + $self->_set_chars(join "",sort keys %{$self->{fuzzed_chars}}); + } + + my $partshash=($self->{parts_hash}||={}); my $parts_idx=pack PACKTYPE,0; @@ -336,19 +355,26 @@ " bytes) Uses:".length($$trie)." bytes\n". "Trie Nodes Allocated: $nodes\n". ($strings_stored||0)." Fuzzy Strings Stored.\n"; - #print Dump($strings); + print "Used ".length($$strings)." bytes to store strings\n"; + my $max=50; if ($partsidx) { print "Used ".length($$partsidx)." to store index\n"; - #my @unpack=unpack PACKTYPE."*",$$partsidx; - #print "@unpack\n"; + my @unpack=unpack PACKTYPE."*",$$partsidx; + + if (@unpack>$max) { + my $count=splice @unpack,$max,-1; + print join(" ",@unpack),"... [$count]\n"; + } else { + print join(" ",@unpack),"\n"; + } } print $self->{fuzzed} ? "Prepared " : "Not Prepared ", $self->{filled_in} ? "Converted " : "Not Converted ", "\n"; my $ofs=1; - while ($ofs<$size and $ofs<1000) { + while ($ofs<$size and $ofs<$max) { my $pos=$ofs; printf "%6d=%s\n",$pos, join(":",map { @@ -356,7 +382,7 @@ unpack PACKTYPE, substr($$trie,($ofs++)*4,4) } 0 .. $charc); } - if ($ofs>1000) { + if ($ofs>$max) { print "(Truncated trie dump due to size)\n"; } @@ -413,147 +439,155 @@ char *charmap, UV charcount, UV fuzz, - UV start) { - UV ofs; - UV cpos = 0; - UV upto = search_len-strlen; - UV bmaxlen = strlen+1; - const unsigned char mask[8]={128,64,32,16,8,4,2,1}; - UV num_fullchecks=0; - AV* return_array=newAV(); + UV start) +{ + UV ofs; + UV cpos = 0; + UV upto = search_len-strlen; + UV bmaxlen = strlen+1; + const unsigned char mask[8]={128,64,32,16,8,4,2,1}; + UV num_fullchecks=0; + AV* return_array=newAV(); - start = start ? start : 1; - ofs = start; + start = start ? start : 1; + ofs = start; + #ifdef trie_DEBUG + printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string); + #endif + + while( search_string[cpos] && ofs ) { + UV x; + UV idx; + char curchar; + unsigned char curcharidx; + #ifdef trie_DEBUG - printf("_c_fuzz_search %d %d %s \n",start,fuzz,search_string); + printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen,cpos,s +trlen,veclen); #endif - while( search_string[cpos] && ofs ) { - UV x; - char c; - - #ifdef trie_DEBUG - printf ("Wiping %d (%d %% %d)*%d\n",(cpos % strlen)*veclen, +cpos,strlen,veclen); - #endif + Zero(&seenvec[(cpos % strlen)*veclen],veclen,char); + curchar=search_string[cpos++]; + curcharidx=charmap[ curchar ]; - Zero(&seenvec[(cpos % strlen)*veclen],veclen,char); - /* - for (x=0;x<veclen;x++) - seenvec[(cpos % strlen)*veclen+x]=0; - */ + #ifdef trie_DEBUG + printf("Pos: %d -- State: %d -- Char: '%c' CIdx: '%d'\n",cpos-1,o +fs,curchar,curcharidx); + #endif - #ifdef trie_DEBUG - printf("Pos: %d -- State: %d -- Char: %c\n",cpos,ofs,search +_string[cpos]); - #endif + if (curcharidx==255) { + #ifdef trie_DEBUG + printf("Reset to start: %d (bad char)\n",ofs); + #endif + ofs = start; + continue; + } - c=charmap[ search_string[cpos++] ]; - if (c==255) { - ofs = start; - } else { - ofs = trie[ ofs + c ]; + ofs = trie[ ofs + curcharidx ]; - #ifdef trie_DEBUG - printf("ofs: %d\n",ofs); - #endif + #ifdef trie_DEBUG + printf("New ofs: %d\n",ofs); + #endif - if (trie[ofs+charcount]) { + idx=trie[ofs+charcount]; + if (!idx) + continue; - UV idx=trie[ofs+charcount]; + #ifdef trie_DEBUG + printf("Match idx: %d\n",idx); + #endif - #ifdef trie_DEBUG - printf("Match idx: %d\n",idx); - #endif + while (stridx[idx]<0) { + UV srch_ofs; + IV sofs = stridx[idx++]; - while (stridx[idx]) { - IV sofs = stridx[idx++]; - UV srch_ofs = cpos + sofs; + if ((sofs + cpos<0)||(sofs + cpos > upto)) { + while(stridx[idx]>0) idx++; + continue; + } - #ifdef trie_DEBUG - printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs); - #endif + srch_ofs = cpos + sofs; - x= (srch_ofs % strlen)*veclen; + #ifdef trie_DEBUG + printf("\tsofs: %d srch_ofs: %d\n",sofs,srch_ofs); + #endif - while (stridx[idx]>0) { + x= (srch_ofs % strlen)*veclen; - if ( srch_ofs>=0 && srch_ofs<=upto ) { - UV check_byte=x+(long)(stridx[idx]/8); - unsigned char check_bit=stridx[idx] % 8; - unsigned char seen=mask[check_bit] & seenvec[check_ +byte]; + while (stridx[idx]>0) { + UV check_byte=x+(long)(stridx[idx]/8); + unsigned char check_bit=stridx[idx] % 8; + unsigned char seen=mask[check_bit] & seenvec[check_byte]; - #ifdef trie_DEBUG - printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d +\n", - stridx[idx],check_byte,check_bit,seen); - #endif + #ifdef trie_DEBUG + printf("\tCheck: %d Byte:%d Bit:%d Seen:%d Set:%d\n", + stridx[idx],check_byte,check_bit,seen); + #endif - if (!seen) { - char *c= &strings[ (stridx[idx]-1)*bmaxlen ]; - char *s= &search_string[ srch_ofs ]; - UV diff=0; - num_fullchecks++; - seenvec[check_byte]=seenvec[check_byte] | mask[ch +eck_bit]; + if (!seen) { + char *c= &strings[ (stridx[idx]-1)*bmaxlen ]; + char *s= &search_string[ srch_ofs ]; + UV diff=0; + num_fullchecks++; + seenvec[check_byte]=seenvec[check_byte] | mask[check_bit]; - #ifdef trie_DEBUG - printf("\tsearching at ofs %d / strings_of %d ( +cpos:%d)\n",srch_ofs,stridx[idx],cpos); - printf("\t> C '%s' | S %s = ",c,s); - #endif + #ifdef trie_DEBUG + printf("\tsearching at ofs %d / strings_of %d (cpos:%d)\n", +srch_ofs,stridx[idx],cpos); + printf("\t> C '%s' | S %s = ",c,s); + #endif - while (*c && diff<=fuzz) { - if (*c++ != *s++) diff++; - } + while (*c && diff<=fuzz) { + if (*c++ != *s++) diff++; + } - #ifdef trie_DEBUG - printf("\tDiff: %d Fuzz:%d\n",diff,fuzz); - #endif + #ifdef trie_DEBUG + printf("\tDiff: %d Fuzz:%d\n",diff,fuzz); + #endif - if (diff <= fuzz) { - #ifdef trie_PACKED_RETURN - { - Match_Return ret; - #ifdef HAS_HTONL - ret.ofs=PerlSock_htonl(ofs); - ret.diff=PerlSock_htonl(diff); - ret.match=PerlSock_htonl((stridx[idx]-1)); - #else - ret.ofs=ofs; - ret.diff=diff; - ret.match=(stridx[idx]-1); - #endif - { - SV *sv=newSVpvn((char*)(&ret),sizeof(ret)); - av_push(return_array,sv); - } + if (diff <= fuzz) { +#ifdef trie_PACKED_RETURN + { + Match_Return ret; +#ifdef HAS_HTONL + ret.ofs=PerlSock_htonl(ofs); + ret.diff=PerlSock_htonl(diff); + ret.match=PerlSock_htonl((stridx[idx]-1)); +#else + ret.ofs=ofs; + ret.diff=diff; + ret.match=(stridx[idx]-1); +#endif + { + SV *sv=newSVpvn((char*)(&ret),sizeof(ret)); + av_push(return_array,sv); + } - } - #else - { - SV *sv=newSViv((UV)(srch_ofs)); - av_push(return_array,sv); - } - { - SV *sv=newSViv((IV)diff); - av_push(return_array,sv); - } - { - SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmax +len],0); - av_push(return_array,sv); - } - #endif - } - } - } - idx++; - } /* each string */ - } /* each ofs */ - } /* do we have a hit */ - } /* legal char? */ - } /* walk the string */ - { - SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullCheck +s",TRUE); - sv_setuv(num_fullchecks_sv,num_fullchecks); + } +#else + { + SV *sv=newSViv((UV)(srch_ofs)); + av_push(return_array,sv); + } + { + SV *sv=newSViv((IV)diff); + av_push(return_array,sv); + } + { + SV *sv=newSVpv(&strings[ (stridx[idx]-1)*bmaxlen],0); + av_push(return_array,sv); + } +#endif + } + } + idx++; + } } - return return_array; + } + { + SV *num_fullchecks_sv=get_sv("Fuzzy::Matcher::DFA::NumFullChecks" +,TRUE); + sv_setuv(num_fullchecks_sv,num_fullchecks); + } + return return_array; } +
    ---
    demerphq

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://413705]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others meditating upon the Monastery: (3)
As of 2024-04-18 23:04 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found