Beefy Boxes and Bandwidth Generously Provided by pair Networks
Your skill will accomplish
what the force of many cannot
 
PerlMonks  

Re^6: Fast common substring matching

by marioroy (Vicar)
on Feb 18, 2016 at 00:01 UTC ( #1155508=note: print w/replies, xml ) Need Help??


in reply to Re^5: Fast common substring matching
in thread Fast common substring matching

Update: Important on Windows is starting the shared-manager process immediately if construction for the shared variable comes after loading data. Unix platforms benefit from Copy-on-Write feature which is great.

... use MCE::Hobo; use MCE::Shared; # For minimum memory consumption, start the shared-manager process bef +ore # loading data locally. MCE::Shared->start(); # <-- important on Windows my $minmatch = 4; my $startTime = [Time::HiRes::gettimeofday ()]; my %strings; while (<>) { chomp(my $label = $_); chomp(my $string = <>); # Compute all substrings @{$strings{$label}} = map [substr($string, $_), $label, $_], 0..(len +gth($string) - $minmatch); } print "Loaded. Generating combos...\n"; my @keys = sort keys %strings; my $sequence = MCE::Shared->sequence( { chunk_size => 1, bounds_only => 1 }, 0, $#keys - 1 ); ...



Hello Roy Johnson,

I am fascinated by the various examples posted here, here, and also the Inline C demonstration.

Your 2nd demonstration scales wonderfully on multiple cores after loading the strings hash. For testing, I made a file containing 48 sequences. The serial and parallel code complete in 22.6 seconds and 6.1 seconds respectively. My laptop has 4 real cores plus 4 hyper-threads.

First, the construction for MCE::Hobo. This requires a later 1.699_011 dev release or soon after the final MCE 1.7 release.

... print "Loaded. Generating combos...\n"; my @keys = sort keys %strings; # Now walk through the list. The best match for each string will be th +e # previous or next element in the list that is not from the original s +ubstring, # so for each entry, just look for the next one. See how many initial +letters # match and track the best matches use MCE::Hobo; use MCE::Shared; my $sequence = MCE::Shared->sequence( { chunk_size => 1, bounds_only => 1 }, 0, $#keys - 1 ); sub walk_list { my @best_overall_match = (0); # $beg and $end have the same values when chunk_size => 1 while ( my ( $beg, $end ) = $sequence->next ) { for my $ki1 ( $beg .. $end ) { for my $ki2 (($ki1 + 1)..$#keys) { my @strings = sort {$a->[0] cmp $b->[0]} @{$strings{$keys[$ki1 +]}}, @{$strings{$keys[$ki2]}}; my @matchdata = (0); # (length, index1-into-strings, index2-in +to-strings) for my $i1 (0..($#strings - 1)) { my $i2 = $i1 + 1; ++$i2 while $i2 <= $#strings and $strings[$i2][1] eq $string +s[$i1][1]; next if $i2 > $#strings; my ($common) = map length, ($strings[$i1][0] ^ $strings[$i2] +[0]) =~ /^(\0*)/; next if $common < $minmatch; if ($common > $matchdata[0]) { @matchdata = ($common, [$i1, $i2]); } elsif ($common == $matchdata[0]) { push @matchdata, [$i1, $i2]; } } next if $matchdata[0] < $minmatch; if ($matchdata[0] > $best_overall_match[0]) { @best_overall_match = ($matchdata[0]); } if ($matchdata[0] >= $best_overall_match[0]) { push @best_overall_match, map { ["$strings[$_->[0]][1]:$strings[$_->[0]][2]", "$strings[$_ +->[1]][1]:$strings[$_->[1]][2]"] } @matchdata[1..$#matchdata]; } } # $ki2 } # $ki1 } return @best_overall_match; }; MCE::Hobo->create( \&walk_list ) for 1 .. 8; my @best_overall_match = (0); for my $hobo ( MCE::Hobo->list ) { my @ret = $hobo->join; if ( $ret[0] > $best_overall_match[0] ) { @best_overall_match = @ret; } elsif ( $ret[0] == $best_overall_match[0] ) { shift @ret; push @best_overall_match, @ret; } } print "Best overall match: $best_overall_match[0] chars\n"; ...



MCE::Loop is next and does the same thing.

... print "Loaded. Generating combos...\n"; my @keys = sort keys %strings; # Now walk through the list. The best match for each string will be th +e # previous or next element in the list that is not from the original s +ubstring, # so for each entry, just look for the next one. See how many initial +letters # match and track the best matches use MCE::Loop; MCE::Loop::init( max_workers => 8, chunk_size => 1, bounds_only => 1, ); my @ret = mce_loop_s { my ( $mce, $seq, $chunk_id ) = @_; my @best_overall_match = (0); # $seq->[0] and $seq->[1] have the same values when chunk_size => 1 for my $ki1 ( $seq->[0] .. $seq->[1] ) { for my $ki2 (($ki1 + 1)..$#keys) { my @strings = sort {$a->[0] cmp $b->[0]} @{$strings{$keys[$ki1]} +}, @{$strings{$keys[$ki2]}}; my @matchdata = (0); # (length, index1-into-strings, index2-into +-strings) for my $i1 (0..($#strings - 1)) { my $i2 = $i1 + 1; ++$i2 while $i2 <= $#strings and $strings[$i2][1] eq $strings[ +$i1][1]; next if $i2 > $#strings; my ($common) = map length, ($strings[$i1][0] ^ $strings[$i2][0 +]) =~ /^(\0*)/; next if $common < $minmatch; if ($common > $matchdata[0]) { @matchdata = ($common, [$i1, $i2]); } elsif ($common == $matchdata[0]) { push @matchdata, [$i1, $i2]; } } next if $matchdata[0] < $minmatch; if ($matchdata[0] > $best_overall_match[0]) { @best_overall_match = ($matchdata[0]); } if ($matchdata[0] >= $best_overall_match[0]) { push @best_overall_match, map { ["$strings[$_->[0]][1]:$strings[$_->[0]][2]", "$strings[$_-> +[1]][1]:$strings[$_->[1]][2]"] } @matchdata[1..$#matchdata]; } } # $ki2 } # $ki1 MCE->gather(\@best_overall_match); } 0, $#keys - 1; MCE::Loop::finish; my @best_overall_match = (0); for my $i ( 0 .. $#ret ) { if ($ret[$i]->[0] > $best_overall_match[0]) { @best_overall_match = @{ $ret[$i] }; } elsif ( $ret[$i]->[0] == $best_overall_match[0] ) { shift @{ $ret[$i] }; push @best_overall_match, @{ $ret[$i] }; } } print "Best overall match: $best_overall_match[0] chars\n"; ...



This has been a lot of fun. I learned some more Perl from it all.

Regards, Mario

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://1155508]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others avoiding work at the Monastery: (9)
As of 2019-10-14 22:41 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?
    Notices?