#!/usr/bin/perl use Modern::Perl '2011'; use autodie; use Benchmark qw/ timethese /; # benchmark pattern searching using naive approach vs suffix arrays open my $fh, '<', shift; chomp( my( $pattern, $str ) = <$fh> ); my $naive = sub { my @positions; while ( $str =~ /(?=($pattern))/g ) { push @positions, $-[0];; } #say "@positions"; }; my $sufbin = sub { $str .= '$'; my @suff; # hold suffixes while ( 1 < length $str ) { # condition excludes '$' from @suff push @suff, $str; substr ( $str, 0, 1, '' ); } # lexically ordered suffix array my @indices = sort { $suff[$a] cmp $suff[$b] } 0 .. $#suff; #for ( 0 .. $#indices ) { # say $indices[$_]+1, ": ", $suff[ $indices[$_] ]; #} my $start = bsearch( \ @indices, $pattern, \ @suff ); # get consecutive positions, if any, # where pattern matches first n chars of suffix. my @positions; for my $index ( $start .. $#indices ) { last if $suff[ $indices[ $index ] ] !~ /^$pattern/; push @positions, $indices[ $index ];# + 1; # omit +1 if 0-based indexing } # binary search. # find first potential match, ie first suffix after or equal to pattern, # such that pattern potentially matches first n characters of suffix. sub bsearch { my ( $indref, $pat, $sufref ) = @_; my $mid; my ( $lo, $hi ) = ( 0, $#$indref ); while ( 1 ) { $mid = int( ( $lo + $hi ) / 2 ); return $mid if $hi == $lo; if( ( $pattern cmp $$sufref[ $$indref[ $mid ] ] ) < 0 ) { $hi = $mid; } else { $lo = $mid + 1; } } } }; timethese( -5, { Suffixbinary => $sufbin, Naive => $naive, } ); #output abualiga:~$ ./benchmarkPatternSearch.pl patternSearchData.txt Benchmark: running Naive, Suffixbinary for at least 5 CPU seconds... Naive: 6 wallclock secs ( 5.32 usr + 0.00 sys = 5.32 CPU) @ 925.75/s (n=4925) Suffixbinary: 6 wallclock secs ( 5.33 usr + 0.00 sys = 5.33 CPU) @ 232758.91/s (n=1240605)