Thanks again Tye!
I incorporated your changes: bsearch() is no longer accessing global variables, output is adjusted for a 'no match' case, and Modern::Perl does turn on warnings by default.
#!/usr/bin/perl
use Modern::Perl '2011';
use autodie;
# pattern searching using suffix arrays and binary search
open my $fh, '<', shift;
chomp( my( $pattern, $str ) = <$fh> );
$str .= '$';
my @suff; # hold suffixes
while ( 1 < length $str ) { # condition excludes '$' from @suff
push @suff, $str;
substr ( $str, 0, 1, '' );
}
# lexically ordered suffix array
my @indices = sort { $suff[$a] cmp $suff[$b] } 0 .. $#suff;
#for ( 0 .. $#indices ) {
# say $indices[$_]+1, ": ", $suff[ $indices[$_] ];
#}
my $start = bsearch( \ @indices, $pattern, \ @suff );
# get consecutive positions, if any,
# where pattern matches first n chars of suffix.
my @positions;
for my $index ( $start .. $#indices ) {
last
if $suff[ $indices[ $index ] ] !~ /^$pattern/;
push @positions, $indices[ $index ] + 1; # omit +1 if 0-based inde
+xing
}
# print results
if ( @positions ) { # check if a match exists at all
say "\nPattern \"$pattern\" found at position:";
for my $pos ( sort{ $a <=> $b } @positions ) {
#chop( my $s = $suff[ $pos - 1] ); #suffix
print "$pos ";
}
}
else {
say "\nPattern \"$pattern\" not found in string";
}
print "\n";
# binary search.
# find first potential match, ie first suffix after or equal to patter
+n,
# such that pattern potentially matches first n characters of suffix.
sub bsearch {
my ( $indref, $pat, $sufref ) = @_;
my $mid;
my ( $lo, $hi ) = ( 0, $#$indref );
while ( 1 ) {
$mid = int( ( $lo + $hi ) / 2 );
return $mid if $hi == $lo;
if( ( $pattern cmp $$sufref[ $$indref[ $mid ] ] ) < 0 ) {
$hi = $mid;
}
else {
$lo = $mid + 1;
}
}
}
I did a simple benchmark relative to what you would call a 'naive' pattern search, after checking that both give the same results, with a 9-char pattern and 8569-char string. I like the speed up.
#!/usr/bin/perl
use Modern::Perl '2011';
use autodie;
use Benchmark qw/ timethese /;
# benchmark pattern searching using naive approach vs suffix arrays
open my $fh, '<', shift;
chomp( my( $pattern, $str ) = <$fh> );
my $naive = sub {
my @positions;
while ( $str =~ /(?=($pattern))/g ) {
push @positions, $-[0];;
}
#say "@positions";
};
my $sufbin = sub {
$str .= '$';
my @suff; # hold suffixes
while ( 1 < length $str ) { # condition excludes '$' from @suff
push @suff, $str;
substr ( $str, 0, 1, '' );
}
# lexically ordered suffix array
my @indices = sort { $suff[$a] cmp $suff[$b] } 0 .. $#suff;
#for ( 0 .. $#indices ) {
# say $indices[$_]+1, ": ", $suff[ $indices[$_] ];
#}
my $start = bsearch( \ @indices, $pattern, \ @suff );
# get consecutive positions, if any,
# where pattern matches first n chars of suffix.
my @positions;
for my $index ( $start .. $#indices ) {
last
if $suff[ $indices[ $index ] ] !~ /^$pattern/;
push @positions, $indices[ $index ];# + 1; # omit +1 if 0-base
+d indexing
}
# binary search.
# find first potential match, ie first suffix after or equal to pa
+ttern,
# such that pattern potentially matches first n characters of suff
+ix.
sub bsearch {
my ( $indref, $pat, $sufref ) = @_;
my $mid;
my ( $lo, $hi ) = ( 0, $#$indref );
while ( 1 ) {
$mid = int( ( $lo + $hi ) / 2 );
return $mid if $hi == $lo;
if( ( $pattern cmp $$sufref[ $$indref[ $mid ] ] ) < 0 ) {
$hi = $mid;
}
else {
$lo = $mid + 1;
}
}
}
};
timethese( -5, {
Suffixbinary => $sufbin,
Naive => $naive,
} );
#output
abualiga:~$ ./benchmarkPatternSearch.pl patternSearchData.txt
Benchmark: running Naive, Suffixbinary for at least 5 CPU seconds...
Naive: 6 wallclock secs ( 5.32 usr + 0.00 sys = 5.32 CPU) @ 92
+5.75/s (n=4925)
Suffixbinary: 6 wallclock secs ( 5.33 usr + 0.00 sys = 5.33 CPU) @
+232758.91/s (n=1240605)