#!/usr/bin/perl
use Modern::Perl '2011';
use autodie;
use Benchmark qw/ timethese /;

# benchmark pattern searching using naive approach vs suffix arrays 

# Input file layout: line 1 holds the pattern, line 2 the text to search.
# Require an explicit file name: a three-argument open with an undef path
# does not fail, it opens an empty anonymous temporary file.
my $file = shift @ARGV;
die "usage: $0 <data-file>\n" unless defined $file;

open my $fh, '<', $file;    # autodie reports a failed open

# Read only the two lines that are used rather than slurping the whole file.
my $pattern = <$fh>;
my $str     = <$fh>;
close $fh;

die "$file: expected a pattern line followed by a text line\n"
	unless defined $pattern && defined $str;
chomp( $pattern, $str );

# Naive contender: scan the file-level $str for the file-level $pattern with
# the regex engine. Returns the list of 0-based start offsets.
my $naive = sub {
	return _naive_positions( $str, $pattern );
};

# Return the 0-based start offset of every occurrence of $pat in $text,
# overlapping occurrences included, in ascending order.
#   $text - string to search
#   $pat  - literal string to look for (regex metacharacters are not special)
sub _naive_positions {
	my ( $text, $pat ) = @_;
	my @positions;

	# The zero-width lookahead consumes nothing, so the following attempt
	# starts one character further on and overlapping hits are kept.
	# \Q...\E quotes the pattern so it is matched as plain text, the same
	# way a string comparison would treat it.
	while ( $text =~ /(?=\Q$pat\E)/g ) {
		push @positions, $-[0];
	}
	#say "@positions";
	return @positions;
}

# Suffix-array contender: search the file-level $str for the file-level
# $pattern. The suffix array is rebuilt on every call, so construction cost is
# part of what gets timed. Returns the list of 0-based start offsets.
my $sufbin = sub {
	return _suffix_positions( $str, $pattern );
};

# Return the 0-based start offset of every occurrence of $pat in $text,
# overlapping occurrences included, in suffix-array (lexical) order.
#   $text - string to search; it is only read, never modified, so repeated
#           calls always see the complete text
#   $pat  - literal string to look for
sub _suffix_positions {
	my ( $text, $pat ) = @_;

	# Every non-empty suffix, stored at the index of its start offset.
	# No '$' sentinel is appended: cmp already orders a string before any
	# longer string it is a prefix of, and a sentinel would let a pattern
	# containing '$' match past the end of the text.
	my @suff = map { substr $text, $_ } 0 .. length( $text ) - 1;

	# lexically ordered suffix array
	my @indices = sort { $suff[$a] cmp $suff[$b] } 0 .. $#suff;
	#for ( 0 .. $#indices ) {
	#	say $indices[$_]+1, ": ", $suff[ $indices[$_] ];
	#}

	my $start   = bsearch( \@indices, $pat, \@suff );
	my $pat_len = length $pat;

	# Suffixes that begin with the pattern sit next to each other in the
	# sorted order, so collect them until the first one that does not.
	# The prefix test is a plain string comparison, matching the literal
	# comparisons the binary search relies on.
	my @positions;
	for my $index ( $start .. $#indices ) {
		my $offset = $indices[$index];
		last
			if substr( $suff[$offset], 0, $pat_len ) ne $pat;
		push @positions, $offset;    # + 1 here for 1-based positions
	}
	return @positions;
}

# binary search.
# Find the first potential match, ie the first suffix lexically equal to or
# after the pattern.
#   $indref - ref to the suffix array (suffix indices in lexical order)
#   $pat    - pattern to locate
#   $sufref - ref to the suffixes, addressed by the values in @$indref
# Returns a position in @$indref: that of the first suffix ge $pat, the last
# position when every suffix sorts before $pat, or 0 for an empty array.
# The caller still has to confirm the suffix found really starts with $pat.
sub bsearch {
	my ( $indref, $pat, $sufref ) = @_;
	my ( $lo, $hi ) = ( 0, $#$indref );

	# An empty array gives $hi == -1, so the loop is skipped and 0 returned.
	while ( $lo < $hi ) {
		my $mid = int( ( $lo + $hi ) / 2 );

		if ( $pat le $sufref->[ $indref->[$mid] ] ) {
			$hi = $mid;        # answer is at $mid or to its left
		}
		else {
			$lo = $mid + 1;    # answer is strictly right of $mid
		}
	}
	return $lo;
}

# Time both searchers. A negative count asks Benchmark to keep running each
# entry until it has used at least that many CPU seconds (5 here).
my %contenders = (
	Naive        => $naive,
	Suffixbinary => $sufbin,
);
timethese( -5, \%contenders );
# Sample output:
# abualiga:~$ ./benchmarkPatternSearch.pl patternSearchData.txt
# Benchmark: running Naive, Suffixbinary for at least 5 CPU seconds...
#      Naive:  6 wallclock secs ( 5.32 usr +  0.00 sys =  5.32 CPU) @ 925.75/s (n=4925)
# Suffixbinary:  6 wallclock secs ( 5.33 usr +  0.00 sys =  5.33 CPU) @ 232758.91/s (n=1240605)