#! perl -slw
use strict;
use Data::Dump qw[ pp ];
use Time::HiRes qw[ time ];

# ---------------------------------------------------------------------------
# Setup: load the lexicon, build both lookup structures, open the input text.
# ---------------------------------------------------------------------------

# Read the word list, one entry per line. The file is opened explicitly so
# that a missing or unreadable list is fatal. Blank lines are dropped because
# an empty alternative in the regex built below would match the empty string.
my $lexicon_path = 'c:/test/words.txt';
open my $lexfile, '<', $lexicon_path
    or die "Cannot open lexicon '$lexicon_path': $!";
chomp( my @words = grep { /\S/ } <$lexfile> );
close $lexfile;
@words or die "Lexicon '$lexicon_path' contains no words\n";

# Hash form of the lexicon: each word maps to its associated payload.
my %lexicon; $lexicon{ $_ } = 'suplementary data' for @words;

# Regex form of the lexicon: a single alternation over every word.
#   * quotemeta   - entries are matched literally, even if they contain
#                   characters that are special in a pattern.
#   * \b ... \b   - only whole words are counted, not fragments that happen
#                   to sit inside a longer word, so the regex pass counts the
#                   same kind of thing as the hash pass.
#   * longest 1st - alternatives are tried left to right, so longer entries
#                   are preferred over their own prefixes.
#   * (?: ... )   - the match text is never read back, so no capture group.
#   * qr//        - the (potentially very large) pattern is compiled once
#                   here instead of being handled as a plain string later.
my $alternation = join '|',
    map  { quotemeta $_ }
    sort { length( $b ) <=> length( $a ) } @words;
my $re = qr/\b(?:$alternation)\b/;

#print $re; exit;

# The text to scan is named by the first command-line argument.
my $input_path = $ARGV[ 0 ];
defined $input_path or die "usage: $0 <text-file>\n";
open my $infile, '<', $input_path or die "Cannot open '$input_path': $!";

# ---------------------------------------------------------------------------
# Pass 1: hash lookup.
# Every run of ASCII letters in the input is treated as one word and looked up
# in %lexicon; the total number of words and the number of hits are tallied.
# NOTE(review): tokens are looked up exactly as they appear in the text (no
# case folding), whereas the regex pass lowercases each line first - confirm
# whether the two passes are meant to agree on case handling.
# ---------------------------------------------------------------------------
my $start1 = time;
seek $infile, 0, 0;
my $words  = 0;
my $found1 = 0;
while( my $line = <$infile> ) {
    # Progress indicator: current line number, rewritten in place.
    printf "\r$.\t";

    # Squash every run of non-letters (newline included) into a single space.
    $line =~ tr[a-zA-Z][ ]cs;

    my @tokens = split ' ', $line;
    $words  += @tokens;
    $found1 += grep { exists $lexicon{ $_ } } @tokens;
}
my $end1 = time;

printf "Finding $found1 words (of $words) took %f seconds using a hash\n", $end1 - $start1;

# ---------------------------------------------------------------------------
# Pass 2: regex alternation.
# Each line is reduced to lowercase letters separated by single spaces, and
# every match of the lexicon pattern $re in that line is counted.
# ---------------------------------------------------------------------------
my $start2 = time;

# Rewind the input for the second pass. seek does not reset the line counter,
# and each read increments $. before the loop body runs, so the counter must
# restart at 0 (not 1) for the progress display to begin at line 1.
seek $infile, 0, 0 or die "Cannot rewind input: $!";
$. = 0;

my $found2 = 0;
while( my $line = <$infile> ) {
    # Progress indicator: current line number, rewritten in place.
    printf "\r%d\t", $.;

    # Squash every run of non-letters into one space, then fold to lowercase.
    $line =~ tr[a-zA-Z][ ]cs;
    $line =~ tr[A-Z][a-z];

    # Scalar //g resumes after the previous match, so this counts all matches.
    ++$found2 while $line =~ m[$re]g;
}
my $end2 = time;

printf "Finding %d words took %f seconds using a trie(via regex engine)\n", $found2, $end2 - $start2;
__END__
C:\docs\OriginOfSpecies(Darwin)\2009-h>\perl5.18\bin\perl.exe \test\1043602.pl 2009-h.htm
Finding 203474 words (of 216808) took 0.173504 seconds using a hash
Finding 203474 words took 2072.099258 seconds using a trie(via regex engine)