#!/usr/bin/env perl use warnings; use strict; use open qw/:std :utf8/; use Time::HiRes qw/gettimeofday tv_interval/; use Devel::Size qw/total_size/; die "Usage: $0 [WORDFILE]\n" unless @ARGV<2; my $WORDFILE = shift || '/usr/share/dict/words'; my %TRIE; # for trie_search() my @ALLWORDS; # for re_search() open my $ifh, '<:utf8', $WORDFILE or die $!; my $cnt=0; my $t0_build = [gettimeofday]; while (<$ifh>) { chomp; next if substr($_,-2) eq "'s"; push @ALLWORDS, $_; my $ptr = \%TRIE; # pointer into trie $ptr = ($$ptr{$_} ||= {}) for split //; $$ptr{"\0"}=1; # end-of-word marker $cnt++; } my $build_time_s = tv_interval($t0_build); close $ifh; print "Built a trie out of $cnt words in $build_time_s s\n"; print "Size of trie: ".total_size(\%TRIE) ." bytes\n"; print "Size of array: ".total_size(\@ALLWORDS)." bytes\n"; # just some stats, comment out if not needed use List::Util qw/max/; my $maxlen = max(map {length} @ALLWORDS); print "Longest word(s): $maxlen letters\n"; print "Those are:\n"; print "\t$_\n" for grep {length==$maxlen} @ALLWORDS; # user input loop my ($avg_trie_s,$avg_re_s,$avg_cnt) = (0,0,0); while(1) { print "Enter a word (blank=exit): "; chomp(my $word = ); last unless $word; # time trie search my $t0_trie = [gettimeofday]; my @found = trie_search($word); my $trie_time_s = tv_interval($t0_trie); @found = sort @found; print "$_\n" for @found; print "--- trie: $trie_time_s s\n"; $avg_trie_s += $trie_time_s; # time regex search my $t0_re = [gettimeofday]; # swap re_search with xor_search here to test that my @re_found = re_search($word); my $trie_re_s = tv_interval($t0_re); @re_found = sort @re_found; print "$_\n" for @re_found; print "--- regex: $trie_re_s s\n"; $avg_re_s += $trie_re_s; $avg_cnt++; # compare results warn "Error: found/re_found array length mismatch\n" unless @found==@re_found; $found[$_] eq $re_found[$_] or warn "Error: element $_ mismatch: found=$found[$_], " ."re_found=$re_found[$_]\n" for 0..$#found; } if ($avg_cnt>1) { $avg_trie_s /= $avg_cnt; $avg_re_s /= $avg_cnt; print "Avg. trie time=$avg_trie_s s, ", "Avg. regex time=$avg_re_s s\n"; } sub trie_search { my ($word) = @_; my @found; my @lett = split //, $word; my $ptr = \%TRIE; # pointer into trie for my $i (0..$#lett) { # for each letter in the word # inspect all other options at letter $i for my $opt (keys %$ptr) { next if $opt eq "\0"; my $tptr = $$ptr{$opt}; # temp pointer # walk through the rest of the letters in the trie $tptr = $$tptr{$lett[$_]} or last for $i+1..$#lett; if ($tptr && $$tptr{"\0"}) { # is this a full word? # swap out the one letter substr(my $match = $word, $i, 1) = $opt; push @found, $match unless $match eq $word; } } $ptr = $$ptr{$lett[$i]}; # walk trie } return @found; } sub re_search { my ($word) = @_; my $re = join '|', # build a regex for this word map {quotemeta(substr($word,0,$_)).'.' .quotemeta(substr($word,$_+1))} 0 .. length($word)-1; $re = qr/^(?:$re)$/; my @found = grep {/$re/ && $_ ne $word} @ALLWORDS; return @found; } sub xor_search { my ($word) = @_; # by AnomalousMonk, http://perlmonks.org/?node_id=1161596 my @found = grep { length($_)==length($word) && ( ($word ^ $_) =~ tr/\x00//c ) == 1 } @ALLWORDS; return @found; }