http://www.perlmonks.org?node_id=712455


in reply to Challenge: "Words" In A String

There.

A trie is not only smaller, it's faster than a hash in this situation.

  • Hash: O(N3), where N is the size of the input string.

    for my $i (0..$N-1) { for my $j ($i..$N-1) { my $substr = substr($s, $i, $j-$i+1); if ( exists( $hash{$substr} ) { ... } } }

    Remember, both the hash lookup and the substr iterate over (a portion of) the input string.

  • Trie: O(N2), where N is the size of the input string.

    for my $i (0..$N-1) { my $substr = substr($s, $i); my @results = $trie->lookup($substr); for (@results) { ... } }

Then I build a tree that differentiates word branches from non-word branches. There are comments, but they're rather cryptic. I included debug output to help understand.

use strict; use warnings; use constant DEBUG => 1; use constant DICT => "2of4brif.txt"; my $trie; sub load_dict { # # Constructs a trie from the dictionary. # open(my $fh, '<', DICT) or die("Unable to open dictionary \"" . DICT . "\": $!\n"); while (<$fh>) { chomp; my $p = \$trie; for ( split(//, $_), "\0\0" ) { $p = \( $$p->{$_} ); } } } sub words_from { my ($str) = @_; my @letters = split(//, $str); my @lengths; my $p = $trie; my $i = 0; for my $i ( 0 .. $#letters ) { last if !exists( $p->{ $letters[$i] } ); $p = $p->{ $letters[$i] }; push @lengths, $i+1 if exists( $p->{ "\0\0" } ); } return @lengths; } sub find_substrs { my ($str) = @_; my @w_substrs; { # # First, construct the following structure from the input: # # p e n i s l a n d # ------------------- # [p e n] # [p e n i s] # [i s] # [i s l a n d] # [l a n d] # [a n] # [a n d] # ------------------- # 3 2 4 2 # 5 6 3 # for my $i ( 0 .. length( $str )-1 ) { $w_substrs[$i] = [ words_from( substr( $str, $i ) ) ]; } } if (DEBUG) { require Data::Dumper; Data::Dumper->import(qw( Dumper )); no warnings 'once'; local $Data::Dumper::Terse = 1; local $Data::Dumper::Indent = 0; print( 'w_substrs: ', Dumper(\@w_substrs), "\n" ); } my @n_substrs; { # # Then, construct the following structure from the input: # # ------------------- # p e n i s l a n d # ------------------- # [p] <-- Delete (Leads to nothing) # [p e] <-- Delete (Leads to nothing) # [p e n] <-- Delete and proceed (Word) # [e] <-- Delete (Leads to nothing) # [e n] <-- Keep (Leads to "is") # [e n i] <-- Delete (Leads to nothing) # [e n i s] <-- Delete and proceed (Word) # [n] <-- Keep (Leads to "is") # [n i] <-- Delete (Leads to nothing) # [n i s] <-- Delete and proceed (Word) # [s] <-- Keep (Leads to "land") # [s l] <-- Keep (Leads to "an") # [s l a] <-- Delete (Leads to nothing) # [s l a n] <-- Delete and proceed (Word) # [l] <-- Delete (Leads to nothing) # [l a] <-- Delete (Leads to nothing) # [l a n] <-- Delete and proceed (Word) # [a] <-- Delete (Leads to nothing) # [a n] <-- Delete and proceed (Word) # [n] <-- Delete (Leads to nothing) # [n d] <-- Keep (Leads to end) # [d] <-- Keep (Leads to end) # ------------------- # 2 1 1 1 2 1 # 2 # # The actual implementation differs from above. # While the worse case is O(N^2), the usual # case is far more likely to resemble O(N). # my $j = @w_substrs; for my $i ( reverse 0 .. $#w_substrs ) { if ( @{$w_substrs[$i]} && $j-$i >= $w_substrs[$i][0] ) { $n_substrs[$i] = [ ]; } elsif ( $j == @w_substrs ) { $n_substrs[$i] = [ $j-$i ]; } else { $n_substrs[$i] = [ map { $_+($j-$i) } 0, @{ $n_substrs[$j] + } ]; } if ( @{$w_substrs[$i]} ) { $j = $i; } } } if (DEBUG) { require Data::Dumper; Data::Dumper->import(qw( Dumper )); no warnings 'once'; local $Data::Dumper::Terse = 1; local $Data::Dumper::Indent = 0; print( 'n_substrs: ', Dumper(\@n_substrs), "\n" ); } return [ \@w_substrs, \@n_substrs ]; } sub list_substrs { my ($str, $substrs) = @_; my ($w_substrs, $n_substrs) = @$substrs; local *w_helper = sub { my ($i) = @_; my @results; for my $l ( @{ $w_substrs->[$i] } ) { my $substr = substr( $str, $i, $l ); if ($i + $l == @$w_substrs) { push @results, [ $substr ]; } else { push @results, map [ $substr, @$_ ], n_helper( $i + $l ); } } return @results; }; local *n_helper = sub { my ($i) = @_; my @results = w_helper( $i ); for my $l ( @{ $n_substrs->[$i] } ) { my $substr = "[" . substr( $str, $i, $l ) . "]"; if ($i + $l == @$n_substrs) { push @results, [ $substr ]; } else { push @results, map [ $substr, @$_ ], w_helper( $i + $l ); } } return @results; }; return map join( ' ', @{$_->[0]} ), sort { $a->[1] <=> $b->[1] || $a->[2] <=> $b->[2] } map [ $_, scalar(grep /^\[/, @$_), scalar(@$_) ], n_helper(0); } { load_dict(); for my $input (qw( penisland zatxtaz xapenx )) { print( "$input\n" ); print( ( "-" x length($input) ), "\n" ); my $substrs = find_substrs( $input ); for ( list_substrs( $input, $substrs ) ) { print( "$_\n" ); } print( "\n" ); } }
penisland --------- w_substrs: [[3,5],[],[],[2,6],[],[4],[2,3],[],[]] n_substrs: [[],[2],[1],[],[1,2],[1],[],[2],[1]] pen island penis land pen is land penis [l] and pen is [l] and penis [l] an [d] pen is [l] an [d] zatxtaz ------- w_substrs: [[],[2],[],[],[2],[],[]] n_substrs: [[1],[],[2],[1],[],[2],[1]] [z] at [x] ta [z] xapenx ------ w_substrs: [[],[3],[3],[],[],[]] n_substrs: [[1,2],[1],[],[3],[2],[1]] [x] ape [nx] [xa] pen [x]