Beefy Boxes and Bandwidth Generously Provided by pair Networks
go ahead... be a heretic
 
PerlMonks  

Re: Challenge: "Words" In A String

by ikegami (Patriarch)
on Sep 19, 2008 at 05:48 UTC ( [id://712455]=note: print w/replies, xml ) Need Help??


in reply to Challenge: "Words" In A String

There.

A trie is not only smaller, it's faster than a hash in this situation.

  • Hash: O(N3), where N is the size of the input string.

    for my $i (0..$N-1) { for my $j ($i..$N-1) { my $substr = substr($s, $i, $j-$i+1); if ( exists( $hash{$substr} ) { ... } } }

    Remember, both the hash lookup and the substr iterate over (a portion of) the input string.

  • Trie: O(N2), where N is the size of the input string.

    for my $i (0..$N-1) { my $substr = substr($s, $i); my @results = $trie->lookup($substr); for (@results) { ... } }

Then I build a tree that differentiates word branches from non-word branches. There are comments, but they're rather cryptic. I included debug output to help understand.

use strict; use warnings; use constant DEBUG => 1; use constant DICT => "2of4brif.txt"; my $trie; sub load_dict { # # Constructs a trie from the dictionary. # open(my $fh, '<', DICT) or die("Unable to open dictionary \"" . DICT . "\": $!\n"); while (<$fh>) { chomp; my $p = \$trie; for ( split(//, $_), "\0\0" ) { $p = \( $$p->{$_} ); } } } sub words_from { my ($str) = @_; my @letters = split(//, $str); my @lengths; my $p = $trie; my $i = 0; for my $i ( 0 .. $#letters ) { last if !exists( $p->{ $letters[$i] } ); $p = $p->{ $letters[$i] }; push @lengths, $i+1 if exists( $p->{ "\0\0" } ); } return @lengths; } sub find_substrs { my ($str) = @_; my @w_substrs; { # # First, construct the following structure from the input: # # p e n i s l a n d # ------------------- # [p e n] # [p e n i s] # [i s] # [i s l a n d] # [l a n d] # [a n] # [a n d] # ------------------- # 3 2 4 2 # 5 6 3 # for my $i ( 0 .. length( $str )-1 ) { $w_substrs[$i] = [ words_from( substr( $str, $i ) ) ]; } } if (DEBUG) { require Data::Dumper; Data::Dumper->import(qw( Dumper )); no warnings 'once'; local $Data::Dumper::Terse = 1; local $Data::Dumper::Indent = 0; print( 'w_substrs: ', Dumper(\@w_substrs), "\n" ); } my @n_substrs; { # # Then, construct the following structure from the input: # # ------------------- # p e n i s l a n d # ------------------- # [p] <-- Delete (Leads to nothing) # [p e] <-- Delete (Leads to nothing) # [p e n] <-- Delete and proceed (Word) # [e] <-- Delete (Leads to nothing) # [e n] <-- Keep (Leads to "is") # [e n i] <-- Delete (Leads to nothing) # [e n i s] <-- Delete and proceed (Word) # [n] <-- Keep (Leads to "is") # [n i] <-- Delete (Leads to nothing) # [n i s] <-- Delete and proceed (Word) # [s] <-- Keep (Leads to "land") # [s l] <-- Keep (Leads to "an") # [s l a] <-- Delete (Leads to nothing) # [s l a n] <-- Delete and proceed (Word) # [l] <-- Delete (Leads to nothing) # [l a] <-- Delete (Leads to nothing) # [l a n] <-- Delete and proceed (Word) # [a] <-- Delete (Leads to nothing) # [a n] <-- Delete and proceed (Word) # [n] <-- Delete (Leads to nothing) # [n d] <-- Keep (Leads to end) # [d] <-- Keep (Leads to end) # ------------------- # 2 1 1 1 2 1 # 2 # # The actual implementation differs from above. # While the worse case is O(N^2), the usual # case is far more likely to resemble O(N). # my $j = @w_substrs; for my $i ( reverse 0 .. $#w_substrs ) { if ( @{$w_substrs[$i]} && $j-$i >= $w_substrs[$i][0] ) { $n_substrs[$i] = [ ]; } elsif ( $j == @w_substrs ) { $n_substrs[$i] = [ $j-$i ]; } else { $n_substrs[$i] = [ map { $_+($j-$i) } 0, @{ $n_substrs[$j] + } ]; } if ( @{$w_substrs[$i]} ) { $j = $i; } } } if (DEBUG) { require Data::Dumper; Data::Dumper->import(qw( Dumper )); no warnings 'once'; local $Data::Dumper::Terse = 1; local $Data::Dumper::Indent = 0; print( 'n_substrs: ', Dumper(\@n_substrs), "\n" ); } return [ \@w_substrs, \@n_substrs ]; } sub list_substrs { my ($str, $substrs) = @_; my ($w_substrs, $n_substrs) = @$substrs; local *w_helper = sub { my ($i) = @_; my @results; for my $l ( @{ $w_substrs->[$i] } ) { my $substr = substr( $str, $i, $l ); if ($i + $l == @$w_substrs) { push @results, [ $substr ]; } else { push @results, map [ $substr, @$_ ], n_helper( $i + $l ); } } return @results; }; local *n_helper = sub { my ($i) = @_; my @results = w_helper( $i ); for my $l ( @{ $n_substrs->[$i] } ) { my $substr = "[" . substr( $str, $i, $l ) . "]"; if ($i + $l == @$n_substrs) { push @results, [ $substr ]; } else { push @results, map [ $substr, @$_ ], w_helper( $i + $l ); } } return @results; }; return map join( ' ', @{$_->[0]} ), sort { $a->[1] <=> $b->[1] || $a->[2] <=> $b->[2] } map [ $_, scalar(grep /^\[/, @$_), scalar(@$_) ], n_helper(0); } { load_dict(); for my $input (qw( penisland zatxtaz xapenx )) { print( "$input\n" ); print( ( "-" x length($input) ), "\n" ); my $substrs = find_substrs( $input ); for ( list_substrs( $input, $substrs ) ) { print( "$_\n" ); } print( "\n" ); } }
penisland --------- w_substrs: [[3,5],[],[],[2,6],[],[4],[2,3],[],[]] n_substrs: [[],[2],[1],[],[1,2],[1],[],[2],[1]] pen island penis land pen is land penis [l] and pen is [l] and penis [l] an [d] pen is [l] an [d] zatxtaz ------- w_substrs: [[],[2],[],[],[2],[],[]] n_substrs: [[1],[],[2],[1],[],[2],[1]] [z] at [x] ta [z] xapenx ------ w_substrs: [[],[3],[3],[],[],[]] n_substrs: [[1,2],[1],[],[3],[2],[1]] [x] ape [nx] [xa] pen [x]

Replies are listed 'Best First'.
Re^2: Challenge: "Words" In A String
by gone2015 (Deacon) on Sep 19, 2008 at 14:18 UTC

    A trie ! Beautiful :-)

    For repeated use it's obvious to consider preprocessing the dictionary into the trie, and reading that each time.

    Of interest is how beautifully this trie will compress, by folding the matching sub-tries. Code to illustrate this given below. On my machine the result is:

    Loading dictionary '2of4brif.txt'... done -- 1.30 Secs Writing trie '2of4brif.trie'... 22599/138676 done -- 1.35 Secs Reading trie '2of4brif.trie'... done -- 0.25 Secs Walking the tries... done -- 1.94 Secs Original trie is: 27.1M, new trie is: 5.4M
    showing the degree of compression (22599/138676), and that reading the trie takes 0.25 Secs where loading the dictionary takes 1.30 Secs. Result :-)

    (The memory footprint is also reduced, but that's not hugely significant.)


    use strict; use warnings; use constant DICT => "2of4brif.txt"; use constant TRIE => "2of4brif.trie" ; sub get_cpu { my $t = (times)[0] ; if (@_) { $t = sprintf('%4.2f Secs', $t - $_[0]) ; } ; return $t ; } ; my $took ; print STDERR "Loading dictionary '", DICT, "'" ; $took = get_cpu() ; my $trie = load_dict(DICT) ; print STDERR "... done -- ", get_cpu($took), "\n" ; print STDERR "Writing trie '", TRIE, "'" ; $took = get_cpu() ; my ($now, $was) = write_trie(TRIE, $trie) ; print STDERR "... done -- ", get_cpu($took), "\n" ; print STDERR "Reading trie '", TRIE, "'" ; $took = get_cpu() ; my $check = read_trie(TRIE) ; print STDERR "... done -- ", get_cpu($took), "\n" ; print STDERR "Walking the tries" ; $took = get_cpu() ; walk('', $trie, $check) ; print STDERR "... done -- ", get_cpu($took), "\n" ; use Devel::Size qw(total_size) ; printf STDERR "Original trie is: %3.1fM, new trie is: %3.1fM\n", total_size( $trie)/(1024 * 1024), total_size($check)/(1024 * 1024) ; #=================================================================== +====================== # Loading the Dictionary into the Trie. sub load_dict { my ($d_name) = @_ ; # Constructs a trie from the dictionary. open(my $fh, '<', $d_name) or die("Unable to open dictionary '$d_name': $!\n"); my $trie = undef ; while (<$fh>) { s/\s+$// ; # chomp; my $p = \$trie; for ( split(//, $_), '!' ) { $p = \( $$p->{$_} ) ; } } return $trie ; } #=================================================================== +====================== # Writing the Trie. my %node_map ; my $idx ; my $node_count ; sub write_trie { my ($t_name, $trie) = @_ ; open(my $fh, '>', $t_name) or die("Unable to create trie '$t_name': $!\n"); $node_count = 0 ; $idx = 1 ; # Index 0 reserved for what '!' points at ! %node_map = ('!' => "$idx") ; # Preset end of word node write_node($fh, $trie) ; close $fh ; return ($idx, $node_count) ; } ; sub write_node { my ($fh, $node) = @_ ; $node_count++ ; my @chs = sort keys %$node ; # NB '!' sorts to the front ! my @n = $chs[0] eq '!' ? (shift @chs) : () ; foreach my $ch (@chs) { my $p = write_node($fh, $node->{$ch}) ; push @n, $ch.$p ; } ; my $n = join(' ', @n) ; my $p ; unless (defined($p = $node_map{$n})) { $p = $node_map{$n} = sprintf('%X', ++$idx) ; # Assign new index and record print $fh $n, "\n" ; # Output new node } ; return $p ; } ; #=================================================================== +====================== # Reading the Trie. my @nodes ; sub read_trie { my ($t_name) = @_ ; open(my $fh, '<', $t_name) or die("Unable to open trie '$t_name': $!\n"); @nodes = (undef, {'!' => undef}) ; while (<$fh>) { push @nodes, { map { my ($c, $p) = unpack('AA*', $_) ; ($c, $nodes[hex($p)]) } split } ; # Note hex('') += 0 } ; close $fh ; my $trie = pop @nodes ; @nodes = () ; return $trie ; } ; #=================================================================== +====================== # Walk two tries to check they are identical sub walk { my ($w, $ra, $rb) = @_ ; my @ac = sort keys %$ra ; my @bc = sort keys %$rb ; if (@ac != @bc) { die "node length mismatch \@ '$w': (@ac) vs (@bc +)" ; } ; for my $i (0..$#ac) { if ($ac[$i] ne $bc[$i]) { die "node mismatch \@ '$w': (@ac) vs ( +@bc)" ; } ; } ; foreach my $ch (@ac) { my $ad = $ra->{$ch} ; my $bd = $rb->{$ch} ; if ($ch eq '!') { if (defined($ad)) { die "'!' with defined down ($ad) in 'a' \@ + '$w'" ; } ; if (defined($bd)) { die "'!' with defined down ($bd) in 'b' \@ + '$w'" ; } ; } else { walk($w.$ch, $ad, $bd) ; } ; } ; } ;
Re^2: Challenge: "Words" In A String
by Limbic~Region (Chancellor) on Sep 19, 2008 at 13:59 UTC
    ikegami,
    I am not sure when I will get a chance to decipher this but thank you. My unfinished solution is derived from the code I wrote here.

    Another idea I had but passed on was a two pass approach. The first pass would mark where each word in the dictionary overlapped the target word. The second pass would use a heuristic approach to a variation of the bin packing problem. I abandoned it because it seemed unnecessary given the expected length of input strings would usually be less than 30 characters and contain 2 to 3 words from the dictionary.

    Cheers - L~R

Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Node Status?
node history
Node Type: note [id://712455]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this?Last hourOther CB clients
Other Users?
Others wandering the Monastery: (3)
As of 2024-04-24 21:35 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found