#!/usr/bin/perl # POD can be found at the bottom of this script use strict; use warnings; use Compress::Zlib; use Getopt::Long; use Pod::Usage; my $VERSION = 0.81; my $dictfile = 'dict.gz'; # Process command-line options my %cl_options = ( help => '', version => '', token_debug => '', glossary_output => '', dictionary_output => '' ); GetOptions( 'help|?' => \$cl_options{help}, 'version' => \$cl_options{version}, 'man' => \$cl_options{man}, 'token-debug' => \$cl_options{token_debug}, 'glossary' => \$cl_options{glossary_output}, 'dictionary' => \$cl_options{dictionary_output} ); print "This is version $VERSION of $0.\n" if $cl_options{version}; exit(0) if ($cl_options{version}); pod2usage(-exitstatus => 0, -verbose => 1, -msg => "Help for $0") if $cl_options{help}; pod2usage(-exitstatus => 0, -verbose => 2, -msg => "Man page for $0") if $cl_options{man}; my $file = shift; my %dictionary = readdict(\$dictfile); my %glossary; findwords(); printlexicon(\%dictionary) if $cl_options{dictionary_output}; printlexicon(\%glossary) if $cl_options{glossary_output}; # Readdict reads in the dictionary file defined above using # the Compress:Zlib CPAN module. It returns a hash that is # used for all further dictionary operations. # sub readdict { my $dict = shift; my %dicthash; my $gz = gzopen($$dict, "rb") or die "Cannot open $$dict: $gzerrno\n" ; while ($gz->gzreadline($_) > 0) { chomp; $dicthash{lc($_)} = 0; } die "Error reading from $$dict: $gzerrno\n" if $gzerrno != Z_STREAM_END ; return %dicthash; } # findwords() reads in a file and compares words found in the file # with the contents of the dictionary read in by the readdict # function. It assigns counts to the elements of %dictionary and # creates %glossary elements and increases its values according to # the number of matches. sub findwords { open my $if, "<", $file || die "Could not open $file: $!"; while (<$if>) { chomp; my @elements = split(/[ '-]/,$_); # split on hyphens, too foreach my $element (@elements) { next if $element =~ /\d/; # Don't need digits print "[$element]->" if $cl_options{token_debug}; $element = lc($element); $element =~ s/[\s,!?._;«»)("'-]//g; print "[$element]\n" if $cl_options{token_debug}; next if $element eq ''; if ( exists $dictionary{$element} ) { $dictionary{$element}++; } else { $glossary{$element}++; } } } } # Showmatches reads in a lexicon hash via a reference and prints all words out # that have been seen in the findwords() function along with a frequency count. # sub printlexicon { my $lexicon = shift; my $counter = 0; foreach my $key (sort keys %$lexicon) { if ( $$lexicon{$key} > 0 ) { print $key . " : " . $$lexicon{$key} . "\n"; $counter++; } } print "\n$counter entries total\n"; } __END__ =pod =head1 dict-compare A generic script for building dictionaries by comparing them to real-world texts. =head1 DESCRIPTION This program compares the words in a given text file to a list of words from a dictionary file. It is capable of outputting lists of words that occur or do not occur in a given dictionary file, along with their frequency in the text. Debugging output using token tag marks is also available. =head1 SYNOPSIS C output_file> =head2 OPTIONS =over 12 =item C<--help,-h,-?> Prints a usage help screen. =item C<--man,-m> Prints out the manual entry for $0 =item C<--version,-v> Prints out the program version. =item C<--glossary> Prints a glossary of words not found in the dictionary file and the number of times they occur. =item C<--dictionary> Prints out the words from the text that had a dictionary match, along with their respective frequencies. =item C<--token-debug> Prints tags around each token in the text to help sound out strange tokens. The tokens themselves are printed side-by-side to show how the script cleans up the results. =back =head1 EXAMPLE C This command reads in the text contained in myfile.txt and prints out a list of words not found in the dictionary and their frequencies. =back =head1 DICTIONARY FORMAT The dictionary is a one-word-per-line file that has been gzipped. Your dictionary can be anything. Think of the possibilities. =head1 THANKS The following people have reviewed and offered inprovements to this code: =over 12 =item B L =item B L =item B L =item B L =item B L =back And of course all of the others at the Monastery, Cologne.pm whose help can only be seen in its cumulative effect. =head1 AUTHOR Damon "allolex" Davison - =head1 LICENSE This code is released under the same terms as Perl itself. =cut