#!/usr/local/bin/perl -w ############################################################################################## # ngram.plx # v 0.2 # by Mike Phenicie (TheEnigma on PerlMonks - www.perlmonks.org) # 09-18-04 ############################################################################################## # Background: An analysis can be made of text samples, in which the frequency of the various # ngrams is calulated. An ngram is my term for character sequences of length n. I have seen # 2 character sequences refered to as both digrams and digraphs; and 3 character sequences # refered to as trigrams and trigraphs. So to refer in a generic way to sequences of any # length, I chose ngram. # # This program scans through a text file, and looks at every sequence of length n-1, noting # what character comes after it. It will create a hash of hashes that contains the number # of occurances of a particular character following a particular n-1 length sequence. In # other words, if there are 10 places in the text where 'te' is followed by 'r', then # $freq{'te'}->{'r'} == 10. # # This frequency table can be used to create a new text that has similarities to the original # text file. The new file will contain text with a very similar ratio of ngrams. For instance, # if the original text had a lot of occasions where the letters 'ne' were followed by # an 's' (with n=3), but not many occasions where it was followed by a 'w', then that should # be true in the new text as well. # # With low numbers of n (three or four, for instance), the only similarity might be that it # seems to be in the same language as the input. With higher values for n, the output will # start to resemble the input more and more. With a high enough value it's possible you might # get the exact same text for output as was input. ############################################################################################## # Switches # # -i file to analyze # -o computer generated text # -d default 3 # -v (1 to 10k) default 100 (how long the output is) # -s save frequency hash to a file # -l use existing frequency file for input # (can't specify -i or -s with this) # -t <"starting text"> default is random by script # -a a human readable frequency file # -e examine - ignores all other switches # -D (no display on monitor) if switch present, no display on monitor # -A (append) append to output file # # Usage: You must specify an -i or an -l (but not both) to tell the program what to use as input # You may specify an -s, but if you do, you can't use -l # You may specify an -o, and if you optionally also use -A, the file will be appended # The generated text will by default go to the monitor, turn this off with -D # You may specify -t to tell it what text you want to start with # You may specify -a to store the frequency data in a human readable file # You may specify an -e (which will ignore all other switches) to examine a freq file # If you don't specify -d, a default of 3 will be used # If you don't specify -v, a default of 100 will be used ############################################################################################### # to do # add help # make line breaks occur on spaces # try mixing two or more frequency files to create one output # prevent it from printing bad start text to the output file # # and...? ########################################################################## use strict; use Getopt::Std; use Storable qw(nstore); $| = 1; srand; my $name = "ngram.plx v 0.2"; my $wrap = 80; my(%freq, $ngram, $next_char, $freq_file, $freq_depth, $freq_name, $freq, @freq); # Get command line options. my($input, $output, $depth, $verbosity, $save, $load, $start_text, $ascii, $examine, $no_display, $append) = get_options(); if($examine){ $freq = Storable::retrieve("$examine") or die "Can't load frequency analysis data from $examine: $!"; ($freq_name, $freq_file, $freq_depth, %freq) = @$freq; print "\n\nFrequency file $examine created by $freq_name\n"; print "Based on text from $freq_file\n"; print "Depth = $freq_depth\n\n"; exit; } ################################################################## # Put frequency data in %freq, either by analyzing an input file, # or reading a previously saved frequency file. ################################################################## if($input){ print "\n\nOutput based on the file $input. Depth = $depth.\n\n"; %freq = create_freq_hash($input, $depth); } elsif($load){ eval{$freq = Storable::retrieve("$load") or die "Can't load frequency analysis data from $load: $!";}; if($@){print "ERROR: $load does not appear to be a valid frequency file\n"; exit;} ($freq_name, $freq_file, $freq_depth, %freq) = @$freq; $depth = $freq_depth; if($freq_name !~ /ngram\.plx/){ print "\n\nERROR: $load was not created by this script\n"; exit; } print "\n\nOutput based on the file $freq_file. Depth = $freq_depth.\n\n"; } ############################################## # If -s, save %freq to a file. ############################################## if($save){ @freq = ($name, $input, $depth, %freq); nstore(\@freq, "$save") or die "Can't store frequency analysis data to $save: $!"; } ######################################################## # If -a, store human readable version of frequency file ######################################################## if($ascii){ my($key, $value, $k, $v); open (DBG, ">$ascii"); while(($key, $value) = each %freq){ print DBG "\n\n#$key#\n"; while(($k, $v) = each %$value){ print DBG "#$k $v# "; } } close(DBG); } ################################## # Create and print the start text ################################## $start_text = create_start_text() unless $start_text; $ngram = substr($start_text, 1-$depth, $depth-1); open(OFH, ">$output") or die "Can't open $output for writing: $!" if $output && ! $append; open(OFH, ">>$output") or die "Can't open $output for appending: $!" if $output && $append; print "\n\n$start_text" unless $no_display; print OFH "$start_text" if $output; ############################ # Let's create some text! ############################ for my $i (length($start_text)+1 .. $verbosity-1){ $next_char = get_next_char($ngram); print "$next_char" unless $no_display; print OFH "$next_char" if $output; print OFH "\n" if($output && !($i % $wrap)); $ngram = substr($ngram, 1) . $next_char; } print "\n\n" unless $no_display; close(OFH); ############################################################################################ # SUBROUTINES # ############################################################################################ ################################################################################# # Get next character # # Given a $depth-1 length character string, and the frequency hash (%freq), this # routine will pick the next character in the output. It finds the value stored # in %freq that is keyed by $ngram. This value is another hash that has as its # keys all the possible letters that followed $ngram in the original text. The # value of each of these keys is the number of times that letter followed $ngram # in the original text. One of these letters is picked at random, with a higher # probability of picking a letter that appeared more often in the original text. # # There are several ways to do this (TMTOWTDI). This routine does it as follows: # Let's say the ngram is 'ceed'; and that 'ceed' is followed by 'e' 2 times, 's' # 1 time, and 'i' 5 times. An array with 8 elements (2+1+5) will be created. # 2 elements will contain 'e', 1 element will contain 's', and 5 elements will # contain 'i'. An element is picked at random to be returned. Thus, letters # that were more likely to follow the ngram in the original text will be more # likely to follow in the created text. ################################################################################# sub get_next_char { my($ngram) = @_; my $ptr = 0; my(@ary, $sub_hash); if(defined $freq{$ngram}){ $sub_hash = $freq{$ngram}; } else { print "\n\nERROR: The text you entered does not end with characters\n"; print "that are in the frequency hash.\n\n"; exit; } while(my($key, $value) = each %$sub_hash){ for($ptr..$value+$ptr-1){ $ary[$_] = $key; } $ptr += $value; } return $ary[rand($ptr)]; } ################################################################################# # Create start text # # This routine will first attempt to create an array, @keys, consisting of all # the first level keys of %freq that start with an uppercase letter followed # by a lowercase letter. This is so the output text will have a 'proper' start. # If this array is empty, meaning there are no capitalized words in the input # text, it will repopulate @keys with all the keys of %freq. In either case, # it will select one of the keys at random, to be used as the starting text # for the output. ################################################################################# sub create_start_text { my(@keys); for(keys %freq){ push(@keys, $_) if /^[A-Z][a-z]/; } @keys = keys %freq unless @keys; return $keys[rand(@keys)]; } ################################################################################# # Create frequency hash # # This routine opens an input file, and puts the entire contents into $text, # converting instances of more than one whitespace character in a row into one # whitespace character. It creates a hash with keys consisting of all the # ngrams (of length $depth-1) in the input text. The value will be another hash # with keys consisting of all the possible characters that follow that ngram in # the text. The value of each of those keys will be the number of times that # particular combination of ngram and following letter occur in the text. ################################################################################# sub create_freq_hash { my($input, $depth) = @_; my(@input, %freq, @text, $text, $ngram, $ptr, $next_char); # Make one long string, collapsing all multiple whitespace into one space open (IFH, "$input") or die "Can't open $input for reading: $!"; @text = ; $text = join('', @text); $text =~ s/\s+/ /g; close (IFH); $ngram = substr($text,0,$depth-1); $ptr = $depth-1; for(0..length($text)-$depth){ $next_char = substr($text,$ptr,1); $freq{"$ngram"}->{"$next_char"}++; $ngram = substr($ngram,1) . $next_char; $ptr++; } return %freq; } ################################################################################# # Get command line switches # # -i: Specifies the input file to run frequency analysis on. Mutually exclusive # with -l, so if -l is also specified, it will quit with an error message. # # -o: Specifies an output file to write results to. If switch is not present # results will not be written to a file. If -A is specified, results will # be appended to whatever the output file is. # # -d: Specifies the depth of analysis. In other words, if depth is '3', the # program will look at the last two letters output, and based on the # frequency analysis for that digram, pick a letter to follow. # # -v: Specifies the length of the output (verbosity). # # -s: Specifies a file in which to store the frequency analysis. Also stored # in the file will be the name and version of this script, the name of the # input file the analysis was based on, and the depth of the analysis. # Mutually exclusive with -l, so if -l is also specified, it will quit with # an error message. # # -l: Specifies a frequency file to load and use. Mutually exclusive with -i # and -s, so if either -i or -s is specified, it will quit with an error message. # # -t: Specifies starting text for the output. The script will start with the # last d-1 characters of the text (where d is the depth), and do its thing # from there. If -t is not specified, the script will start with a random # d-1 character sequence from the frequency hash. If you specify your own # text and it contains spaces, then the text must be enclosed in "". # # -a: Specifies the name of a file in which to store a human readable version # of the frequency analysis. This works just like the -s switch, except # the frequency data is human readable. An excerpt of a file follows: # # #ki# # #l 1# #n 20# # # #tr# # #y 3# #e 22# #u 17# #a 21# #i 18# #o 5# # # This would mean that: # # sequence is followed by this many times in the original text # ki l 1 # ki n 20 # tr y 3 # tr e 22 # tr u 17 # tr a 21 # tr i 18 # tr o 5 # (for $64,000, Name That Text! ;) # # -e: Specifies the name of a frequency file to examine. It will cause # the program to print out the following embedded information from the # frequency file: the name and version of this script, the name of the input # file the frequency analysis is based on, and the depth used in the # analysis. If this switch is present, all other switches are ignored. # # -D: If specified, will not display output on monitor. # # -A: Output file will be appended to, not overwritten. ################################################################################# sub get_options { my(%opts, $input, $output, $depth, $verbosity, $save, $load, $start_text, $ascii, $examine, $no_display, $append); getopts('i:o:d:v:s:l:t:a:e:DA', \%opts); #### check for mutually exclusive cases and lack of both -i and -l if(defined $opts{i} && defined $opts{l}){ print "You may not specify both the -i and -l switches\n"; exit } if(! defined $opts{i} && ! defined $opts{l}){ print "You have to specify one of the -i or -l switches\n"; exit } if(defined $opts{s} && defined $opts{l}){ print "You may not specify both the -s and -l switches\n"; exit; } ########################### check for switches ################### if(defined $opts{e}){ $examine = $opts{e}; if(! -e $examine){ print "Frequency file $examine does not exist.\n"; exit; } return ($input, $output, $depth, $verbosity, $save, $load, $start_text, $ascii, $examine, $no_display, $append); } $no_display = $opts{D} || 0; $append = $opts{A} || 0; if(defined $opts{i}){ $input = $opts{i}; do {print "Input file $input does not exist\n"; exit} unless -e $input; do {print "Input file $input does not appear to be a text file\n"; exit} unless -T "$input"; } if(defined $opts{l}){ $load = $opts{l}; if(! -e $load){ print "Frequency file $load does not exist.\n"; exit; } } if(defined $opts{s}){ $save = "$opts{s}"; if(-e $save){ print "$save already exists. Overwrite? n\b"; my $answer = ; chomp($answer); $answer = lc($answer); exit unless $answer eq "y"; } } if(defined $opts{o}){ $output = $opts{o}; if(-e $output && ! $append){ print "$output already exists. Overwrite? n\b"; my $answer = ; chomp($answer); $answer = lc($answer); exit unless $answer eq "y"; } } if(defined $opts{d}){ $depth = $opts{d}; if($depth < 2 || $depth > 10){ print "Depth must be from 2 to 10, inclusive\n"; exit; } } else { $depth = 3; } if(defined $opts{v}){ $verbosity = $opts{v}; if($verbosity < 10 || $verbosity > 10000){ print "Length must be from 10 to 10,000 characters, inclusive\n"; exit; } } else { $verbosity = 100; } if(defined $opts{t}){ $start_text = $opts{t}; if(length($start_text) < $depth){ print "\n\nERROR: The length of the text you supplied with the -t option must be\n"; print "at least $depth characters long, because that is what you set depth to.\n\n"; exit; } } if(defined $opts{a}){ $ascii = "$opts{a}"; if(-e $ascii){ print "$ascii already exists. Overwrite? n\b"; my $answer = ; chomp($answer); $answer = lc($answer); exit unless $answer eq "y"; } } return ($input, $output, $depth, $verbosity, $save, $load, $start_text, $ascii, $examine, $no_display, $append); }