#!/usr/local/bin/perl -w
##############################################################################################
#       ngram.plx
#       v 0.2
#       by Mike Phenicie (TheEnigma on PerlMonks - www.perlmonks.org)
#       09-18-04
##############################################################################################
#  Background:  An analysis can be made of text samples, in which the frequency of the various
#  ngrams is calulated.  An ngram is my term for character sequences of length n.  I have seen
#  2 character sequences refered to as both digrams and digraphs; and 3 character sequences
#  refered to as trigrams and trigraphs.  So to refer in a generic way to sequences of any
#  length, I chose ngram.
#
#  This program scans through a text file, and looks at every sequence of length n-1, noting
#  what character comes after it.  It will create a hash of hashes that contains the number
#  of occurances of a particular character following a particular n-1 length sequence.  In
#  other words, if there are 10 places in the text where 'te' is followed by 'r', then
#  $freq{'te'}->{'r'} == 10.
#
#  This frequency table can be used to create a new text that has similarities to the original
#  text file.  The new file will contain text with a very similar ratio of ngrams.  For instance,
#  if the original text had a lot of occasions where the letters 'ne' were followed by
#  an 's' (with n=3), but not many occasions where it was followed by a 'w', then that should
#  be true in the new text as well.
#
#  With low numbers of n (three or four, for instance), the only similarity might be that it
#  seems to be in the same language as the input.  With higher values for n, the output will 
#  start to resemble the input more and more.  With a high enough value it's possible you might
#  get the exact same text for output as was input.
##############################################################################################
#                                    Switches
#
#             -i <input text file>              file to analyze
#             -o <output text file>             computer generated text
#             -d <depth (2-10)>                 default 3
#             -v <verbosity> (1 to 10k)         default 100 (how long the output is)
#             -s <frequency file>               save frequency hash to a file
#             -l <frequency file>               use existing frequency file for input
#                                               (can't specify -i or -s with this)
#             -t <"starting text">              default is random by script
#             -a <human readable freq file>     a human readable frequency file
#             -e <frequency file>               examine - ignores all other switches
#             -D (no display on monitor)        if switch present, no display on monitor
#             -A (append)                       append to output file
#
#  Usage:  You must specify an -i or an -l (but not both) to tell the program what to use as input
#          You may specify an -s, but if you do, you can't use -l
#          You may specify an -o, and if you optionally also use -A, the file will be appended
#          The generated text will by default go to the monitor, turn this off with -D
#          You may specify -t to tell it what text you want to start with
#          You may specify -a to store the frequency data in a human readable file
#          You may specify an -e (which will ignore all other switches) to examine a freq file
#          If you don't specify -d, a default of 3 will be used
#          If you don't specify -v, a default of 100 will be used
###############################################################################################
#         to do
# add help
# make line breaks occur on spaces
# try mixing two or more frequency files to create one output
# prevent it from printing bad start text to the output file
#
# and...?
##########################################################################
use strict;
use Getopt::Std;
use Storable qw(nstore); 

$| = 1;

srand;
my $name = "ngram.plx v 0.2";
my $wrap = 80;
my(%freq, $ngram, $next_char, $freq_file, $freq_depth, $freq_name, $freq, @freq);

# Get command line options.
my($input, $output, $depth, $verbosity, $save, $load, $start_text, $ascii, $examine, $no_display, $append) = get_options();

if($examine){
  $freq = Storable::retrieve("$examine") or die "Can't load frequency analysis data from $examine: $!";
  ($freq_name, $freq_file, $freq_depth, %freq) = @$freq;
  print "\n\nFrequency file $examine created by $freq_name\n";
  print "Based on text from $freq_file\n";
  print "Depth = $freq_depth\n\n";
  exit;
}

##################################################################
# Put frequency data in %freq, either by analyzing an input file,
# or reading a previously saved frequency file.
##################################################################
if($input){
  print "\n\nOutput based on the file $input.  Depth = $depth.\n\n";
  %freq = create_freq_hash($input, $depth);
} elsif($load){
  eval{$freq = Storable::retrieve("$load") or die "Can't load frequency analysis data from $load: $!";};
  if($@){print "ERROR: $load does not appear to be a valid frequency file\n"; exit;}
  ($freq_name, $freq_file, $freq_depth, %freq) = @$freq;
  $depth = $freq_depth;
  if($freq_name !~ /ngram\.plx/){
    print "\n\nERROR: $load was not created by this script\n";
    exit;
  }
  print "\n\nOutput based on the file $freq_file.  Depth = $freq_depth.\n\n";
}

##############################################
# If -s, save %freq to a file.
##############################################
if($save){
  @freq = ($name, $input, $depth, %freq);
  nstore(\@freq, "$save") or die "Can't store frequency analysis data to $save: $!";
}

########################################################
# If -a, store human readable version of frequency file
########################################################
if($ascii){
  my($key, $value, $k, $v);
  
  open (DBG, ">$ascii");
  while(($key, $value) = each %freq){
    print DBG "\n\n#$key#\n";
    while(($k, $v) = each %$value){
      print DBG "#$k $v#   ";
    }
  }
  close(DBG);
}

##################################
# Create and print the start text
##################################
$start_text = create_start_text() unless $start_text;
$ngram = substr($start_text, 1-$depth, $depth-1);

open(OFH, ">$output") or die "Can't open $output for writing: $!" if $output && ! $append;
open(OFH, ">>$output") or die "Can't open $output for appending: $!" if $output && $append;
print "\n\n$start_text" unless $no_display;
print OFH "$start_text" if $output;

############################
# Let's create some text!
############################
for my $i (length($start_text)+1 .. $verbosity-1){
  $next_char = get_next_char($ngram);
  print "$next_char" unless $no_display;
  print OFH "$next_char" if $output;
  print OFH "\n" if($output && !($i % $wrap));
  $ngram = substr($ngram, 1) . $next_char;
}

print "\n\n" unless $no_display;
close(OFH);

############################################################################################
#                                    SUBROUTINES                                           #
############################################################################################
#################################################################################
#                          Get next character
#
#  Given a $depth-1 length character string, and the frequency hash (%freq), this
#  routine will pick the next character in the output.  It finds the value stored
#  in %freq that is keyed by $ngram.  This value is another hash that has as its
#  keys all the possible letters that followed $ngram in the original text.  The
#  value of each of these keys is the number of times that letter followed $ngram
#  in the original text.  One of these letters is picked at random, with a higher
#  probability of picking a letter that appeared more often in the original text.
#
#  There are several ways to do this (TMTOWTDI). This routine does it as follows:
#  Let's say the ngram is 'ceed'; and that 'ceed' is followed by 'e' 2 times, 's'
#  1 time, and 'i' 5 times.  An array with 8 elements (2+1+5) will be created.
#  2 elements will contain 'e', 1 element will contain 's', and 5 elements will
#  contain 'i'.  An element is picked at random to be returned.  Thus, letters
#  that were more likely to follow the ngram in the original text will be more
#  likely to follow in the created text.
#################################################################################
sub get_next_char {
  my($ngram) = @_;
  
  my $ptr = 0;
  my(@ary, $sub_hash);
  
  if(defined $freq{$ngram}){
    $sub_hash = $freq{$ngram};
  } else {
    print "\n\nERROR:  The text you entered does not end with characters\n";
    print "that are in the frequency hash.\n\n";
    exit;
  }
  while(my($key, $value) = each %$sub_hash){
    for($ptr..$value+$ptr-1){
      $ary[$_] = $key;
    }
    $ptr += $value;
  }
  
  return $ary[rand($ptr)];
}

#################################################################################
#                           Create start text
#
#  This routine will first attempt to create an array, @keys, consisting of all
#  the first level keys of %freq that start with an uppercase letter followed 
#  by a lowercase letter.  This is so the output text will have a 'proper' start.
#  If this array is empty, meaning there are no capitalized words in the input
#  text, it will repopulate @keys with all the keys of %freq.  In either case,
#  it will select one of the keys at random, to be used as the starting text
#  for the output.
#################################################################################
sub create_start_text {
  my(@keys);
  
  for(keys %freq){
    push(@keys, $_) if /^[A-Z][a-z]/;
  }
  
  @keys = keys %freq unless @keys;
  return $keys[rand(@keys)];
}

#################################################################################
#                           Create frequency hash                                
#
#  This routine opens an input file, and puts the entire contents into $text,
#  converting instances of more than one whitespace character in a row into one
#  whitespace character.  It creates a hash with keys consisting of all the
#  ngrams (of length $depth-1) in the input text.  The value will be another hash
#  with keys consisting of all the possible characters that follow that ngram in
#  the text.  The value of each of those keys will be the number of times that
#  particular combination of ngram and following letter occur in the text.
#################################################################################
sub create_freq_hash {
  my($input, $depth) = @_;
  my(@input, %freq, @text, $text, $ngram, $ptr, $next_char);
  
  # Make one long string, collapsing all multiple whitespace into one space
  open (IFH, "$input") or die "Can't open $input for reading: $!";
  @text = <IFH>;
  $text = join('', @text);
  $text =~ s/\s+/ /g;
  close (IFH);
  
  $ngram = substr($text,0,$depth-1);
  $ptr = $depth-1;
  
  for(0..length($text)-$depth){
    $next_char = substr($text,$ptr,1);
    $freq{"$ngram"}->{"$next_char"}++;
    $ngram = substr($ngram,1) . $next_char;
    $ptr++;
  }
  
  return %freq;  
}

#################################################################################
#                           Get command line switches
#
#  -i: Specifies the input file to run frequency analysis on. Mutually exclusive
#      with -l, so if -l is also specified, it will quit with an error message.
#
#  -o: Specifies an output file to write results to.  If switch is not present
#      results will not be written to a file.  If -A is specified, results will
#      be appended to whatever the output file is.
#
#  -d: Specifies the depth of analysis.  In other words, if depth is '3', the
#      program will look at the last two letters output, and based on the
#      frequency analysis for that digram, pick a letter to follow.
#
#  -v: Specifies the length of the output (verbosity).
#
#  -s: Specifies a file in which to store the frequency analysis.  Also stored
#      in the file will be the name and version of this script, the name of the
#      input file the analysis was based on, and the depth of the analysis.            
#      Mutually exclusive with -l, so if -l is also specified, it will quit with
#      an error message.
#
#  -l: Specifies a frequency file to load and use.  Mutually exclusive with -i
#      and -s, so if either -i or -s is specified, it will quit with an error message.
#
#  -t: Specifies starting text for the output.  The script will start with the
#      last d-1 characters of the text (where d is the depth), and do its thing
#      from there.  If -t is not specified, the script will start with a random
#      d-1 character sequence from the frequency hash.  If you specify your own
#      text and it contains spaces, then the text must be enclosed in "".
#
#  -a: Specifies the name of a file in which to store a human readable version
#      of the frequency analysis.  This works just like the -s switch, except
#      the frequency data is human readable.  An excerpt of a file follows:
#   
#                  #ki#
#                  #l 1#   #n 20#   
#
#                  #tr#
#                  #y 3#   #e 22#   #u 17#   #a 21#   #i 18#   #o 5#
#
#      This would mean that:
#
#      sequence   is followed by    this many times in the original text
#         ki            l                     1
#         ki            n                    20
#         tr            y                     3
#         tr            e                    22
#         tr            u                    17
#         tr            a                    21
#         tr            i                    18
#         tr            o                     5
#                   (for $64,000, Name That Text! ;)
#
#  -e: Specifies the name of a frequency file to examine.  It will cause
#      the program to print out the following embedded information from the
#      frequency file: the name and version of this script, the name of the input
#      file the frequency analysis is based on, and the depth used in the
#      analysis.  If this switch is present, all other switches are ignored.
#
#  -D: If specified, will not display output on monitor.
#
#  -A: Output file will be appended to, not overwritten.
#################################################################################
sub get_options {
  my(%opts, $input, $output, $depth, $verbosity, $save, $load, $start_text, $ascii, $examine, $no_display, $append);
 
  getopts('i:o:d:v:s:l:t:a:e:DA', \%opts);
  
  #### check for mutually exclusive cases and lack of both -i and -l
  if(defined $opts{i} && defined $opts{l}){
    print "You may not specify both the -i and -l switches\n";
    exit
  }
  
  if(! defined $opts{i} && ! defined $opts{l}){
    print "You have to specify one of the -i or -l switches\n";
    exit
  }
  
  if(defined $opts{s} && defined $opts{l}){
    print "You may not specify both the -s and -l switches\n";
    exit;
  }

  ###########################  check for switches  ###################
  
  if(defined $opts{e}){
    $examine = $opts{e};
    if(! -e $examine){
      print "Frequency file $examine does not exist.\n";
      exit;
    }
    return ($input, $output, $depth, $verbosity, $save, $load, $start_text, $ascii, $examine, $no_display, $append);
  }
  
  $no_display = $opts{D} || 0;
  $append = $opts{A} || 0;
     
  if(defined $opts{i}){
    $input = $opts{i};
    do {print "Input file $input does not exist\n"; exit} unless -e $input;
    do {print "Input file $input does not appear to be a text file\n"; exit} unless -T "$input";
  }
  
  if(defined $opts{l}){
    $load = $opts{l};
    if(! -e $load){
      print "Frequency file $load does not exist.\n";
      exit;
    }
  }
  
  if(defined $opts{s}){
    $save = "$opts{s}";
    if(-e $save){
      print "$save already exists.  Overwrite? n\b";
      my $answer = <STDIN>;
      chomp($answer);
      $answer = lc($answer);
      exit unless $answer eq "y";
    }
  }

  if(defined $opts{o}){
    $output = $opts{o};
    if(-e $output && ! $append){
      print "$output already exists.  Overwrite? n\b";
      my $answer = <STDIN>;
      chomp($answer);
      $answer = lc($answer);
      exit unless $answer eq "y";
    }
  }
  
  if(defined $opts{d}){
    $depth = $opts{d};
    if($depth < 2 || $depth > 10){
      print "Depth must be from 2 to 10, inclusive\n";
      exit;
    }
  } else {
    $depth = 3;
  }
  
  if(defined $opts{v}){
    $verbosity = $opts{v};
    if($verbosity < 10 || $verbosity > 10000){
      print "Length must be from 10 to 10,000 characters, inclusive\n";
      exit;
    }
  } else {
    $verbosity = 100;
  }

  if(defined $opts{t}){
    $start_text = $opts{t};
    if(length($start_text) < $depth){
      print "\n\nERROR: The length of the text you supplied with the -t option must be\n";
      print "at least $depth characters long, because that is what you set depth to.\n\n";
      exit;
    }
  }
  
  if(defined $opts{a}){
    $ascii = "$opts{a}";
    if(-e $ascii){
      print "$ascii already exists.  Overwrite? n\b";
      my $answer = <STDIN>;
      chomp($answer);
      $answer = lc($answer);
      exit unless $answer eq "y";
    }
  }

  return ($input, $output, $depth, $verbosity, $save, $load, $start_text, $ascii, $examine, $no_display, $append);
}