Beefy Boxes and Bandwidth Generously Provided by pair Networks
Syntactic Confectionery Delight
 
PerlMonks  

Statistics::SGT

by jdporter (Canon)
on Aug 16, 2004 at 03:13 UTC ( #383177=sourcecode: print w/ replies, xml ) Need Help??

Category: Math/Statistics
Author/Contact Info

jdporter


Description: Statistics::SGT - Simple Good-Turing Frequency Estimator
use Statistics::SGT;
The function Statistics::SGT::sgt takes a set of ( int:frequency, int:frequency_of_frequency ) pairs, and applies the Simple Good-Turing technique for estimating the probabilities corresponding to the observed frequencies, and P.0, the joint probability of all unobserved species.

The input should consist of a list of array-refs, each of which contains two positive integers: an observed frequency, and the frequency of that frequency.

No checks are made for linearity; the routine simply assumes that the requirements for using the SGT estimator are met.

The output is a list of ( int:frequency, [0,1.0]:estimated_probability ) pairs. The set of frequencies is the same as in the input array, plus the addition of a ( 0, P.0 ) element.

See also: http://www.grsampson.net/RGoodTur.html

=head1 NAME

Statistics::SGT - Simple Good-Turing Frequency Estimator

=head1 SYNOPSIS

  use Statistics::SGT;

=head1 DESCRIPTION

The function C<Statistics::SGT::sgt> takes a set of
I<S<( int:frequency, int:frequency_of_frequency )>> pairs, and
applies the Simple Good-Turing technique for estimating
the probabilities corresponding to the observed frequencies,
and P.0, the joint probability of all unobserved species.

The input should consist of a list of array-refs, each of which
contains two positive integers: an observed frequency, and the
frequency of that frequency.

No checks are made for linearity; the routine simply assumes that the
requirements for using the SGT estimator are met.

The output is a list of I<S<( int:frequency, [0,1.0]:estimated_probabi
+lity )>> pairs.
The set of frequencies is the same as in the input array,
plus the addition of a I<S<( 0, P.0 )>> element.

=head1 REFERENCES

This is a Perl adaptation of a C program originally written by:

B<Geoffrey Sampson,
School of Cognitive and Computing Sciences,
University of Sussex (England),
1995-06-27.>

The Simple Good-Turing technique was devised by William A. Gale of AT&
+T Bell Labs.
It is described in:

B<Gale, William A., and Geoffrey Sampson,
'Good-Turing Frequency Estimation Without Tears',
I<Journal of Quantitative Linguistics> 2.217-37, 1995.>

See also: http://www.grsampson.net/RGoodTur.html

=cut

package Statistics::SGT;

use Exporter;
use strict;
use vars qw( @ISA @EXPORT_OK $VERSION $MIN_INPUT $EPSILON );

@ISA = qw(Exporter);
@EXPORT_OK = qw( &sgt );

$VERSION = '0.01';

$MIN_INPUT = 5;

$EPSILON = 0.00000001; # use for n, in place of 0, to prevent divide-b
+y-zero.


{
  package SGT_::Smoother;

  sub slope { $_[0]{'slope'} }
  sub intercept { $_[0]{'intercept'} }

  sub smoothed { # double (int)
    my $self = shift;
    my $i = shift;
    exp( $self->intercept + $self->slope * log($i) );
  }

  sub new {
    my $pkg = shift;
    my $X_name = shift;
    my $Y_name = shift;
    my $ar = shift; # array ref (\@recs)
    ref($ar) && $ar =~ /\bARRAY\b/ or die "SGT_::Smoother::new($ar) --
+ arg not an arrayref!";
    @$ar > 0 or die "SGT_::Smoother::new($ar) -- array passed is empty
+!";

    my $meanX = 0.0;
    my $meanY = 0.0;
    for ( @$ar ) {
      $meanX += $_->{ $X_name };
      $meanY += $_->{ $Y_name };
    }
    $meanX /= scalar(@$ar);
    $meanY /= scalar(@$ar);

    my $XYs = 0.0;
    my $Xsquares = 0.0;
    for ( @$ar ) {
      $XYs += ($_->{ $X_name } - $meanX) * ($_->{ $Y_name } - $meanY);
      $Xsquares += square($_->{ $X_name } - $meanX);
    }
    $Xsquares != 0 or die "SGT_::Smoother::new: \$Xsquares is zero!";
    my $slope = $XYs / $Xsquares;
    bless {
      'slope'     => $slope,
      'intercept' => $meanY - $slope * $meanX,
    }, $pkg;
  }

  sub square($) { $_[0] ** 2.0 }

}


=head1 FUNCTIONS

=head2 sgt()

The input consists of a set of pairs of numbers ( r, n ).

Each pair can be in the form of an array, in which case the [0] elemen
+t is
taken for r, and the [1] element is taken for n.

Alternatively, it can take the form of a hash, in which case it must
contain an {'r'} element and an {'n'} element, which are used directly
+.

If called in a list context, this function returns a list of 'records'
+,
each of which is a hash, described below.  The array is ordered by
the [0] (r) element of each record.

If called in a scalar context, this function returns the records in
a hash, which is returned by reference.  This hash is keyed by the 'r'
element of each record.

If called in a void context, nothing is returned; this may be useful i
+f the
package variable $SET_OUTVALS is set to true prior to calling this fun
+ction.
If that variable is true, the values of the input array are modified i
+n the
following way:  if the value is an arrayref (r in [0] and n in [1]), t
+hen
the derived 'p' value is placed in position [2]; if the value is a has
+href,
then an element 'p' is added to the hash.

$SET_OUTVALS is false by default: the input array is not modified in a
+ny way.

The resulting records are hashrefs, each with the following elements:

   r
   n
   p       -- final derived result for this r

The records also contain some derived values only used internally to a
+lgorithm:

   log_r
   log_Z
   rStar
   j       -- the record's index in the record array.

This function throws an exception on any detected error.

=cut

#
#     in_*   *_AofA => [ [ ] ]
#    out_*   *_AofH => [ { } ]
#  inout_*   *_HofA => [ [ ] ]
#            *_HofH => { { } }
#
# rtn => ( AofA | AofH | HofA | HofH )
#

sub sgt {

  my %args = @_;

  my( $in_key, $out_key, $rtn_type ) = do {

    my @in_keys  = grep { /^(-?)in/i   } keys %args;
    my @out_keys = grep { /out_/i      } keys %args;
    my @rtn_keys = grep { /^(-?)rtn$/i } keys %args;

    my $rrrr = @in_keys ? 'many' : 'few';
    @in_keys == 1 or  die "Too $rrrr IN ARG specifiers! (must be exact
+ly 1)";
    @out_keys > 1 and die "Too many OUT ARG specifiers! (must be at mo
+st 1)";
    @rtn_keys > 1 and die "Too many RETURN  specifiers! (must be at mo
+st 1)";

    ( $in_keys[0], $out_keys[0], @rtn_keys ? $args{$rtn_keys[0]} : und
+ef );
  };


  my $in_arg_ref  = $args{$in_key};
  my $out_arg_ref = defined $out_key ? $args{$out_key} : undef ; # may
+ be undef

  # $rtn_type is a string of the form "AofH".

  ref($in_arg_ref) or die "IN ARG value must be either a array ref or 
+an hash ref!";

  if ( $out_arg_ref ) {
    ref($out_arg_ref) or die "OUT ARG value must be either a array ref
+ or an hash ref!";
  }


  # returns a vector of two single-character strings, e.g. ( 'a', 'h' 
+).
  my $extype = sub {
    my $kt = shift;
    defined $kt or return(' ',' '); # undef: no key.
    lc($kt) =~ /(?:^|_)([ah])of?([ah])$/ or die "Arg key type '$kt' do
+esn't end with _[AH]of[AH] !!!";
    ( $1, $2 )
  };

  my @in_arg_type  = $extype->( $in_key );
  my @out_arg_type = $extype->( $out_key );
  my @rtn_type     = $extype->( $rtn_type );

  my $in_arg_type  = join '', @in_arg_type;
  my $out_arg_type = join '', @out_arg_type;
     $rtn_type     = join '', @rtn_type;

  #
  # barf if the stupid user does something stupid like
  #   in_HofA => \@foo
  # or
  #   inout_AofH => \%foo
  #

  my %ul = qw( a ARRAY h HASH );

  $in_arg_ref =~ /\b$ul{$in_arg_type[0]}\b/ or die "Actual IN ARG valu
+e does not match specified type!
($ul{$in_arg_type[0]} specified, $in_arg_ref passed.";

  if ( $out_arg_ref ) {
    $out_arg_ref =~ /\b$ul{$out_arg_type[0]}\b/ or die "Actual OUT ARG
+ value does not match specified type!
($ul{$out_arg_type[0]} specified, $out_arg_ref passed.";
  }

  #
  # r and n are both supposed to be non-zero positive integers ("natur
+al numbers").
  #
  my %get_elem_rn = (
      'a' => sub {
        my $r = shift;
        @$r >= 2 or die "Too few elements in input element array";
        @$r;
      },

      'h' => sub {
        my $rec_hr = shift;
    my $default_r = shift;
    my $r = $rec_hr->{'r'};
        exists $rec_hr->{'n'} or die "Missing 'n' field in input eleme
+nt hash";
    if ( defined $r and defined $default_r ) {
      $r == $default_r or 
            die "'r' value in element ($r) != key of input hash ($defa
+ult_r) !!!";
    }
    unless ( defined $r ) { # what to do if 'r' not present
      if ( defined $default_r ) {
        $r = $default_r;
      }
      else {
        die "Missing 'r' field in input element hash";
      }
    }
        ( $r, $rec_hr->{'n'} );
      },
  );

  #
  # our working data structures.
  # %recs is the main thing; it is keyed by 'r' value.
  # @recs is essentially a perlish kludge for a doubly-linked list;
  #   these are the only operations it needs to support:
  #     prev/next; is_first/is_last;
  #   the elements of @recs are ordered by 'r' value.
  #
  my %recs;
  my @recs;

  #
  # if input is AofA, (r,n) is taken from [0,1]; both must be defined.
  # if input is AofH, (r,n) is taken from {'r','n'}; both must be defi
+ned.
  # if input is HofA, (r,n) is taken from [0,1], and [0] must equal th
+e hash key.
  # if input is HofH, (r,n) is taken from {'r','n'}, according to the 
+following rules:
  #    {'r'} may be missing, in which case (r) is taken from the hash 
+key;
  #    if {'r'} is present, it must equal the hash key.
  #
{
  my @keys = $in_arg_type[0] eq 'a' ? () : keys %$in_arg_ref;
  my @in_recs = $in_arg_type[0] eq 'a' ? @$in_arg_ref : @{$in_arg_ref}
+{@keys};


  for my $i ( 0 .. $#in_recs ) {

    my $el = $in_recs[$i];
    ref($el) or die "Input element not a reference! ($el)";
    $el =~ /\b$ul{$in_arg_type[1]}\b/ or die "Input element ref does n
+ot match specified type!
($ul{$in_arg_type[1]} specified, $in_arg_ref passed.";

    my( $r, $n ) = $get_elem_rn{$in_arg_type[1]}->($el, $keys[$i]); # 
+default r

    $recs{$r} = {
      'r' => $r,
      'n' => $n || $EPSILON,
      'log_r' => log($r),
    };
  }
}


  #
  # %recs has now been initialized from the input data.
  #

  exists $recs{1} or
    die "Error: no value '1' in input!";

  #@recs = sort { $a->{'r'} <=> $b->{'r'} } values %recs;
  @recs = @recs{ sort { $a <=> $b } keys %recs };

  @recs >= $MIN_INPUT or
    die "Not enough input values (\$Statistics\::SGT\::MIN_INPUT = $MI
+N_INPUT)";

  for my $j ( 0 .. $#recs ) {
    $recs[$j]{'j'} = $j;  # let records find themselves in the @recs a
+rray.
  }

  #
  # now we're done initializing our working structures (@recs, %recs) 
+from the input data;
  # commence to analyse the data:
  #

  for ( @recs ) {
    my $j = $_->{'j'};
    my $i = (
      $j == 0                   # first row?
        ? 0
        : $recs[$j - 1]{'r'}    # 'r' of previous row
    );

    my $k = (
      $j == $#recs              # last row?
        ? (2 * $_->{'r'} - $i)  # use the same delta as on the last st
+ep.
        : $recs[$j + 1]{'r'}    # 'r' of next row
    );

    ($k - $i) or die "Statistics::SGT::sgt(): divide by zero immanent!
+";
    $_->{'log_Z'} = log( 2 * $_->{'n'} / ($k - $i) );
  }

  my $smoother = new SGT_::Smoother 'log_r', 'log_Z', \@recs;

  my $indiffValsSeen = 0; # false
  for ( @recs ) {
    $smoother->smoothed($_->{'r'})
      or die "Statistics::SGT::sgt(): divide by zero immanent!";

    my $y = ($_->{'r'} + 1) * $smoother->smoothed($_->{'r'} + 1) / $sm
+oother->smoothed($_->{'r'});

    exists $recs{$_->{'r'} + 1} or
      $indiffValsSeen = 1; # true

    unless ( $indiffValsSeen ) {
      my $next_n = $recs{$_->{'r'} + 1}{'n'};
      my $x = ($_->{'r'} + 1) * $next_n / $_->{'n'}; # n=0 elements ha
+ve been removed from input.
      if (
        abs($x - $y)
      <=
        1.96 * sqrt(
            SGT_::Smoother::square($_->{'r'} + 1.0) *      $next_n
          /
            SGT_::Smoother::square($_->{'n'}      ) * (1 + $next_n / $
+_->{'n'})
        )
      ) {
        $indiffValsSeen = 1; # true
      }
      else {
        $_->{'rStar'} = $x;
      }
    }

    $indiffValsSeen and
      $_->{'rStar'} = $y;
  }


  my $bigN = 0; # int
  for ( @recs ) {
    $bigN += $_->{'r'} * $_->{'n'};
  }
  $bigN or die "divide-by-zero immanent! (bigN)";
  my $P_0 = $recs{1}{'n'} / $bigN; # double
  my $bigNprime = 0.0; # double
  for ( @recs ) {
    $bigNprime += $_->{'rStar'} * $_->{'n'};
  }
  $bigNprime or die "divide-by-zero immanent! (bigNprime)";
  for ( @recs ) {
    $_->{'p'} = (1 - $P_0) * $_->{'rStar'} / $bigNprime;
  }

#
# P_0 gets added to the output list, in position [0] and {0}.
# but first we check that there's nothing there already.
#
  exists $recs{0} and die "Already a P.0 element in the working data s
+et!";

  $recs{0} = { 'r' => 0, 'n' => 0, 'p' => $P_0 };
  unshift @recs, $recs{0};

  #
  # Now process the out param and/or return requests.
  #

  #
  # we already have AofH and HofH, suitable for returning.
  #

  # note that the default case is 'ah'... but '  ' also falls into it!
  # thus effectively making 'ah' the default return type.

  if ( $rtn_type eq 'ha' ) {
    return(
      wantarray
        ? ( map { $_ => [ @{$recs{$_}}{qw(r n p)} ] } keys %recs )
        : { map { $_ => [ @{$recs{$_}}{qw(r n p)} ] } keys %recs }
    );
  }
  elsif ( $rtn_type eq 'aa' ) {
    return(
      wantarray
        ? ( map { [ @{$_}{qw(r n p)} ] } @recs )
        : [ map { [ @{$_}{qw(r n p)} ] } @recs ]
    );
  }
  elsif ( $rtn_type eq 'hh' ) {
    return( wantarray ? %recs : \%recs );
  }
  else { # 'ah' and '  ' fall into this case:
    return( wantarray ? @recs : \@recs );
  }
}


=head1 AUTHOR, DATE, COPYRIGHT

Perl code by JDPORTER@cpan.org (John Porter), 2000-01-31.

This Perl module is free software, and may be modified and/or
redistributed under the same terms as Perl itself.

=cut

1;

Comment on Statistics::SGT
Download Code

Back to Code Catacombs

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: sourcecode [id://383177]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others romping around the Monastery: (9)
As of 2015-07-03 11:01 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (51 votes), past polls