=head1 NAME Statistics::SGT - Simple Good-Turing Frequency Estimator =head1 SYNOPSIS use Statistics::SGT; =head1 DESCRIPTION The function C takes a set of I> pairs, and applies the Simple Good-Turing technique for estimating the probabilities corresponding to the observed frequencies, and P.0, the joint probability of all unobserved species. The input should consist of a list of array-refs, each of which contains two positive integers: an observed frequency, and the frequency of that frequency. No checks are made for linearity; the routine simply assumes that the requirements for using the SGT estimator are met. The output is a list of I> pairs. The set of frequencies is the same as in the input array, plus the addition of a I> element. =head1 REFERENCES This is a Perl adaptation of a C program originally written by: B The Simple Good-Turing technique was devised by William A. Gale of AT&T Bell Labs. It is described in: B 2.217-37, 1995.> See also: http://www.grsampson.net/RGoodTur.html =cut package Statistics::SGT; use Exporter; use strict; use vars qw( @ISA @EXPORT_OK $VERSION $MIN_INPUT $EPSILON ); @ISA = qw(Exporter); @EXPORT_OK = qw( &sgt ); $VERSION = '0.01'; $MIN_INPUT = 5; $EPSILON = 0.00000001; # use for n, in place of 0, to prevent divide-by-zero. { package SGT_::Smoother; sub slope { $_[0]{'slope'} } sub intercept { $_[0]{'intercept'} } sub smoothed { # double (int) my $self = shift; my $i = shift; exp( $self->intercept + $self->slope * log($i) ); } sub new { my $pkg = shift; my $X_name = shift; my $Y_name = shift; my $ar = shift; # array ref (\@recs) ref($ar) && $ar =~ /\bARRAY\b/ or die "SGT_::Smoother::new($ar) -- arg not an arrayref!"; @$ar > 0 or die "SGT_::Smoother::new($ar) -- array passed is empty!"; my $meanX = 0.0; my $meanY = 0.0; for ( @$ar ) { $meanX += $_->{ $X_name }; $meanY += $_->{ $Y_name }; } $meanX /= scalar(@$ar); $meanY /= scalar(@$ar); my $XYs = 0.0; my $Xsquares = 0.0; for ( @$ar ) { $XYs += ($_->{ $X_name } - $meanX) * ($_->{ $Y_name } - $meanY); $Xsquares += square($_->{ $X_name } - $meanX); } $Xsquares != 0 or die "SGT_::Smoother::new: \$Xsquares is zero!"; my $slope = $XYs / $Xsquares; bless { 'slope' => $slope, 'intercept' => $meanY - $slope * $meanX, }, $pkg; } sub square($) { $_[0] ** 2.0 } } =head1 FUNCTIONS =head2 sgt() The input consists of a set of pairs of numbers ( r, n ). Each pair can be in the form of an array, in which case the [0] element is taken for r, and the [1] element is taken for n. Alternatively, it can take the form of a hash, in which case it must contain an {'r'} element and an {'n'} element, which are used directly. If called in a list context, this function returns a list of 'records', each of which is a hash, described below. The array is ordered by the [0] (r) element of each record. If called in a scalar context, this function returns the records in a hash, which is returned by reference. This hash is keyed by the 'r' element of each record. If called in a void context, nothing is returned; this may be useful if the package variable $SET_OUTVALS is set to true prior to calling this function. If that variable is true, the values of the input array are modified in the following way: if the value is an arrayref (r in [0] and n in [1]), then the derived 'p' value is placed in position [2]; if the value is a hashref, then an element 'p' is added to the hash. $SET_OUTVALS is false by default: the input array is not modified in any way. The resulting records are hashrefs, each with the following elements: r n p -- final derived result for this r The records also contain some derived values only used internally to algorithm: log_r log_Z rStar j -- the record's index in the record array. This function throws an exception on any detected error. =cut # # in_* *_AofA => [ [ ] ] # out_* *_AofH => [ { } ] # inout_* *_HofA => [ [ ] ] # *_HofH => { { } } # # rtn => ( AofA | AofH | HofA | HofH ) # sub sgt { my %args = @_; my( $in_key, $out_key, $rtn_type ) = do { my @in_keys = grep { /^(-?)in/i } keys %args; my @out_keys = grep { /out_/i } keys %args; my @rtn_keys = grep { /^(-?)rtn$/i } keys %args; my $rrrr = @in_keys ? 'many' : 'few'; @in_keys == 1 or die "Too $rrrr IN ARG specifiers! (must be exactly 1)"; @out_keys > 1 and die "Too many OUT ARG specifiers! (must be at most 1)"; @rtn_keys > 1 and die "Too many RETURN specifiers! (must be at most 1)"; ( $in_keys[0], $out_keys[0], @rtn_keys ? $args{$rtn_keys[0]} : undef ); }; my $in_arg_ref = $args{$in_key}; my $out_arg_ref = defined $out_key ? $args{$out_key} : undef ; # may be undef # $rtn_type is a string of the form "AofH". ref($in_arg_ref) or die "IN ARG value must be either a array ref or an hash ref!"; if ( $out_arg_ref ) { ref($out_arg_ref) or die "OUT ARG value must be either a array ref or an hash ref!"; } # returns a vector of two single-character strings, e.g. ( 'a', 'h' ). my $extype = sub { my $kt = shift; defined $kt or return(' ',' '); # undef: no key. lc($kt) =~ /(?:^|_)([ah])of?([ah])$/ or die "Arg key type '$kt' doesn't end with _[AH]of[AH] !!!"; ( $1, $2 ) }; my @in_arg_type = $extype->( $in_key ); my @out_arg_type = $extype->( $out_key ); my @rtn_type = $extype->( $rtn_type ); my $in_arg_type = join '', @in_arg_type; my $out_arg_type = join '', @out_arg_type; $rtn_type = join '', @rtn_type; # # barf if the stupid user does something stupid like # in_HofA => \@foo # or # inout_AofH => \%foo # my %ul = qw( a ARRAY h HASH ); $in_arg_ref =~ /\b$ul{$in_arg_type[0]}\b/ or die "Actual IN ARG value does not match specified type! ($ul{$in_arg_type[0]} specified, $in_arg_ref passed."; if ( $out_arg_ref ) { $out_arg_ref =~ /\b$ul{$out_arg_type[0]}\b/ or die "Actual OUT ARG value does not match specified type! ($ul{$out_arg_type[0]} specified, $out_arg_ref passed."; } # # r and n are both supposed to be non-zero positive integers ("natural numbers"). # my %get_elem_rn = ( 'a' => sub { my $r = shift; @$r >= 2 or die "Too few elements in input element array"; @$r; }, 'h' => sub { my $rec_hr = shift; my $default_r = shift; my $r = $rec_hr->{'r'}; exists $rec_hr->{'n'} or die "Missing 'n' field in input element hash"; if ( defined $r and defined $default_r ) { $r == $default_r or die "'r' value in element ($r) != key of input hash ($default_r) !!!"; } unless ( defined $r ) { # what to do if 'r' not present if ( defined $default_r ) { $r = $default_r; } else { die "Missing 'r' field in input element hash"; } } ( $r, $rec_hr->{'n'} ); }, ); # # our working data structures. # %recs is the main thing; it is keyed by 'r' value. # @recs is essentially a perlish kludge for a doubly-linked list; # these are the only operations it needs to support: # prev/next; is_first/is_last; # the elements of @recs are ordered by 'r' value. # my %recs; my @recs; # # if input is AofA, (r,n) is taken from [0,1]; both must be defined. # if input is AofH, (r,n) is taken from {'r','n'}; both must be defined. # if input is HofA, (r,n) is taken from [0,1], and [0] must equal the hash key. # if input is HofH, (r,n) is taken from {'r','n'}, according to the following rules: # {'r'} may be missing, in which case (r) is taken from the hash key; # if {'r'} is present, it must equal the hash key. # { my @keys = $in_arg_type[0] eq 'a' ? () : keys %$in_arg_ref; my @in_recs = $in_arg_type[0] eq 'a' ? @$in_arg_ref : @{$in_arg_ref}{@keys}; for my $i ( 0 .. $#in_recs ) { my $el = $in_recs[$i]; ref($el) or die "Input element not a reference! ($el)"; $el =~ /\b$ul{$in_arg_type[1]}\b/ or die "Input element ref does not match specified type! ($ul{$in_arg_type[1]} specified, $in_arg_ref passed."; my( $r, $n ) = $get_elem_rn{$in_arg_type[1]}->($el, $keys[$i]); # default r $recs{$r} = { 'r' => $r, 'n' => $n || $EPSILON, 'log_r' => log($r), }; } } # # %recs has now been initialized from the input data. # exists $recs{1} or die "Error: no value '1' in input!"; #@recs = sort { $a->{'r'} <=> $b->{'r'} } values %recs; @recs = @recs{ sort { $a <=> $b } keys %recs }; @recs >= $MIN_INPUT or die "Not enough input values (\$Statistics\::SGT\::MIN_INPUT = $MIN_INPUT)"; for my $j ( 0 .. $#recs ) { $recs[$j]{'j'} = $j; # let records find themselves in the @recs array. } # # now we're done initializing our working structures (@recs, %recs) from the input data; # commence to analyse the data: # for ( @recs ) { my $j = $_->{'j'}; my $i = ( $j == 0 # first row? ? 0 : $recs[$j - 1]{'r'} # 'r' of previous row ); my $k = ( $j == $#recs # last row? ? (2 * $_->{'r'} - $i) # use the same delta as on the last step. : $recs[$j + 1]{'r'} # 'r' of next row ); ($k - $i) or die "Statistics::SGT::sgt(): divide by zero immanent!"; $_->{'log_Z'} = log( 2 * $_->{'n'} / ($k - $i) ); } my $smoother = new SGT_::Smoother 'log_r', 'log_Z', \@recs; my $indiffValsSeen = 0; # false for ( @recs ) { $smoother->smoothed($_->{'r'}) or die "Statistics::SGT::sgt(): divide by zero immanent!"; my $y = ($_->{'r'} + 1) * $smoother->smoothed($_->{'r'} + 1) / $smoother->smoothed($_->{'r'}); exists $recs{$_->{'r'} + 1} or $indiffValsSeen = 1; # true unless ( $indiffValsSeen ) { my $next_n = $recs{$_->{'r'} + 1}{'n'}; my $x = ($_->{'r'} + 1) * $next_n / $_->{'n'}; # n=0 elements have been removed from input. if ( abs($x - $y) <= 1.96 * sqrt( SGT_::Smoother::square($_->{'r'} + 1.0) * $next_n / SGT_::Smoother::square($_->{'n'} ) * (1 + $next_n / $_->{'n'}) ) ) { $indiffValsSeen = 1; # true } else { $_->{'rStar'} = $x; } } $indiffValsSeen and $_->{'rStar'} = $y; } my $bigN = 0; # int for ( @recs ) { $bigN += $_->{'r'} * $_->{'n'}; } $bigN or die "divide-by-zero immanent! (bigN)"; my $P_0 = $recs{1}{'n'} / $bigN; # double my $bigNprime = 0.0; # double for ( @recs ) { $bigNprime += $_->{'rStar'} * $_->{'n'}; } $bigNprime or die "divide-by-zero immanent! (bigNprime)"; for ( @recs ) { $_->{'p'} = (1 - $P_0) * $_->{'rStar'} / $bigNprime; } # # P_0 gets added to the output list, in position [0] and {0}. # but first we check that there's nothing there already. # exists $recs{0} and die "Already a P.0 element in the working data set!"; $recs{0} = { 'r' => 0, 'n' => 0, 'p' => $P_0 }; unshift @recs, $recs{0}; # # Now process the out param and/or return requests. # # # we already have AofH and HofH, suitable for returning. # # note that the default case is 'ah'... but ' ' also falls into it! # thus effectively making 'ah' the default return type. if ( $rtn_type eq 'ha' ) { return( wantarray ? ( map { $_ => [ @{$recs{$_}}{qw(r n p)} ] } keys %recs ) : { map { $_ => [ @{$recs{$_}}{qw(r n p)} ] } keys %recs } ); } elsif ( $rtn_type eq 'aa' ) { return( wantarray ? ( map { [ @{$_}{qw(r n p)} ] } @recs ) : [ map { [ @{$_}{qw(r n p)} ] } @recs ] ); } elsif ( $rtn_type eq 'hh' ) { return( wantarray ? %recs : \%recs ); } else { # 'ah' and ' ' fall into this case: return( wantarray ? @recs : \@recs ); } } =head1 AUTHOR, DATE, COPYRIGHT Perl code by JDPORTER@cpan.org (John Porter), 2000-01-31. This Perl module is free software, and may be modified and/or redistributed under the same terms as Perl itself. =cut 1;