#!/usr/bin/perl -w =head1 NAME Gstat - calculate G-statistic for tabular data =head1 SYNOPSIS use Gstat; $gt = Gstat->new($datafile); $degreesOfFreedom = $gt->getDF(); $gstat = $gt->getG(); $gt->setExpected($expectedvalues); $uncorrectedG = $gt->getRawG(); =head1 DESCRIPTION C is a class that calculates the G-statistic for goodness of fit for frequency data. It can be used on simple frequency distributions (1-way tables) or for analyses of independence (2-way tables). Note that C will B, by itself, perform the significance test for you -- it just provides the G-statistic that can then be compared with the chi-square distribution to determine significance. =head1 OVERVIEW and EXAMPLES A goodness of fit test attempts to determine if an observed frequency distribution differs significantly from a hypothesized frequency distribution. From C's point of view, these tests come in two flavors: 1-way tests (where a single frequency distribution is tested against an expected distribution) and 2-way tests (where a matrix of observed values is tested for independence -- that is, the lack of interaction effects among the two axes being measured). A simple example might help here. You've grown 160 plants from seed produced by a single parent plant. You observe that among the offspring plants, some have spiny leaves, some have hairy leaves, and some have smooth leaves. What is the likelihood that the distribution of this trait follows the expected values for simple Mendelian inheritance? Observed values: Spiny Hairy Smooth 95 53 12 Expected values (for a 9:3:3:1 ratio): 90 60 10 If the observed and expected values are put into two files, C can create a G-statistic object that will calculate the likelihood that the observed distribution is significantly different from the distribution that would be expected by simple inheritance. (The value of G for this comparison is approximately 1.495, with 2 degrees of freedom; the observed results are not significantly different from expected at the .05 -- or even .1 level.) 2-way tests will usually not need a table of expected values, as the expected values are generated from the observed value sums. However, one can be loaded for 2-way tables as well. To determine if the calculated G statistic indicates a statistically significant result, you will need to look up the values in a chi-square distribution on your own, or make use of the C module: use Gstat; use Statistics::Distributions; ... my $gt = Gstat->new($data); my $df = $gt->getDF(); my $g = $gt->getG(); my $sig = '.05'; my $chis=Statistics::Distributions::chisqrdistr ($df,$sv); if ($g > $chis) { print "$g: Sig. at the $sv level. ($chis cutoff)\n" } By default, C returns a G statistic that has been modified by William's correction (Williams 1976). This correction reduces the value of G for smaller sample sizes, and has progressively less effect as the sample size increases. The raw, uncorrected G statistic is also available. Calculation methods based on Sokal, R.R., and F.J. Rohlf, Biometry. 1981. W.H. Freeman and Company, San Francisco. Williams, D.A. 1976. Improved likelihood ratio test for complete contingency tables. Biometrika, 63:33 - 37. =cut package Gstat; use strict; use vars qw($VERSION); $VERSION = '0.01'; my $self; =head1 Public Methods =head2 Constructor $g = Gstat->new($datafile); $g = new Gstat($datafile); I is a file containing whitespace-delimited data counts, arranged into rows and columns. There must be B non-numeric characters, B empty cells, and B zero counts. =cut sub new { my $type = shift; my $basefile = shift; if (! $basefile) { _error('1', ["Must pass in a filename for a file containing the data.", "Ex: \$g = new Gstat(\$datafile);"]); } $self = {}; _initialize($basefile); bless($self, $type); $self; } =head2 getG $float = $g->getG(); Returns the corrected G-statistic for the current observed and expected frequency counts. =cut sub getG { my $self = shift; $self->{'logsum'} = _sumCellOp(); $self->{'G'} = getRawG($self) / _williamsC(); $self->{'G'}; } =head2 getRawG $float = $g->getRawG(); Returns the uncorrected G-statistic for the current observed and expected frequency counts. This value can be misleadingly large for small sample sizes (n < 200). =cut sub getRawG { my $self = shift; $self->{'logsum'} = _sumCellOp(); $self->{'G'} = 2 * $self->{'logsum'}; 2 * $self->{'logsum'}; } =head2 setExpected $g->setExpected($expectedValues); If testing with a specific hypothesized distribution, the expected frequency values for that distribution, given the total sample size, must be input to C. The expected frequencies must be in a file with the same format constraints as the initial I. =cut sub setExpected { my $self = shift; my $expFile = shift; my @expect; if (! open(EXP, "$expFile")) { _error('2', ["System error: cannot open '$expFile' for reading: $!."]); } while () { s/^\s*//; s/\s*$//; if ($_) { my @row = split(/\s+/); foreach my $cell (@row) { _checkNumValidity($cell); push(@expect, $cell); } } } close EXP; $self->{'expected'} = \@expect; } =head2 getDF $integer = $g->getDF(); Returns the current degrees of freedom for this distribution, which is calculated automatically from the observed data (I - 1 for 1-way tests, (I - 1) * (I - 1) for 2-way tests). =cut sub getDF { my $self = shift; $self->{'df'}; } =head2 setDF $g->setDF($integer); Sets the degrees of freedom for this distribution. Sometimes this value needs to be modified beyond the standard rules used by C; C makes this possible. =cut sub setDF { my $self = shift; $self->{'df'} = shift; } =head2 getRow $rowref = $g->getRow(rownum); Returns a row from the array of observed data. Row numbering is zero-based. =cut sub getRow { my $self = shift; my $idx = shift; my @row = (); my $skip = $self->{'ncols'}; my $offset = ($idx + 1) * $skip; my $final = $offset + $skip; for my $cell ($offset .. $final - 1) { push(@row, $self->{'observed'}->[$cell]); } \@row; } =head2 getCol $colref = $g->getCol(colnum); Returns a column for the array of observed data. Column numbering is zero-based. =cut sub getCol { my $self = shift; my $idx = shift; my @col = (); my $skip = $self->{'ncols'}; my $max = $skip * $self->{'nrows'}; for (my $cell = $idx+1; $cell < $max; $cell += $skip) { push(@col,$self->{'observed'}->[$cell]); } \@col; } =head2 rowSum $integer = $g->rowSum($index); Returns the sum of the requested row. =cut sub rowSum { my $self = shift; my $idx = shift; $self->{'rowsums'}->[$idx]; } =head2 colSum $integer = $g->colSum($index); Returns the sum of the requested column. =cut sub colSum { my $self = shift; my $idx = shift; $self->{'colsums'}->[$idx]; } =head2 getRowNum $integer = $g->getRowNum(); Returns the number of rows in the data table. =cut sub getRowNum { my $self = shift; $self->{'nrows'}; } =head2 getColNum $integer = $g->getColNum(); Returns the number of columns in the data table. =cut sub getColNum { my $self = shift; $self->{'ncols'}; } =head2 getSumTotal $integer = $g->getSumTotal(); Returns the total number of observations. =cut sub getSumTotal { my $self = shift; $self->{'sumtotal'}; } # private methods sub _initialize { my $basefile = shift; if (! open(IN, "$basefile")) { _error('2', ["System error: cannot open '$basefile' for reading: $!."]); } my @datastruct = (); my @rowsums = (); my @sumcols = (); my ($ncols, $nrows, $checkcol, $sumtotal, @row); $self->{'basefile'} = $basefile; $self->{'colsums'} = (); while () { s/^\s*//; s/\s*$//; if ($_) { my @row = split(/\s+/); $ncols = scalar @row; if ($nrows) { if ($ncols != $checkcol) { _error(4, ["Data Table error: missing data in table."]); } } my $i = 0; foreach my $cell (@row) { push(@datastruct, $cell); $sumtotal += $cell; $self->{'colsums'}->[$i] += $cell; $i++; } $nrows++; push(@rowsums, _rowsum(\@row)); } $checkcol = $ncols; } close IN; _checkData(\@datastruct); $self->{'observed'} = \@datastruct; $self->{'rowsums'} = \@rowsums; $self->{'nrows'} = $nrows; $self->{'ncols'} = $ncols; $self->{'sumtotal'} = $sumtotal; # tabletype: 0 = 2-way, 1 = single column, 2 = single row if (($ncols == 1) || ($nrows == 1)) { if ($ncols == 1) { $self->{'df'} = $nrows - 1; $self->{'tabletype'} = 1; } if ($nrows == 1) { $self->{'df'} = $ncols - 1; $self->{'tabletype'} = 2; } } else { $self->{'df'} = ($nrows - 1) * ($ncols - 1); $self->{'tabletype'} = 0; } $self->{'expected'}= _expected(\@datastruct); } sub _williamsC { my $sumtotal = $self->{'sumtotal'}; my $rowsums = $self->{'rowsums'}; my $colsums = $self->{'colsums'}; if ($self->{'tabletype'} == 1) { $self->{'q'} = 1 + ($self->{'nrows'} + 1) / (6 * $sumtotal); } elsif ($self->{'tabletype'} == 2) { $self->{'q'} = 1 + ($self->{'ncols'} + 1) / (6 * $sumtotal); } else { my $recipRows = 0; my $recipCols = 0; foreach my $rval (@{$rowsums}) { $recipRows += (1 / $rval); } foreach my $cval (@{$colsums}) { $recipCols += (1 / $cval); } my $denom = 6 * $sumtotal * $self->{'df'}; my $num = ($sumtotal * $recipRows - 1) * ($sumtotal * $recipCols - 1); $self->{'q'} = 1 + $num / $denom; } } sub _expected { my $data = shift; my @expected = (); my $rows = $self->{'nrows'}; my $cols = $self->{'ncols'}; my $sumtotal = $self->{'sumtotal'}; for my $row (0 .. $rows-1) { my $marginalRow = $self->{'rowsums'}->[$row] / $sumtotal; my @a; for my $col (0 .. $cols-1) { my $marginalCol = $self->{'colsums'}->[$col] / $sumtotal; push(@expected, ($marginalRow * $marginalCol * $sumtotal)); } } \@expected; } sub _sumCellOp { my $logsum = 0; my $ncells = $self->{'nrows'} * $self->{'ncols'}; my $o = $self->{'observed'}; my $e = $self->{'expected'}; for my $cell (0 .. $ncells-1) { $logsum += $o->[$cell] * log($o->[$cell] / $e->[$cell]); } $logsum; } sub _rowsum { my $row = shift; my $total = 0; foreach my $val (@{$row}) { $total += $val; } return $total; } sub _checkData { my $data = shift; foreach my $val (@{$data}) { _checkNumValidity($val); } } sub _checkNumValidity { my $value = shift; my $eString = "Invalid data."; if ($value !~ /^-?\d+\.?\d*$/) { _error(3, ["$eString Value '$value' is not a number."]); } if ($value == 0) { _error(4, ["$eString Found a zero frequency value. Can't continue."]); } } sub _error { my ($code, $message) = @_; my ($package, $filename, $line) = caller(1); print STDERR " Error: in $package: file $filename, line $line\n\n"; print STDERR " Error: ", join("\n ",@{$message}), "\n\n"; print STDERR "This is the stack trace:\n"; exit $code; } 1;