Hello All,
I have a 2d array that has all the values filled in correctly which was checked with Data::Dumper. I then used map to create a slice to make an array of just a column. Then, I pass this array to another function to add up the counts for the column and return the total.
The error occurs when trying to add all the values in each column. I have the "total" on the right hand side of each print. As you can see, the columns with only 0's some how add up to 60 and yet the column with actual values that should be well over 300, is only at 36. Does anyone have any ideas how to solve this?
Thanks for the help,
-Hans
Problem Code expected in the score_count function on line 135
#!/usr/bin/perl -w
use strict;
use POSIX;
use Getopt::Std;
use Data::Dumper;
my $usage = "\nusage: $0 [-flags] <fasta file> <output file> (must be
+all 1 type. ie all fasta NT, all fastq, all fasta fna)\n".
"The script is able to detect if fasta or fastq files have
+ been suplied. Depending on the flags, it produces different output.\
+n".
"It will produce a histogram, a vector distribution, and a
+ NT/AA distribution. For fastq files, you must use a flag to specify
+which version\n".
"of fastq the file is formatted with.\n\n".
"-l # Y or N to specify Solexa/Illumina < 1.3 Fas
+tq: Defaults N\n".
"-s # Y or N to specify Sanger Fastq: Defaults N\
+n".
"-u # Y or N to specify Illumina >= 1.3 Fastq: De
+faults N\n";
our($opt_l, $opt_s, $opt_u, $opt_b); # histogram interval
getopts('l:s:u:b:') or die $usage;
if (!defined($opt_l)) {$opt_l = 'N';}
if (!defined($opt_s)) {$opt_s = 'N';}
if (!defined($opt_u)) {$opt_u = 'N';}
if (!defined($opt_b)) {$opt_u = 'N';}
if( ( $#ARGV + 1 ) < 1 ) {
die $usage;
}
my $infile = shift or die $usage;
my $outfile = shift or die $usage;
open OUT, ">$outfile" or die $usage;
open STDERR, ">/dev/null";
# Read in sequences from one or more fasta files
my $Id;
my $nextline;
my $qual;
my @phred_means;
my @phred1scr;
my $numseqs = 0;
my @total;
my @count;
my @min;
my @max;
my @stddev;
my @matrix;
my @Q1, my @Q2;
for(my $j = 0; $j < 70; $j++) {
$total[$j] = 0;
$count[$j] = 0;
$min[$j] = 100;
$max[$j] = 0;
}
for(my $i = 0; $i < 40; $i++) {
for(my $j = 0; $j < 60; $j++) {
$matrix[$i][$j] = 0;
}
}
#automatic-1fasta or fastq detection
open(FASTA, $infile) or die "Can't open file $infile\n";
while (<FASTA>) {
##if Fastq format
if(/^@(.+)\s/) {
$Id = $1;
my $nextline = <FASTA>;
if($nextline =~ /^(\S+)$/) {
my $nextline = <FASTA>;
if($nextline =~ /^\+(.*)\s/) {
my $nextline = <FASTA>;
if($nextline =~ /^(\S+)$/) {
$qual = $1;
$numseqs += 1;
##returns an array with quality raw scores
@phred1scr = fastq_processing($qual);
my $i = 0;
##adds each returned array score to the correct $total[position]
foreach my $score (@phred1scr) {
$score = round($score);
$total[$i] += $score;
$count[$i] += 1;
if($score > $max[$i]) { $max[$i] = $score; }
if($score < $min[$i]) { $min[$i] = $score; }
if($score >= 0 and $score <= 40) { $matrix[$score][$i]++; }
$i++;
}
}
}
}
}
}
close (FASTA);
##calculates the means
for my $j (0..69) {
last if($count[$j] == 0);
my $avg = $total[$j] / $count[$j];
push (@phred_means, $avg);
}
#print Dumper(\@matrix);
##calculate the stddevs and quartiles
#if(@val60) {
# my $total2 = 0;
# foreach my $num (@val60) {
# $total2 += ($phred_means[59]-$num)**2;
# }
# my $mean2 = $total2 / (scalar @val60);
# my $std_dev = sqrt($mean2);
# push(@stddev, $std_dev);
#}
fastq_dist(\@phred_means);
print OUT "\n\nScore\t\tCount\n";
#my @col = get_col(\@matrix, 38);
#print Dumper(\@col);
for my $i (0..$#matrix) {
my @col = map { $_->[$i] } \@matrix;
print Dumper(\@col);
my $score = score_count(@col);
# print OUT "$i\t\t$score\n";
}
###########################################
##### Sub Processing Routines follow ######
###########################################
sub round {
my($number) = shift;
return int($number + .5);
}
sub score_count {
my @arr = shift;
my $total = 0;
foreach my $score (@arr) {
print "@$score ";
$total += @$score;
}
print "$total\n";
return $total;
}
sub fastq_processing {
my $raw_qual = shift;
my $i = 1;
my @phred1seq;
my @bases = split(//, $raw_qual);
#different phred score calculations depending on fastq version
foreach my $byte (@bases) {
if($opt_l eq 'Y') {
my $phred = (10 * log(1 + 10 ** ((ord($byte) - 64) / 10.0)) / lo
+g(10));
push (@phred1seq, $phred);
}
if($opt_s eq 'Y') {
my $phred = ord($byte) - 33;
push (@phred1seq, $phred);
}
if($opt_u eq 'Y') {
my $phred = ord($byte) - 64;
push (@phred1seq, $phred);
}
}
return @phred1seq;
}
sub fastq_dist {
my @means = shift;
my $i;
my $text;
if($opt_l eq 'Y') {
$text = "Solexa/Illumina < 1.3 scoring";
} elsif ($opt_s eq 'Y') {
$text = "Sanger scoring";
} elsif ($opt_u eq 'Y') {
$text = "Illumina >= 1.3 scoring";
}
print OUT "Base\nPos\tCount\tMin\tMax\tStd Dev\tMean Phred\tHistogra
+m - $text\n";
my $lrgstpos = 0;
my $mean_arr = $means[0];
foreach my $mean (@$mean_arr) {
##get's the number of phred values per sequence
if($mean > 0) {
$lrgstpos += 1;
}
}
#loops through the total number of postions and gets the mean for
+each
for my $i (0 .. ($lrgstpos - 1)) {
my $mean = $$mean_arr[$i];
my $pos = $i + 1;
print OUT "$pos\t$count[$i]\t";
# printf OUT "%.2f\t%.2f\t%.2f\t%.2f\t", $min[$i], $max[$i], $std
+dev[$i], $mean;
my $histogram = "|" x $mean;
print OUT $histogram."\n";
}
print OUT "\n";
}
inputfile
@SRR006331.1 201WGAAXX:1:1:782:676 length=36
TTTTAGGTCAAAATCAGTTCTATTAGCAACTCCTAA
+SRR006331.1 201WGAAXX:1:1:782:676 length=36
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@SRR006331.2 201WGAAXX:1:1:821:689 length=36
TCTTAAGTTTTTTGCATATTCATAATTATATAAATC
+SRR006331.2 201WGAAXX:1:1:821:689 length=36
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@SRR006331.3 201WGAAXX:1:1:845:648 length=36
TTTTATTGCCATTTTTGTCAACAAAATGTATTCCAT
+SRR006331.3 201WGAAXX:1:1:845:648 length=36
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@SRR006331.4 201WGAAXX:1:1:863:661 length=36
TATGAAGTTACTAAAAAAATAATTATTGATAAGATT
+SRR006331.4 201WGAAXX:1:1:863:661 length=36
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@SRR006331.5 201WGAAXX:1:1:521:222 length=36
TAATAATAACTAGTTCATCTTTAGCCCTAGTTATAG
+SRR006331.5 201WGAAXX:1:1:521:222 length=36
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@SRR006331.6 201WGAAXX:1:1:774:423 length=36
TTAACTAAATCCTCTTGATTTATAGAGGCTTTTTTT
+SRR006331.6 201WGAAXX:1:1:774:423 length=36
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@SRR006331.7 201WGAAXX:1:1:658:111 length=36
TACTTTATCACCATTAAAATAAAGCTGATTTTCTGA
+SRR006331.7 201WGAAXX:1:1:658:111 length=36
AIIIIIIIIIIIIIIAIIIIIIIIIIIIIIIIIIII
@SRR006331.8 201WGAAXX:1:1:413:670 length=36
TTTATCACATTTATCTAATTCTTCATATTGTGCTTT
+SRR006331.8 201WGAAXX:1:1:413:670 length=36
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
@SRR006369.8 201WGAAXX:1:1:413:670 length=36
TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT
+SRR006369.8 201WGAAXX:1:1:413:670 length=36
IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII
With the input file named test.txt, run the script as
perl fastq_qualquartileBIN.pl -s Y test.txt testing.out