Beefy Boxes and Bandwidth Generously Provided by pair Networks
The stupid question is the question not asked
 
PerlMonks  

Re: how to process very large tab limited pileup file

by umasuresh (Hermit)
on Oct 02, 2010 at 13:59 UTC ( #863077=note: print w/ replies, xml ) Need Help??


in reply to how to process very large tab limited pileup file

Can you post the desired output format? This might help in better understanding the problem.


Comment on Re: how to process very large tab limited pileup file
Re^2: how to process very large tab limited pileup file
by gudluck (Novice) on Oct 06, 2010 at 23:20 UTC
    Thanks a lot "graff" your inputs were quite useful. I could be able to write the code. (suggestions for the improvements are welcome) This code take the inputs: 1. raw pileup file and the 2. consensus fasta sequence(here it is named as "input.fasta"). and modifies the consensus fasta sequence based on the nucleotide frequences (if above 5%) at each position (from the read column (column 9). It also includes deletions if they above 5% in indel line (i.e. lines with * in the column 3)
    >input.fasta GATCCccagagcgttGAwgcttAgacTCCgagc
    Find the complete code below: usage: perl modify.pileup.pl pileup2markedfasta input.pileup
    #!/usr/bin/perl use strict; use warnings; &usage if (@ARGV < 1); my $command = shift(@ARGV); my %func = (pileup2markedfasta=>\&pileup2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; exit(0); # pileup2markedfasta sub pileup2fq { die(qq/ Usage: modify.pileup.pl pileup2marked.fasta [options] <in.cns-pileup +> /) if (@ARGV == 0 && -t STDIN); my %hash = ( 1 => 'A', 2 => 'T', 3 => 'G', 4 => 'C', 12 => 'W', 21 => 'W', 13 => 'R', 31 => 'R', 14 => 'M', 41 => 'M', 23 => 'K', 32 => 'K', 24 => 'Y', 42 => 'Y', 34 => 'S', 43 => 'S' ); my $seq; my $ConsFileName = "input.fasta" ; # Input single chromosome consensus + fasta file open(INFILE, "$ConsFileName"); my $header ; while(my $line = <INFILE>) { if ($line =~ m/^\>/){ $header = $line; # Fasta header is removed; otherwise counted } else{ chomp $line; # Remove the "\n" $seq .= $line; } } close(INFILE); print "$header"; print "$seq\n"; my @gps; my $dsize ; my $rseq = ''; my $report = ''; my @t; while (<>) { @t = split; if ($t[7] < 7){substr($seq, ($t[1]-1), 1, "N");} # minimum number ( +7) of reads elsif ($t[2] eq '*') { # checking if it is indel in the raw pile +up if (($t[3] =~ /\*\/\+/)||($t[3] =~ /^\+/)){ #print "next $t[3] \n"; next; } elsif ( ($t[3] =~ /\*\/\*/) && ($t[9] =~ /^\-/) && ($t[11]/$t[7] > +=0.05) ){ my $lenn =(length($t[9])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1]: $lenn, @gps, dash** $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } elsif ( ($t[3] =~ /\*\/\-/) && ($t[11]/$t[7] >=0.05) ){ my $lenn =(length($t[9])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1] dash*- $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } elsif (($t[3] =~ /^\-/) && ($t[10]/$t[7] >=0.05)){ my $lenn =(length($t[8])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1] 'dash-' $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } } else { $dsize = scalar(@gps); if ($dsize >=1){shift @gps; #print "$dsize: $t[1]skippped\n"; next; } else { my $countref = ($t[8] =~ tr/.,//); if ( $countref / $t[7] > 0.95 ){next;} elsif ($countref/$t[7] <=0.95){ $t[8] =~ s/[+-][0-9]+[atgcrykmswbdhvnATGCRYKMSWBDHVN]+//g; + # removes unwanted indels from read base lines $t[8] =~ s/[\@\'\&\!\^\$\*\)]+//g; # removes garbage symbo +ls $t[8] =~ s/[.,]/$t[2]/g; my %ratio; my @letters = qw/a t g c/; $report = " $t[1] : "; for my $p ( @letters ) { #`echo "$t[1]"`; my $count = eval "lc( '$t[8]' ) =~ tr/$p//"; $ratio{$p} = $count / $t[7]; # print "$p: $ratio{$p}\n"; $report .= sprintf( "%s %5.3f ", uc( $p ), $ratio{$p} ); } my $countnut = 0; my @ncode =(); for my $i (0 .. $#letters) { if ( $ratio{ $letters[$i] } >= 0.05 ) { push @ncode, $i + 1; #print "$t[1], @ncode, $letters[$i]\n"; $countnut++; } } my $ccode = join ("", @ncode); chomp $ccode; if ( ! exists( $hash{$ccode} )) { # print "not $t[1], $ccode\n"; substr($seq, ($t[1]-1), 1, "N"); } else { # print "yes $t[1], $ccode\n"; substr($seq, ($t[1]-1), 1, "$hash{$ccode}"); } } } # printf( "Data_line %5d : t1= %5s ; report= %s \n", $., $t[1], $r +eport ); } } # end of while print "$header"; # printing back the fasta header $seq= uc($seq) ; # print "$seq\n"; ; &p2q_post_process(\$seq) ; #print "$header"; # printing back the fasta header } # end of pileup ################## sub p2q_post_process { my ($seq) = @_; print &p2q_print_str($seq); } # sub p2q_print_str { my ($s) = @_; my $l = length($$s); for (my $i = 0; $i < $l; $i += 60) { print substr($$s, $i, 60), "\n"; } } # sub usage { die(qq/ Program: modify.pileup.pl Usage: modify.pileup.pl <command> [<arguments>]\n Command: pileup2markedfasta generate maked.fasta from `pileup -c r +aw pileup' \n/); }
    input.pileup
    2L 1 G A 48 48 26 7 AAaaAAA ACCCD?@ 2L 2 A A 48 0 26 7 ..,,... AACC@BC 2L 3 T K 18 18 26 7 .Ggg... ACCCCCC 2L 4 C C 57 0 37 16 .,..,,.,a.,,,,,^F, CBCC +BBC@3CCBBD=? 2L 5 C C 54 0 37 17 .,..,,.,a.,,,,,,^F, CDC +CDACB9CCA@B?@D 2L 6 c C 78 0 37 17 .,..,,.,,.,,,,,,, CBCCB +AC>9CBB@BA?A 2L 7 c C 45 0 37 19 .,..,,.,,.+1A,,,,t,,^F,^Ft + CDCCDAC>;CBCBD*?DAA 2L 7 * */* 55 0 37 19 * A 18 1 0 + 0 0 2L 8 a A 162 0 21 45 ,-4gtct,-4gtct,,....,,,,, +,.,.,,,,.,,,,,,,,...,.,.,,,,.,,, CCCCA<;=/CCACDBCBCCCCDCCCCBBCCBDC +CCCCDCDDCCAB 2L 8 * */* 173 0 21 45 * -gtct 43 3 + 0 0 0 2L 9 g G 87 0 37 20 .,..,,.,,.,,,,,,,,,^F, +CDCCCAC?BCBBCD?<DBBB 2L 10 a A 87 0 37 20 .,..,,.,,.,,,,,,,,,, C +BCDABC>BCACBB:AC?4C 2L 11 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,^F. + CDCCCAC@@CDDCC<:DBABC 2L 12 c C 117 0 14 31 ...,,*.,,-2gt.,,.,,.,... +.......,,.. ;B6CCBBCBCACDCCCB@DCDCC8CCCABCC 2L 12 * */-gt 21 21 14 31 * -gt 30 1 + 0 0 0 2L 13 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,. +CDCCCAC?:CDB5?<>BB>BC 2L 14 t T 45 0 37 6 ..,,,.-1T CCCCCC 2L 14 * -t/-t 56 59 37 6 -t * 1 5 +0 0 0 2L 15 t T 93 0 37 22 .,..,,.,,.,,,G,,,,,,.^F. + CDCDDDC<@CBB5B@ABC<ACC 2L 16 G G 178 0 36 50 .$,$,$,$,,,,,.,,,,..,.,, +...,,,,.,,...,....,...,,,$,,,., BCCCCCCCCCCCCBDBCBCCB>8CCCCBCCBDBC +8>ACCBB6CC?CCC6C 2L 17 A A 59 0 36 45 ,$,$,,,C,,,c..,.,,C..,,,, +.,,...,....,..C,,,,,$., CCCCCBCCCBB8CACC>@;CCCCDCCCDAC-5ABCAA2CCCC +?1A 2L 18 w T 54 54 37 9 tTTTTttTT CCDCCDCCC 2L 18 * A/+A 65 279 37 9 A * 8 1 0 + 0 0 2L 19 g G 54 0 37 9 ....,,... >>@@CC>BB 2L 20 c C 57 0 37 10 ....,,...^F, CACCCC?CB +B 2L 21 t T 57 0 37 10 ....,,..., BCDCCCCCB? 2L 22 t T 48 0 34 7 ..,.,,+4caaa,+4caaa C?C +CA=? 2L 22 * */* 9 0 34 7 * +CAAA 5 2 0 + 0 0 2L 23 A A 48 0 34 7 .$.$,.,+1g,, DBC?CCA 2L 23 * */* 19 0 34 7 * +G 6 1 0 + 0 0 2L 24 g G 60 0 14 33 .,...,,.-13GGCGCGCGTGCGC. +,,A,,A,,A,..........aa.. DCBBBBDBCCCCBCCCDCBDCBCCCACCBABCC 2L 24 * */* 61 0 14 33 * -ggcgcgcgtgcgc +32 1 0 0 0 2L 25 a A 163 0 15 49 ..$,..,,,t,.T,,,..,,,,T, +-7attattt,,-7attattt,,,,,,....,,......,.,,.. BBCA>BCCCC:CCCC>ACCCC +BCCCCBCCCCCCDCCCCCCCCBCBCDCC 2L 25 * */-attattt 27 27 15 49 * -attattt + 47 2 0 0 0 2L 26 c C 32 0 0 11 ,,-4tctt,,.-4TCTT...-4TCTT +.^!.^!, CCC8CCCCBCA 2L 26 * */-tctt 17 17 0 11 * -tctt 8 +3 0 0 0 2L 27 T K 18 18 26 7 .Ggg... ACCCCCC 2L 28 C C 57 0 37 16 .,..,,.,a.,,,,,^F, CBC +CBBC@3CCBBD=? 2L 29 C C 54 0 37 17 .,..,,.,a.,,,,,,^F, CD +CCDACB9CCA@B?@D 2L 30 g G 87 0 37 20 .,..,,.,,.,,,,,,,,,^F, + CDCCCAC?BCBBCD?<DBBB 2L 31 a A 87 0 37 20 .,..,,.,,.,,,,,,,,,, C +BCDABC>BCACBB:AC?4C 2L 32 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,^F. + CDCCCAC@@CDDCC<:DBABC 2L 33 c C 117 0 14 31 ...,,*.,,-2gt.,,.,,.,... +.......,,.. ;B6CCBBCBCACDCCCB@DCDCC8CCCABCC
    Output:
    >input.fasta GATCCccagagcgttGAwgcttAgacTCCgagc >input.fasta AAKMMCYA----GNTGMTGCTTARWC----AGC

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://863077]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others making s'mores by the fire in the courtyard of the Monastery: (11)
As of 2014-09-18 23:52 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    How do you remember the number of days in each month?











    Results (128 votes), past polls