Beefy Boxes and Bandwidth Generously Provided by pair Networks
"be consistent"
 
PerlMonks  

Re^2: how to process very large tab limited pileup file

by gudluck (Novice)
on Oct 06, 2010 at 23:20 UTC ( #863885=note: print w/ replies, xml ) Need Help??


in reply to Re: how to process very large tab limited pileup file
in thread how to process very large tab limited pileup file

Thanks a lot "graff" your inputs were quite useful. I could be able to write the code. (suggestions for the improvements are welcome) This code take the inputs: 1. raw pileup file and the 2. consensus fasta sequence(here it is named as "input.fasta"). and modifies the consensus fasta sequence based on the nucleotide frequences (if above 5%) at each position (from the read column (column 9). It also includes deletions if they above 5% in indel line (i.e. lines with * in the column 3)

>input.fasta GATCCccagagcgttGAwgcttAgacTCCgagc
Find the complete code below: usage: perl modify.pileup.pl pileup2markedfasta input.pileup
#!/usr/bin/perl use strict; use warnings; &usage if (@ARGV < 1); my $command = shift(@ARGV); my %func = (pileup2markedfasta=>\&pileup2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; exit(0); # pileup2markedfasta sub pileup2fq { die(qq/ Usage: modify.pileup.pl pileup2marked.fasta [options] <in.cns-pileup +> /) if (@ARGV == 0 && -t STDIN); my %hash = ( 1 => 'A', 2 => 'T', 3 => 'G', 4 => 'C', 12 => 'W', 21 => 'W', 13 => 'R', 31 => 'R', 14 => 'M', 41 => 'M', 23 => 'K', 32 => 'K', 24 => 'Y', 42 => 'Y', 34 => 'S', 43 => 'S' ); my $seq; my $ConsFileName = "input.fasta" ; # Input single chromosome consensus + fasta file open(INFILE, "$ConsFileName"); my $header ; while(my $line = <INFILE>) { if ($line =~ m/^\>/){ $header = $line; # Fasta header is removed; otherwise counted } else{ chomp $line; # Remove the "\n" $seq .= $line; } } close(INFILE); print "$header"; print "$seq\n"; my @gps; my $dsize ; my $rseq = ''; my $report = ''; my @t; while (<>) { @t = split; if ($t[7] < 7){substr($seq, ($t[1]-1), 1, "N");} # minimum number ( +7) of reads elsif ($t[2] eq '*') { # checking if it is indel in the raw pile +up if (($t[3] =~ /\*\/\+/)||($t[3] =~ /^\+/)){ #print "next $t[3] \n"; next; } elsif ( ($t[3] =~ /\*\/\*/) && ($t[9] =~ /^\-/) && ($t[11]/$t[7] > +=0.05) ){ my $lenn =(length($t[9])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1]: $lenn, @gps, dash** $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } elsif ( ($t[3] =~ /\*\/\-/) && ($t[11]/$t[7] >=0.05) ){ my $lenn =(length($t[9])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1] dash*- $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } elsif (($t[3] =~ /^\-/) && ($t[10]/$t[7] >=0.05)){ my $lenn =(length($t[8])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1] 'dash-' $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } } else { $dsize = scalar(@gps); if ($dsize >=1){shift @gps; #print "$dsize: $t[1]skippped\n"; next; } else { my $countref = ($t[8] =~ tr/.,//); if ( $countref / $t[7] > 0.95 ){next;} elsif ($countref/$t[7] <=0.95){ $t[8] =~ s/[+-][0-9]+[atgcrykmswbdhvnATGCRYKMSWBDHVN]+//g; + # removes unwanted indels from read base lines $t[8] =~ s/[\@\'\&\!\^\$\*\)]+//g; # removes garbage symbo +ls $t[8] =~ s/[.,]/$t[2]/g; my %ratio; my @letters = qw/a t g c/; $report = " $t[1] : "; for my $p ( @letters ) { #`echo "$t[1]"`; my $count = eval "lc( '$t[8]' ) =~ tr/$p//"; $ratio{$p} = $count / $t[7]; # print "$p: $ratio{$p}\n"; $report .= sprintf( "%s %5.3f ", uc( $p ), $ratio{$p} ); } my $countnut = 0; my @ncode =(); for my $i (0 .. $#letters) { if ( $ratio{ $letters[$i] } >= 0.05 ) { push @ncode, $i + 1; #print "$t[1], @ncode, $letters[$i]\n"; $countnut++; } } my $ccode = join ("", @ncode); chomp $ccode; if ( ! exists( $hash{$ccode} )) { # print "not $t[1], $ccode\n"; substr($seq, ($t[1]-1), 1, "N"); } else { # print "yes $t[1], $ccode\n"; substr($seq, ($t[1]-1), 1, "$hash{$ccode}"); } } } # printf( "Data_line %5d : t1= %5s ; report= %s \n", $., $t[1], $r +eport ); } } # end of while print "$header"; # printing back the fasta header $seq= uc($seq) ; # print "$seq\n"; ; &p2q_post_process(\$seq) ; #print "$header"; # printing back the fasta header } # end of pileup ################## sub p2q_post_process { my ($seq) = @_; print &p2q_print_str($seq); } # sub p2q_print_str { my ($s) = @_; my $l = length($$s); for (my $i = 0; $i < $l; $i += 60) { print substr($$s, $i, 60), "\n"; } } # sub usage { die(qq/ Program: modify.pileup.pl Usage: modify.pileup.pl <command> [<arguments>]\n Command: pileup2markedfasta generate maked.fasta from `pileup -c r +aw pileup' \n/); }
input.pileup
2L 1 G A 48 48 26 7 AAaaAAA ACCCD?@ 2L 2 A A 48 0 26 7 ..,,... AACC@BC 2L 3 T K 18 18 26 7 .Ggg... ACCCCCC 2L 4 C C 57 0 37 16 .,..,,.,a.,,,,,^F, CBCC +BBC@3CCBBD=? 2L 5 C C 54 0 37 17 .,..,,.,a.,,,,,,^F, CDC +CDACB9CCA@B?@D 2L 6 c C 78 0 37 17 .,..,,.,,.,,,,,,, CBCCB +AC>9CBB@BA?A 2L 7 c C 45 0 37 19 .,..,,.,,.+1A,,,,t,,^F,^Ft + CDCCDAC>;CBCBD*?DAA 2L 7 * */* 55 0 37 19 * A 18 1 0 + 0 0 2L 8 a A 162 0 21 45 ,-4gtct,-4gtct,,....,,,,, +,.,.,,,,.,,,,,,,,...,.,.,,,,.,,, CCCCA<;=/CCACDBCBCCCCDCCCCBBCCBDC +CCCCDCDDCCAB 2L 8 * */* 173 0 21 45 * -gtct 43 3 + 0 0 0 2L 9 g G 87 0 37 20 .,..,,.,,.,,,,,,,,,^F, +CDCCCAC?BCBBCD?<DBBB 2L 10 a A 87 0 37 20 .,..,,.,,.,,,,,,,,,, C +BCDABC>BCACBB:AC?4C 2L 11 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,^F. + CDCCCAC@@CDDCC<:DBABC 2L 12 c C 117 0 14 31 ...,,*.,,-2gt.,,.,,.,... +.......,,.. ;B6CCBBCBCACDCCCB@DCDCC8CCCABCC 2L 12 * */-gt 21 21 14 31 * -gt 30 1 + 0 0 0 2L 13 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,. +CDCCCAC?:CDB5?<>BB>BC 2L 14 t T 45 0 37 6 ..,,,.-1T CCCCCC 2L 14 * -t/-t 56 59 37 6 -t * 1 5 +0 0 0 2L 15 t T 93 0 37 22 .,..,,.,,.,,,G,,,,,,.^F. + CDCDDDC<@CBB5B@ABC<ACC 2L 16 G G 178 0 36 50 .$,$,$,$,,,,,.,,,,..,.,, +...,,,,.,,...,....,...,,,$,,,., BCCCCCCCCCCCCBDBCBCCB>8CCCCBCCBDBC +8>ACCBB6CC?CCC6C 2L 17 A A 59 0 36 45 ,$,$,,,C,,,c..,.,,C..,,,, +.,,...,....,..C,,,,,$., CCCCCBCCCBB8CACC>@;CCCCDCCCDAC-5ABCAA2CCCC +?1A 2L 18 w T 54 54 37 9 tTTTTttTT CCDCCDCCC 2L 18 * A/+A 65 279 37 9 A * 8 1 0 + 0 0 2L 19 g G 54 0 37 9 ....,,... >>@@CC>BB 2L 20 c C 57 0 37 10 ....,,...^F, CACCCC?CB +B 2L 21 t T 57 0 37 10 ....,,..., BCDCCCCCB? 2L 22 t T 48 0 34 7 ..,.,,+4caaa,+4caaa C?C +CA=? 2L 22 * */* 9 0 34 7 * +CAAA 5 2 0 + 0 0 2L 23 A A 48 0 34 7 .$.$,.,+1g,, DBC?CCA 2L 23 * */* 19 0 34 7 * +G 6 1 0 + 0 0 2L 24 g G 60 0 14 33 .,...,,.-13GGCGCGCGTGCGC. +,,A,,A,,A,..........aa.. DCBBBBDBCCCCBCCCDCBDCBCCCACCBABCC 2L 24 * */* 61 0 14 33 * -ggcgcgcgtgcgc +32 1 0 0 0 2L 25 a A 163 0 15 49 ..$,..,,,t,.T,,,..,,,,T, +-7attattt,,-7attattt,,,,,,....,,......,.,,.. BBCA>BCCCC:CCCC>ACCCC +BCCCCBCCCCCCDCCCCCCCCBCBCDCC 2L 25 * */-attattt 27 27 15 49 * -attattt + 47 2 0 0 0 2L 26 c C 32 0 0 11 ,,-4tctt,,.-4TCTT...-4TCTT +.^!.^!, CCC8CCCCBCA 2L 26 * */-tctt 17 17 0 11 * -tctt 8 +3 0 0 0 2L 27 T K 18 18 26 7 .Ggg... ACCCCCC 2L 28 C C 57 0 37 16 .,..,,.,a.,,,,,^F, CBC +CBBC@3CCBBD=? 2L 29 C C 54 0 37 17 .,..,,.,a.,,,,,,^F, CD +CCDACB9CCA@B?@D 2L 30 g G 87 0 37 20 .,..,,.,,.,,,,,,,,,^F, + CDCCCAC?BCBBCD?<DBBB 2L 31 a A 87 0 37 20 .,..,,.,,.,,,,,,,,,, C +BCDABC>BCACBB:AC?4C 2L 32 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,^F. + CDCCCAC@@CDDCC<:DBABC 2L 33 c C 117 0 14 31 ...,,*.,,-2gt.,,.,,.,... +.......,,.. ;B6CCBBCBCACDCCCB@DCDCC8CCCABCC
Output:
>input.fasta GATCCccagagcgttGAwgcttAgacTCCgagc >input.fasta AAKMMCYA----GNTGMTGCTTARWC----AGC


Comment on Re^2: how to process very large tab limited pileup file
Select or Download Code

Log In?
Username:
Password:

What's my password?
Create A New User
Node Status?
node history
Node Type: note [id://863885]
help
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Other CB clients
Other Users?
Others meditating upon the Monastery: (12)
As of 2015-07-29 22:46 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    The top three priorities of my open tasks are (in descending order of likelihood to be worked on) ...









    Results (269 votes), past polls