http://www.perlmonks.org?node_id=863885


in reply to Re: how to process very large tab limited pileup file
in thread how to process very large tab limited pileup file

Thanks a lot "graff" your inputs were quite useful. I could be able to write the code. (suggestions for the improvements are welcome) This code take the inputs: 1. raw pileup file and the 2. consensus fasta sequence(here it is named as "input.fasta"). and modifies the consensus fasta sequence based on the nucleotide frequences (if above 5%) at each position (from the read column (column 9). It also includes deletions if they above 5% in indel line (i.e. lines with * in the column 3)
>input.fasta GATCCccagagcgttGAwgcttAgacTCCgagc
Find the complete code below: usage: perl modify.pileup.pl pileup2markedfasta input.pileup
#!/usr/bin/perl use strict; use warnings; &usage if (@ARGV < 1); my $command = shift(@ARGV); my %func = (pileup2markedfasta=>\&pileup2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; exit(0); # pileup2markedfasta sub pileup2fq { die(qq/ Usage: modify.pileup.pl pileup2marked.fasta [options] <in.cns-pileup +> /) if (@ARGV == 0 && -t STDIN); my %hash = ( 1 => 'A', 2 => 'T', 3 => 'G', 4 => 'C', 12 => 'W', 21 => 'W', 13 => 'R', 31 => 'R', 14 => 'M', 41 => 'M', 23 => 'K', 32 => 'K', 24 => 'Y', 42 => 'Y', 34 => 'S', 43 => 'S' ); my $seq; my $ConsFileName = "input.fasta" ; # Input single chromosome consensus + fasta file open(INFILE, "$ConsFileName"); my $header ; while(my $line = <INFILE>) { if ($line =~ m/^\>/){ $header = $line; # Fasta header is removed; otherwise counted } else{ chomp $line; # Remove the "\n" $seq .= $line; } } close(INFILE); print "$header"; print "$seq\n"; my @gps; my $dsize ; my $rseq = ''; my $report = ''; my @t; while (<>) { @t = split; if ($t[7] < 7){substr($seq, ($t[1]-1), 1, "N");} # minimum number ( +7) of reads elsif ($t[2] eq '*') { # checking if it is indel in the raw pile +up if (($t[3] =~ /\*\/\+/)||($t[3] =~ /^\+/)){ #print "next $t[3] \n"; next; } elsif ( ($t[3] =~ /\*\/\*/) && ($t[9] =~ /^\-/) && ($t[11]/$t[7] > +=0.05) ){ my $lenn =(length($t[9])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1]: $lenn, @gps, dash** $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } elsif ( ($t[3] =~ /\*\/\-/) && ($t[11]/$t[7] >=0.05) ){ my $lenn =(length($t[9])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1] dash*- $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } elsif (($t[3] =~ /^\-/) && ($t[10]/$t[7] >=0.05)){ my $lenn =(length($t[8])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1] 'dash-' $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } } else { $dsize = scalar(@gps); if ($dsize >=1){shift @gps; #print "$dsize: $t[1]skippped\n"; next; } else { my $countref = ($t[8] =~ tr/.,//); if ( $countref / $t[7] > 0.95 ){next;} elsif ($countref/$t[7] <=0.95){ $t[8] =~ s/[+-][0-9]+[atgcrykmswbdhvnATGCRYKMSWBDHVN]+//g; + # removes unwanted indels from read base lines $t[8] =~ s/[\@\'\&\!\^\$\*\)]+//g; # removes garbage symbo +ls $t[8] =~ s/[.,]/$t[2]/g; my %ratio; my @letters = qw/a t g c/; $report = " $t[1] : "; for my $p ( @letters ) { #`echo "$t[1]"`; my $count = eval "lc( '$t[8]' ) =~ tr/$p//"; $ratio{$p} = $count / $t[7]; # print "$p: $ratio{$p}\n"; $report .= sprintf( "%s %5.3f ", uc( $p ), $ratio{$p} ); } my $countnut = 0; my @ncode =(); for my $i (0 .. $#letters) { if ( $ratio{ $letters[$i] } >= 0.05 ) { push @ncode, $i + 1; #print "$t[1], @ncode, $letters[$i]\n"; $countnut++; } } my $ccode = join ("", @ncode); chomp $ccode; if ( ! exists( $hash{$ccode} )) { # print "not $t[1], $ccode\n"; substr($seq, ($t[1]-1), 1, "N"); } else { # print "yes $t[1], $ccode\n"; substr($seq, ($t[1]-1), 1, "$hash{$ccode}"); } } } # printf( "Data_line %5d : t1= %5s ; report= %s \n", $., $t[1], $r +eport ); } } # end of while print "$header"; # printing back the fasta header $seq= uc($seq) ; # print "$seq\n"; ; &p2q_post_process(\$seq) ; #print "$header"; # printing back the fasta header } # end of pileup ################## sub p2q_post_process { my ($seq) = @_; print &p2q_print_str($seq); } # sub p2q_print_str { my ($s) = @_; my $l = length($$s); for (my $i = 0; $i < $l; $i += 60) { print substr($$s, $i, 60), "\n"; } } # sub usage { die(qq/ Program: modify.pileup.pl Usage: modify.pileup.pl <command> [<arguments>]\n Command: pileup2markedfasta generate maked.fasta from `pileup -c r +aw pileup' \n/); }
input.pileup
2L 1 G A 48 48 26 7 AAaaAAA ACCCD?@ 2L 2 A A 48 0 26 7 ..,,... AACC@BC 2L 3 T K 18 18 26 7 .Ggg... ACCCCCC 2L 4 C C 57 0 37 16 .,..,,.,a.,,,,,^F, CBCC +BBC@3CCBBD=? 2L 5 C C 54 0 37 17 .,..,,.,a.,,,,,,^F, CDC +CDACB9CCA@B?@D 2L 6 c C 78 0 37 17 .,..,,.,,.,,,,,,, CBCCB +AC>9CBB@BA?A 2L 7 c C 45 0 37 19 .,..,,.,,.+1A,,,,t,,^F,^Ft + CDCCDAC>;CBCBD*?DAA 2L 7 * */* 55 0 37 19 * A 18 1 0 + 0 0 2L 8 a A 162 0 21 45 ,-4gtct,-4gtct,,....,,,,, +,.,.,,,,.,,,,,,,,...,.,.,,,,.,,, CCCCA<;=/CCACDBCBCCCCDCCCCBBCCBDC +CCCCDCDDCCAB 2L 8 * */* 173 0 21 45 * -gtct 43 3 + 0 0 0 2L 9 g G 87 0 37 20 .,..,,.,,.,,,,,,,,,^F, +CDCCCAC?BCBBCD?<DBBB 2L 10 a A 87 0 37 20 .,..,,.,,.,,,,,,,,,, C +BCDABC>BCACBB:AC?4C 2L 11 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,^F. + CDCCCAC@@CDDCC<:DBABC 2L 12 c C 117 0 14 31 ...,,*.,,-2gt.,,.,,.,... +.......,,.. ;B6CCBBCBCACDCCCB@DCDCC8CCCABCC 2L 12 * */-gt 21 21 14 31 * -gt 30 1 + 0 0 0 2L 13 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,. +CDCCCAC?:CDB5?<>BB>BC 2L 14 t T 45 0 37 6 ..,,,.-1T CCCCCC 2L 14 * -t/-t 56 59 37 6 -t * 1 5 +0 0 0 2L 15 t T 93 0 37 22 .,..,,.,,.,,,G,,,,,,.^F. + CDCDDDC<@CBB5B@ABC<ACC 2L 16 G G 178 0 36 50 .$,$,$,$,,,,,.,,,,..,.,, +...,,,,.,,...,....,...,,,$,,,., BCCCCCCCCCCCCBDBCBCCB>8CCCCBCCBDBC +8>ACCBB6CC?CCC6C 2L 17 A A 59 0 36 45 ,$,$,,,C,,,c..,.,,C..,,,, +.,,...,....,..C,,,,,$., CCCCCBCCCBB8CACC>@;CCCCDCCCDAC-5ABCAA2CCCC +?1A 2L 18 w T 54 54 37 9 tTTTTttTT CCDCCDCCC 2L 18 * A/+A 65 279 37 9 A * 8 1 0 + 0 0 2L 19 g G 54 0 37 9 ....,,... >>@@CC>BB 2L 20 c C 57 0 37 10 ....,,...^F, CACCCC?CB +B 2L 21 t T 57 0 37 10 ....,,..., BCDCCCCCB? 2L 22 t T 48 0 34 7 ..,.,,+4caaa,+4caaa C?C +CA=? 2L 22 * */* 9 0 34 7 * +CAAA 5 2 0 + 0 0 2L 23 A A 48 0 34 7 .$.$,.,+1g,, DBC?CCA 2L 23 * */* 19 0 34 7 * +G 6 1 0 + 0 0 2L 24 g G 60 0 14 33 .,...,,.-13GGCGCGCGTGCGC. +,,A,,A,,A,..........aa.. DCBBBBDBCCCCBCCCDCBDCBCCCACCBABCC 2L 24 * */* 61 0 14 33 * -ggcgcgcgtgcgc +32 1 0 0 0 2L 25 a A 163 0 15 49 ..$,..,,,t,.T,,,..,,,,T, +-7attattt,,-7attattt,,,,,,....,,......,.,,.. BBCA>BCCCC:CCCC>ACCCC +BCCCCBCCCCCCDCCCCCCCCBCBCDCC 2L 25 * */-attattt 27 27 15 49 * -attattt + 47 2 0 0 0 2L 26 c C 32 0 0 11 ,,-4tctt,,.-4TCTT...-4TCTT +.^!.^!, CCC8CCCCBCA 2L 26 * */-tctt 17 17 0 11 * -tctt 8 +3 0 0 0 2L 27 T K 18 18 26 7 .Ggg... ACCCCCC 2L 28 C C 57 0 37 16 .,..,,.,a.,,,,,^F, CBC +CBBC@3CCBBD=? 2L 29 C C 54 0 37 17 .,..,,.,a.,,,,,,^F, CD +CCDACB9CCA@B?@D 2L 30 g G 87 0 37 20 .,..,,.,,.,,,,,,,,,^F, + CDCCCAC?BCBBCD?<DBBB 2L 31 a A 87 0 37 20 .,..,,.,,.,,,,,,,,,, C +BCDABC>BCACBB:AC?4C 2L 32 g G 90 0 37 21 .,..,,.,,.,,,,,,,,,,^F. + CDCCCAC@@CDDCC<:DBABC 2L 33 c C 117 0 14 31 ...,,*.,,-2gt.,,.,,.,... +.......,,.. ;B6CCBBCBCACDCCCB@DCDCC8CCCABCC
Output:
>input.fasta GATCCccagagcgttGAwgcttAgacTCCgagc >input.fasta AAKMMCYA----GNTGMTGCTTARWC----AGC