#!/usr/bin/perl use strict; use warnings; &usage if (@ARGV < 1); my $command = shift(@ARGV); my %func = (pileup2markedfasta=>\&pileup2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; exit(0); # pileup2markedfasta sub pileup2fq { die(qq/ Usage: modify.pileup.pl pileup2marked.fasta [options] /) if (@ARGV == 0 && -t STDIN); my %hash = ( 1 => 'A', 2 => 'T', 3 => 'G', 4 => 'C', 12 => 'W', 21 => 'W', 13 => 'R', 31 => 'R', 14 => 'M', 41 => 'M', 23 => 'K', 32 => 'K', 24 => 'Y', 42 => 'Y', 34 => 'S', 43 => 'S' ); my $seq; my $ConsFileName = "input.fasta" ; # Input single chromosome consensus fasta file open(INFILE, "$ConsFileName"); my $header ; while(my $line = ) { if ($line =~ m/^\>/){ $header = $line; # Fasta header is removed; otherwise counted } else{ chomp $line; # Remove the "\n" $seq .= $line; } } close(INFILE); print "$header"; print "$seq\n"; my @gps; my $dsize ; my $rseq = ''; my $report = ''; my @t; while (<>) { @t = split; if ($t[7] < 7){substr($seq, ($t[1]-1), 1, "N");} # minimum number (7) of reads elsif ($t[2] eq '*') { # checking if it is indel in the raw pileup if (($t[3] =~ /\*\/\+/)||($t[3] =~ /^\+/)){ #print "next $t[3] \n"; next; } elsif ( ($t[3] =~ /\*\/\*/) && ($t[9] =~ /^\-/) && ($t[11]/$t[7] >=0.05) ){ my $lenn =(length($t[9])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1]: $lenn, @gps, dash** $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } elsif ( ($t[3] =~ /\*\/\-/) && ($t[11]/$t[7] >=0.05) ){ my $lenn =(length($t[9])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1] dash*- $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } elsif (($t[3] =~ /^\-/) && ($t[10]/$t[7] >=0.05)){ my $lenn =(length($t[8])-1); for (1..$lenn){ push (@gps, '-') } my $ndash .= '-' x ($lenn); # print "$t[1] 'dash-' $ndash\n"; substr($seq, $t[1], $lenn, "$ndash"); } } else { $dsize = scalar(@gps); if ($dsize >=1){shift @gps; #print "$dsize: $t[1]skippped\n"; next; } else { my $countref = ($t[8] =~ tr/.,//); if ( $countref / $t[7] > 0.95 ){next;} elsif ($countref/$t[7] <=0.95){ $t[8] =~ s/[+-][0-9]+[atgcrykmswbdhvnATGCRYKMSWBDHVN]+//g; # removes unwanted indels from read base lines $t[8] =~ s/[\@\'\&\!\^\$\*\)]+//g; # removes garbage symbols $t[8] =~ s/[.,]/$t[2]/g; my %ratio; my @letters = qw/a t g c/; $report = " $t[1] : "; for my $p ( @letters ) { #`echo "$t[1]"`; my $count = eval "lc( '$t[8]' ) =~ tr/$p//"; $ratio{$p} = $count / $t[7]; # print "$p: $ratio{$p}\n"; $report .= sprintf( "%s %5.3f ", uc( $p ), $ratio{$p} ); } my $countnut = 0; my @ncode =(); for my $i (0 .. $#letters) { if ( $ratio{ $letters[$i] } >= 0.05 ) { push @ncode, $i + 1; #print "$t[1], @ncode, $letters[$i]\n"; $countnut++; } } my $ccode = join ("", @ncode); chomp $ccode; if ( ! exists( $hash{$ccode} )) { # print "not $t[1], $ccode\n"; substr($seq, ($t[1]-1), 1, "N"); } else { # print "yes $t[1], $ccode\n"; substr($seq, ($t[1]-1), 1, "$hash{$ccode}"); } } } # printf( "Data_line %5d : t1= %5s ; report= %s \n", $., $t[1], $report ); } } # end of while print "$header"; # printing back the fasta header $seq= uc($seq) ; # print "$seq\n"; ; &p2q_post_process(\$seq) ; #print "$header"; # printing back the fasta header } # end of pileup ################## sub p2q_post_process { my ($seq) = @_; print &p2q_print_str($seq); } # sub p2q_print_str { my ($s) = @_; my $l = length($$s); for (my $i = 0; $i < $l; $i += 60) { print substr($$s, $i, 60), "\n"; } } # sub usage { die(qq/ Program: modify.pileup.pl Usage: modify.pileup.pl []\n Command: pileup2markedfasta generate maked.fasta from `pileup -c raw pileup' \n/); }