Beefy Boxes and Bandwidth Generously Provided by pair Networks
Clear questions and runnable code
get the best and fastest answer
 
PerlMonks  

davi54's scratchpad

by davi54 (Sexton)
on Feb 21, 2019 at 21:30 UTC ( [id://1230332]=scratchpad: print w/replies, xml ) Need Help??

Test_duplicates.txt
>sp|C1JCT2|POLFS_SINV3 Polyprotein-FSD OS=Solenopsis invicta virus 3 OX=631345 PE=1 SV=2
MSEKTQTFVQNETHVLDMTSDFKSDLSLEKVTSSVEQTDDLVSKIINNNDLDIKDLSFLR
NLLLSTLQYLG

>sp|C1JCT2|POLFS_SINV3 Polyprotein-FSD OS=Solenopsis invicta virus 3 OX=631345 PE=1 SV=2
MSEKTQTFVQNETHVLDMTSDFKSDLSLEKVTSSVEQTDDLVSKIINNNDLDIKDLSFLR
NLLLSTLQYLG


poj_duplicate.pl
#!/usr/bin/perl
#
# poj_duplicate.pl
#
# Reads a FASTA-style protein file whose records are separated by blank
# lines, skips records whose sequence was already seen, and for each unique
# record counts how many times each uppercase letter (residue code) occurs.
#
# Output:
#   * sdAb_report.txt - per-record letter counts, the size of each record's
#     letter alphabet, the record(s) with the smallest alphabet, and all
#     records sorted by alphabet size (ascending).
#   * STDOUT          - duplicate notices, and for every letter A..Z the
#     number of unique records in which that letter never appears.
#
# Input: the file name is read interactively from STDIN.
use strict;
use warnings;

my $report_name = 'sdAb_report.txt';
open my $out_file, '>', $report_name
    or die "Cannot open '$report_name' because: $!";

print 'PLEASE ENTER THE FILENAME OF THE PROTEIN SEQUENCE: ';
my $prot_filename = <STDIN>;
die "No filename was entered\n" unless defined $prot_filename;
chomp $prot_filename;

open my $PROTFILE, '<', $prot_filename
    or die "Cannot open '$prot_filename' because: $!";

my @count;         # one [ alphabet_size, header ] pair per unique record
my %absent;        # letter => number of unique records lacking that letter
my %fasta_seen;    # whitespace-free sequence => times seen so far

{
    # Paragraph mode: each read returns one blank-line-delimited record.
    # Localised so the change cannot leak into any later line-based reads.
    local $/ = '';

    FASTA_RECORD:
    while ( my $para = <$PROTFILE> ) {

        # Remove the fasta header line, remembering its text. The name is
        # reset for every record so that a header-less paragraph is never
        # reported under the previous record's name.
        my $name = $para =~ s/^>(.*)//m ? $1 : '(no header)';

        # Remove comment line(s)
        $para =~ s/^\s*#.*//mg;

        # Compare records on the residues only. The raw paragraph carries
        # its trailing newlines, and the last record of a file usually ends
        # differently from the others, so comparing raw text would miss a
        # duplicate that happens to be the final record.
        ( my $sequence = $para ) =~ s/\s+//g;

        # A paragraph holding only a header and/or comments has nothing to
        # count; reporting it would yield a misleading alphabet size of 0.
        next FASTA_RECORD if $sequence eq '';

        if ( $fasta_seen{$sequence}++ ) {
            print "DUPLICATE : $name \n $para\n";
            next FASTA_RECORD;
        }

        # Tally every uppercase letter. Note the character class [A-Z]:
        # a pattern of (A-Z) would only match the literal text "A-Z".
        my %prot;
        ++$prot{$_} for $sequence =~ /[A-Z]/g;

        my $num = scalar keys %prot;

        # Store a pair (array ref) so the sort/grep below can use ->[0]
        # for the count and ->[1] for the name.
        push @count, [ $num, $name ];

        print {$out_file} "$name\n";
        print {$out_file}
            join( ' ', map { "$_=$prot{$_}" } sort keys %prot ), "\n";
        printf {$out_file} "Amino acid alphabet = %d\n\n", $num;

        # count absent
        for my $letter ( 'A' .. 'Z' ) {
            ++$absent{$letter} unless exists $prot{$letter};
        }
    }
}

close $PROTFILE;

# sort names by count in ascending order to get lowest
my @sorted = sort { $a->[0] <=> $b->[0] } @count;

if (@sorted) {
    my $lowest = $sorted[0][0];

    # maybe more than 1 lowest
    printf {$out_file} "Least number of proteins is %d in these entries\n",
        $lowest;
    my @lowest = grep { $_->[0] == $lowest } @sorted;
    print {$out_file} "$_->[1]\n" for @lowest;

    # show all results
    print {$out_file} "\nAll results in ascending count\n";
    printf {$out_file} "%d %s\n", @{$_} for @sorted;
}
else {
    # Nothing was read: avoid dereferencing an undefined first element.
    print {$out_file} "No sequences found in '$prot_filename'\n";
}

# Buffered write errors only surface at close, so check it.
close $out_file
    or die "Cannot close '$report_name' because: $!";

print "\nResults are printed in $report_name\n";

# print absent counts
print "\nNon-incorporation of various amino acids in $prot_filename is as follows\n";
for my $letter ( sort keys %absent ) {
    printf "%s=%d\n", $letter, $absent{$letter};
}
Log In?
Username:
Password:

What's my password?
Create A New User
Domain Nodelet?
Chatterbox?
and the web crawler heard nothing...

How do I use this? | Last hour | Other CB clients
Other Users?
Others sharing their wisdom with the Monastery: (2)
As of 2024-04-26 00:37 GMT
Sections?
Information?
Find Nodes?
Leftovers?
    Voting Booth?

    No recent polls found