#!/usr/bin/perl5.8.8 use strict; use warnings; my $familyName; #stores the family name open (my $OUTFILE,">","FamDATA") || die $!; #opens outfile that will contain the protein families and all their associated sequences with an e-value <= 0.001 print $OUTFILE "family\ttarget,e-value,bit-score"; #header for the outfile tempProteinFamFileCreator(); #creates a temp file with protein family sends it to MUSCLE and HMMER, and then sends table output to sub TableParse to extract necessary data (this includes the protein families and all their associated sequences with an e-value <= 0, the actual e-value, and the bit-score) close ($OUTFILE); #closes the outfile #creates a temp file with protein family sends it to MUSCLE and HMMER, and then sends table output to sub TableParse to extract necessary data (this includes the protein families and all their associated sequences with an e-value <= 0, the actual e-value, and the bit-score sub tempProteinFamFileCreator { my $infile = $ARGV[0]."_ProFam"; #captures the name of the infiles (Protein Family Files) as determined by the STNDIN open (my $INFILE,"<", $infile) || die $!; #opens the infile (Protein Family Files) open (my $TEMP,">",'temp') || die $!; #opens the temp file which will store a single protein family at a time. while(<$INFILE>) #entering the infile { my $line = $_; #sets the default iterator to a value if ($line =~/^##(UniRef90_[\w\d]+) Protein Family/) { close ($TEMP); #closes temp file MuscleHMMERsearch(); #enters subroutine MuscleHMMERsearch open ($TEMP,">",'temp'); #opens temp file seek ($TEMP, 0 ,0); #used to overwrite the temp file each time, by returning to the beginning of the file $familyName = $1; #storing the protein family name } if ($line =~/^>UniRef90_[\w\d]+\.UniRef100_[\w\d]+/ || $line =~/^[\w\d]+/) { chomp $_; #removes any remaining white space from around the default iterator print $TEMP "$_\n"; #prints to temp file } if ($line =~/^>File/) { close ($TEMP); #closes temp file MuscleHMMERsearch(); #enters subroutine MuscleHMMERsearch } } #end of while loop close ($INFILE); #closes the $INFILE handle } #end of subroutine tempProteinFamFileCreator #temp file (single protein family) is sent to MUSCLE, hmmbuild, hmmsearch and creates a table. sub MuscleHMMERsearch { open (my $TEMP,"<",'temp') || die $!; #opens temp file (single protein family) if (-s "/home/vcg/Documents/Trial/temp") #checks to see if file is not empty { # MUSCLE creating a multiple alignment file my $Fprot = "temp"; my $MAF = "MuscleAlignmentFile"; my $cmd = "/home/vcg/Documents/Trial/muscle -in $Fprot -out $MAF"; print $cmd. "\n"; system ($cmd); if ($?) {die "command $cmd failed\n"}; # HMMER creating a profile HMM from a multiple alignment file my $HMMbf = "HMMbuildfile"; $cmd = "/home/vcg/Documents/Trial/hmmbuild --informat afa $HMMbf $MAF"; print $cmd. "\n"; system ($cmd); if ($?) {die "command $cmd failed\n"}; $cmd = "rm /home/vcg/Documents/Trial/$MAF"; system ($cmd); # Searching the sequence database with hmmsearch my $results = "HMMsearch_results"; $cmd = "/home/vcg/Documents/Trial/hmmsearch --F1 .002 --F2 .001 --F3 .00001 --tblout HMMtable.tbl --cpu 1 -E .001 $HMMbf uniref100.fasta >> $results"; print $cmd. "\n"; system ($cmd); if ($?) {die "command $cmd failed\n"}; $cmd = "rm /home/vcg/Documents/Trial/$HMMbf"; system ($cmd); TableParser(); #enters subroutine TableParser } } #end of subroutine MuscleHMMERsearch #table output created in sub MuscleHMMERsearch, is parsed, and the necessary data is extract and sent/appended to a resulting file. This includes the protein families and all their associated sequences with an e-value <= 0, the actual e-value, and the bit-score. sub TableParser { open (my $TABLE,"<","HMMtable.tbl") || die $!; #opens table ouput created in sub MuscleHMMERsearch print $OUTFILE "\n$familyName\t"; #prints the protein family name while(<$TABLE>) #space delimited file\ #enters the table file { if ($_=~/^#/){next;} #bypasses headers my @data = split(" ", $_); #the table file is space delimited. All the data is divided by a space and stored in an array. if ($_=~/^UniRef100/) { my $TargetName = $data[0]; #stores target name/ sequences with e-value <= 0.001 my $E_value = $data[4]; #stores the e-value of that sequence my $BitScore = $data[5]; #stores the bit-score of that sequence print $OUTFILE "$TargetName,$E_value,$BitScore\t"; #prints all this data to a resulting file } } close ($TABLE); #closes $TABLE filehandle } # end of subroutine TableParser