<?xml version="1.0" encoding="windows-1252"?>
<node id="1006367" title="get full hit description from blast output (xml)" created="2012-11-29 16:58:44" updated="2012-11-29 16:58:44">
<type id="115">
perlquestion</type>
<author id="985772">
ejbiers</author>
<data>
<field name="doctext">
&lt;p&gt;Hello wise monks!&lt;/p&gt;
&lt;p&gt;I am trying to parse an xml file like the one below using the following script, and everything works well except that I cannot get the full hit description (the "Hit_def" parameter in this xml format).&lt;/p&gt;
&lt;p&gt;What I want that output to read is "43989.cce_0262 (Cyanothece ATCC 51142)", but my output only has "(Cyanothece ATCC 51142)". I played around a bit and found that if I put non-numerical text in front of this information, then I get the entire hit description. However, I'd like to be able to retrieve it without modifying the xml file. Any suggestions?&lt;/p&gt;
&lt;p&gt;Here is a sample of the xml input&lt;/p&gt;
&lt;code&gt;&lt;?xml version="1.0"?&gt;
&lt;!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd"&gt;
&lt;BlastOutput&gt;
  &lt;BlastOutput_program&gt;blastx&lt;/BlastOutput_program&gt;
  &lt;BlastOutput_version&gt;BLASTX 2.2.27+&lt;/BlastOutput_version&gt;
  &lt;BlastOutput_reference&gt;Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &amp;quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&amp;quot;, Nucleic Acids Res. 25:3389-3402.&lt;/BlastOutput_reference&gt;
  &lt;BlastOutput_db&gt;/Applications/blast-2.2.27+/db/COG_Nov2012/protein.sequences.v9.0&lt;/BlastOutput_db&gt;
  &lt;BlastOutput_query-ID&gt;Query_1&lt;/BlastOutput_query-ID&gt;
  &lt;BlastOutput_query-def&gt;HKUN3Y301D9XQX&lt;/BlastOutput_query-def&gt;
  &lt;BlastOutput_query-len&gt;508&lt;/BlastOutput_query-len&gt;
  &lt;BlastOutput_param&gt;
    &lt;Parameters&gt;
      &lt;Parameters_matrix&gt;BLOSUM62&lt;/Parameters_matrix&gt;
      &lt;Parameters_expect&gt;10&lt;/Parameters_expect&gt;
      &lt;Parameters_gap-open&gt;11&lt;/Parameters_gap-open&gt;
      &lt;Parameters_gap-extend&gt;1&lt;/Parameters_gap-extend&gt;
      &lt;Parameters_filter&gt;L;&lt;/Parameters_filter&gt;
    &lt;/Parameters&gt;
  &lt;/BlastOutput_param&gt;
  &lt;BlastOutput_iterations&gt;
    &lt;Iteration&gt;
      &lt;Iteration_iter-num&gt;1&lt;/Iteration_iter-num&gt;
      &lt;Iteration_query-ID&gt;Query_1&lt;/Iteration_query-ID&gt;
      &lt;Iteration_query-def&gt;HKUN3Y301D9XQX&lt;/Iteration_query-def&gt;
      &lt;Iteration_query-len&gt;508&lt;/Iteration_query-len&gt;
      &lt;Iteration_hits&gt;
        &lt;Hit&gt;
          &lt;Hit_num&gt;1&lt;/Hit_num&gt;
          &lt;Hit_id&gt;gnl|BL_ORD_ID|1515029&lt;/Hit_id&gt;
          &lt;Hit_def&gt;43989.cce_0262 (Cyanothece ATCC 51142)&lt;/Hit_def&gt;
          &lt;Hit_accession&gt;1515029&lt;/Hit_accession&gt;
          &lt;Hit_len&gt;65&lt;/Hit_len&gt;
          &lt;Hit_hsps&gt;
            &lt;Hsp&gt;
              &lt;Hsp_num&gt;1&lt;/Hsp_num&gt;
              &lt;Hsp_bit-score&gt;40.0466&lt;/Hsp_bit-score&gt;
              &lt;Hsp_score&gt;92&lt;/Hsp_score&gt;
              &lt;Hsp_evalue&gt;0.00664016&lt;/Hsp_evalue&gt;
              &lt;Hsp_query-from&gt;155&lt;/Hsp_query-from&gt;
              &lt;Hsp_query-to&gt;253&lt;/Hsp_query-to&gt;
              &lt;Hsp_hit-from&gt;12&lt;/Hsp_hit-from&gt;
              &lt;Hsp_hit-to&gt;44&lt;/Hsp_hit-to&gt;
              &lt;Hsp_query-frame&gt;-1&lt;/Hsp_query-frame&gt;
              &lt;Hsp_hit-frame&gt;0&lt;/Hsp_hit-frame&gt;
              &lt;Hsp_identity&gt;17&lt;/Hsp_identity&gt;
              &lt;Hsp_positive&gt;27&lt;/Hsp_positive&gt;
              &lt;Hsp_gaps&gt;0&lt;/Hsp_gaps&gt;
              &lt;Hsp_align-len&gt;33&lt;/Hsp_align-len&gt;
              &lt;Hsp_qseq&gt;LRGAICSMEHIEEALGKLKDWARKLIELLLGPR&lt;/Hsp_qseq&gt;
              &lt;Hsp_hseq&gt;ITGAVCLMDYLEKVLEKLRELAQKLIETLLGPQ&lt;/Hsp_hseq&gt;
              &lt;Hsp_midline&gt;+ GA+C M+++E+ L KL++ A+KLIE LLGP+&lt;/Hsp_midline&gt;
            &lt;/Hsp&gt;
          &lt;/Hit_hsps&gt;
        &lt;/Hit&gt;
&lt;/code&gt;
&lt;p&gt; and here is the perl script:&lt;/p&gt;
&lt;code&gt;#!/usr/local/bin/perl

# Usage information
#Usage: $0 -i &lt;BLAST-report-file&gt; -o &lt;output-file&gt; -n &lt;number-of-top-hits&gt; -b &lt;min_bit_score&gt;
# -t &lt;trashed.output_queries_without_hits&gt; 

use strict;
use warnings;
use Bio::SearchIO;
use Getopt::Std;#needed for flagging parameters

# main: parse a BLAST XML report with Bio::SearchIO and write tab-separated
# summaries to three files:
#   -o file         ("good")   hits whose bit score is at least -b, one line per HSP
#   -t file         ("bad")    queries with no hits, and hits scoring below -b
#   -o file.header             query description of every hit written to the -o file
# Options are read from @ARGV: -i input report, -o, -t, -n maximum number of
# hits examined per query, -b minimum bit score.
sub main{

my %opt;
# NOTE(review): the "letter followed by colon" notation belongs to
# Getopt::Std::getopts(). getopt() treats every character of its first
# argument as a switch that takes an argument, so the colons are not needed
# here - confirm against the Getopt::Std documentation.
getopt('i:o:n:b:t:', \%opt);

print "Parsing the BLAST result ...\n";
my $in = Bio::SearchIO-&gt;new(-format =&gt; 'blastxml', -file =&gt; $opt{i});
# All three output files are opened for writing (truncating existing content).
open (OUT,"&gt;$opt{o}") or die "Cannot open $opt{o}: $!";
open (OUT2,"&gt;$opt{t}") or die "Cannot open $opt{t}: $!";
open (OUT3, "&gt;$opt{o}.header") or die "Cannot open $opt{o}.header: $!";


# print the header row (14 tab-delimited column names) to the "good" file
print OUT "query_name\tquery_length\taccession_number\tsubject_length\tsubject_description\tE value\tbit score\tframe\tquery_start\t";
print OUT "query_end\thit_start\thit_end\t%_conserved\t%_identical\n";

# the "bad" file gets the same header row, although the rows written to it
# below only carry three fields (query name, query length, reason)
print OUT2 "query_name\tquery_length\taccession_number\tsubject_length\tsubject_description\tE value\tbit score\tframe\tquery_start\t";
print OUT2 "query_end\thit_start\thit_end\t%_conserved\t%_identical\n";


# extract information from each result (one per query) in turn
while ( my $result = $in-&gt;next_result ) {

	# queries WITHOUT any hit are reported in the -t ("bad") file
   	if ( $result-&gt;num_hits == 0 ) {
		print OUT2 $result-&gt;query_description . "\t";
    	print OUT2 $result-&gt;query_length . "\t";
		print OUT2 "No hits found\n";
		
		}
	else {
		# number of hits examined so far for this query (good and bad alike)
		my $count = 0;
		# process each hit in turn
		while (my $hit = $result-&gt;next_hit) {



			# hits BELOW the -b bit-score cutoff are reported in the -t ("bad") file
			if ( $hit-&gt;bits &lt; $opt{b}) {
   				print OUT2 $result-&gt;query_description . "\t";
    			print OUT2 $result-&gt;query_length . "\t";
				print OUT2 "below bit score\n";}
			# hits AT OR ABOVE the cutoff go to the -o ("good") file; the
			# condition is the exact complement of the branch above
			elsif (	$hit-&gt;bits &gt;= $opt{b}) {
   				print OUT $result-&gt;query_description . "\t";
   				print OUT3 $result-&gt;query_description . "\n";
    			print OUT $result-&gt;query_length . "\t";
        		print OUT $hit-&gt;accession . "\t";
       			print OUT $hit-&gt;length . "\t";
				# NOTE(review): for the sample report in this post this prints only
				# "(Cyanothece ATCC 51142)", not the full Hit_def text. Presumably the
				# parser stores the leading token of Hit_def as the hit name, so the
				# hit's name accessor would be needed as well - confirm against the
				# BioPerl blastxml documentation.
				print OUT $hit-&gt;description . "\t";
				print OUT $hit-&gt;significance . "\t";
				print OUT $hit-&gt;bits . "\t";
				
				# number of HSP lines already written for this hit
				my $hspcount = 0;
			
				# write one line per HSP of this hit into the -o file; the line's
				# terminating newline is only printed inside this loop, so a hit
				# without HSPs would leave its row unterminated
				while (my $hsp = $hit-&gt;next_hsp) {
					# repeats the test of the enclosing elsif, so always true here
					if ($hit-&gt;bits &gt;= $opt{b}) {
						# second and later HSPs: seven empty fields stand in for the
						# seven hit-level columns already printed on the first line
						print OUT "\t\t\t\t\t\t\t", if ($hspcount &gt; 0);
          	      		print OUT $hsp-&gt;query-&gt;frame . "\t";
						print OUT $hsp-&gt;start('query') . "\t" . $hsp-&gt;end('query'). "\t";
						print OUT $hsp-&gt;start('hit') . "\t" . $hsp-&gt;end('hit') . "\t";
						printf OUT "%.1f" , ($hsp-&gt;frac_conserved * 100);
						print OUT "%\t";
						printf OUT "%.1f" , ($hsp-&gt;frac_identical * 100);
		       			print OUT "%\n";
           				$hspcount++;
            			}

            		}
            		
         	   }
			# counted for every hit, whichever file it was written to
			$count++;
			# stop after -n hits have been examined for this query
			last if ($count == $opt{n});

		}
		
    	}
  	
}

# NOTE(review): OUT3 is not closed explicitly here, unlike OUT and OUT2
close OUT;
close OUT2;

}
# run the parser, then report completion on STDOUT
main();
print " DONE!!!\n";
&lt;/code&gt;

&lt;p&gt;and what I put into the command line:&lt;/p&gt;
&lt;code&gt;perl scriptname.pl -i inputfile.xml -o goodoutputfile.txt -t badoutputfile.txt -n number_of_hits_to_keep -b bit-score_cutoff&lt;/code&gt;

&lt;p&gt; and here is the output I get from "goodoutputfile.txt"&lt;/p&gt;
&lt;code&gt;query_name	query_length	accession_number	subject_length	subject_description	E value	bit score	frame	query_start	query_end	hit_start	hit_end	%_conserved	%_identical
HKUN3Y301D9XQX length=508 xy=1636_1159 region=1 run=R_2012_03_16_06_53_48_	508	1515029	65	(Cyanothece ATCC 51142)	0.00664016	40.0466	0	155	253	12	44	81.8%	51.5%
&lt;/code&gt;
</field>
</data>
</node>
