# ---------------------------------------------------------------------------
# Step 1: load the locus -> taxid lookup table.
# The first two tab-separated fields of every line in $tax2locus_file are
# read as (taxid, locus); %tax2loc is keyed by locus and stores the taxid.
# ---------------------------------------------------------------------------
open(IN, '<', $tax2locus_file)
    or die "Cannot open tax2locus file '$tax2locus_file' for reading: $!";
while (<IN>) {
    chomp;    # keep the trailing newline out of the hash key
    my ($taxid, $locus) = split(/\t/, $_);
    next unless defined $locus;    # skip blank / malformed lines
    $tax2loc{$locus} = $taxid;
}
close(IN);
print "there are\t" . scalar(keys %tax2loc) . "\tlocus_ids as key in hash\n";

###############
# Step 2: read the sharedTab file with pairwise overlap info and write a
# "<input>.hostinfo" table that adds the taxid of each prophage's host.
my $sharedTab_file = $ARGV[0];
die "Usage: $0 <sharedTab_file>\n" unless defined $sharedTab_file;

my @columns;
my $prophageA;
my $prophageB;
my $outfile = "$sharedTab_file.hostinfo";
my $hostA;
my $PFnumA;
my $hostB;
my $PFnumB;
my $regex;
my $matching_key;
my $taxidA;
my $taxidB;

# Cache of host prefix -> taxid so each distinct host scans %tax2loc once.
my %taxid_cache;

# Return the taxid for a host accession, or '' when no locus matches.
# For accessions starting with "NZ" (wgs genomes) only the first 7
# characters are matched, as only NZ_XXXX000000 entries are in tax2locus.
# The host is matched as a literal substring of the locus keys; when several
# keys match, the alphabetically first one is used so results are repeatable.
my $taxid_for_host = sub {
    my ($host) = @_;
    return '' unless defined $host && length $host;

    my $needle = ($host =~ /^NZ/) ? substr($host, 0, 7) : $host;
    if (!exists $taxid_cache{$needle}) {
        $regex = qr/\Q$needle\E/;
        # List assignment: take the first matching KEY. In scalar context
        # grep would return the number of matches instead.
        ($matching_key) = sort grep { $_ =~ $regex } keys %tax2loc;
        $taxid_cache{$needle}
            = defined $matching_key ? $tax2loc{$matching_key} : '';
    }
    return $taxid_cache{$needle};
};

open(OUT, '>', $outfile)
    or die "Cannot open output file '$outfile' for writing: $!";
open(IN, '<', $sharedTab_file)
    or die "Cannot open sharedTab file '$sharedTab_file' for reading: $!";
print OUT "#prophageA\tprophageB\thostA\ttaxidA\thostB\ttaxidB\tjacc\n";

# NOTE(review): the closing brace of this loop lies outside this span; the
# loop is left open here exactly as in the original.
while (<IN>) {
    chomp;
    next if (/^#/);    # ignore comments
    @columns = split(/\t/, $_);
    next unless @columns >= 2;    # need both prophage identifiers

    # Prophage ids are split on "."; the first two pieces are kept as
    # (host, PFnum).
    $prophageA = $columns[0];
    ($hostA, $PFnumA) = split(/\./, $prophageA);
    $taxidA = $taxid_for_host->($hostA);

    $prophageB = $columns[1];
    ($hostB, $PFnumB) = split(/\./, $prophageB);
    $taxidB = $taxid_for_host->($hostB);

    # Sixth column is written out under the "jacc" header.
    my $jacc = defined $columns[5] ? $columns[5] : '';
    print OUT join("\t", $prophageA, $prophageB, $hostA, $taxidA, $hostB, $taxidB, $jacc) . "\n";