Ok, I got it to work (many thanks again to hdb, Kenosis, and graff). I found that a format from the NCBI called GFF works much better for parsing. However, I couldn't get the suggested hash approach to work, so my updated question is: how would I get this to work using a hash instead, so that it more resembles a Perl script than a cobbled-together Perl/C/something construct? As it stands, it functions, but it's inefficient and not elegant. (Truncated) sample files are also attached.
EDIT: I made one more modification; I finally got the hang of using a hash.
##gff-version 3
#!gff-spec-version 1.20
#!processor NCBI annotwriter
##sequence-region NC_001903.1 1 26498
##species http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=2
+24326
NC_001903.1 RefSeq region 1 26498 . + . ID=id0
+;Name=cp26;Dbxref=taxon:224326;Is_circular=true;gbkey=Src;genome=plas
+mid;mol_type=genomic DNA;plasmid-name=cp26;strain=B31
NC_001903.1 RefSeq gene 46 324 . + . ID=gene0;
+Name=BB_B01;Dbxref=GeneID:1194411;gbkey=Gene;locus_tag=BB_B01
NC_001903.1 RefSeq CDS 46 324 . + 0 ID=cds0;Na
+me=NP_046987.2;Parent=gene0;Note=catalyzes the hydrolysis of acylphos
+phate;Dbxref=Genbank:NP_046987.2,GeneID:1194411;gbkey=CDS;product=acy
+lphosphatase;protein_id=NP_046987.2;transl_table=11
NC_001903.1 RefSeq gene 308 751 . - . ID=gene1
+;Name=BB_B02;Dbxref=GeneID:1194420;gbkey=Gene;locus_tag=BB_B02
NC_001903.1 RefSeq CDS 308 751 . - 0 ID=cds1;N
+ame=NP_046988.1;Parent=gene1;Note=hypothetical protein%3B identified
+by Glimmer%3B putative;Dbxref=Genbank:NP_046988.1,GeneID:1194420;gbke
+y=CDS;product=hypothetical protein;protein_id=NP_046988.1;transl_tabl
+e=11
NC_001903.1 RefSeq gene 837 2186 . - . ID=gene
+2;Name=BB_B03;Dbxref=GeneID:1194419;gbkey=Gene;locus_tag=BB_B03
NC_001903.1 RefSeq CDS 837 2186 . - 0 ID=cds2;
+Name=NP_046989.1;Parent=gene2;Note=hypothetical protein%3B identified
+ by Glimmer%3B putative;Dbxref=Genbank:NP_046989.1,GeneID:1194419;gbk
+ey=CDS;product=hypothetical protein;protein_id=NP_046989.1;transl_tab
+le=11
NC_001903.1 RefSeq gene 2476 3798 . - . ID=gen
+e3;Name=BB_B04;Dbxref=GeneID:1194410;gbkey=Gene;locus_tag=BB_B04
NC_001903.1 RefSeq CDS 2476 3798 . - 0 ID=cds3
+;Name=NP_046990.2;Parent=gene3;Note=similar to GB:U07818 PID:466474 S
+P:Q45400 percent identity: 36.93%3B identified by sequence similarity
+%3B putative;Dbxref=Genbank:NP_046990.2,GeneID:1194410;gbkey=CDS;prod
+uct=chitibiose transporter protein ChbC;protein_id=NP_046990.2;transl
+_table=11
NC_001903.1 RefSeq gene 4084 4431 . + . ID=gen
+e4;Name=BB_B05;Dbxref=GeneID:1194409;gbkey=Gene;locus_tag=BB_B05
NC_001903.1 RefSeq CDS 4084 4431 . + 0 ID=cds4
+;Name=NP_046991.1;Parent=gene4;Note=similar to SP:P46319 PID:895750 P
+ID:1783268 GB:AL009126 percent identity: 27.62%3B identified by seque
+nce similarity%3B putative;Dbxref=Genbank:NP_046991.1,GeneID:1194409;
+gbkey=CDS;product=chitibiose transporter protein ChbA;protein_id=NP_0
+46991.1;transl_table=11
NC_001903.1 RefSeq gene 4482 4757 . + . ID=gen
+e5;Name=BB_B06;Dbxref=GeneID:1194408;gbkey=Gene;locus_tag=BB_B06
NC_001903.1 RefSeq CDS 4482 4757 . + 0 ID=cds5
+;Name=NP_046992.2;Parent=gene5;Note=similar to SP:P46318 PID:895748 P
+ID:1783266 GB:AL009126 percent identity: 46.32%3B identified by seque
+nce similarity%3B putative;Dbxref=Genbank:NP_046992.2,GeneID:1194408;
+gbkey=CDS;product=chitibiose transporter protein ChbB;protein_id=NP_0
+46992.2;transl_table=11
NC_001903.1 RefSeq gene 4769 5866 . + . ID=gen
+e6;Name=BB_B07;Dbxref=GeneID:1194407;gbkey=Gene;locus_tag=BB_B07
NC_001903.1 RefSeq CDS 4769 5866 . + 0 ID=cds6
+;Name=NP_046993.1;Parent=gene6;Note=similar to GB:M88764 SP:Q09090 PI
+D:469166 PID:1199570 percent identity: 21.29%3B identified by sequenc
+e similarity%3B putative;Dbxref=Genbank:NP_046993.1,GeneID:1194407;gb
+key=CDS;product=alpha3-beta1 integrin-binding protein;protein_id=NP_0
+46993.1;transl_table=11
NC_001903.1 RefSeq gene 5888 6517 . - . ID=gen
+e7;Name=BB_B08;Dbxref=GeneID:1194418;gbkey=Gene;locus_tag=BB_B08;pseu
+do=true
NC_001903.1 RefSeq gene 6677 7714 . + . ID=gen
+e8;Name=BB_B09;Dbxref=GeneID:1194417;gbkey=Gene;locus_tag=BB_B09
NC_001903.1 RefSeq CDS 6677 7714 . + 0 ID=cds7
+;Name=NP_046995.1;Parent=gene8;Note=hypothetical protein%3B identifie
+d by Glimmer%3B putative;Dbxref=Genbank:NP_046995.1,GeneID:1194417;gb
+key=CDS;product=hypothetical protein;protein_id=NP_046995.1;transl_ta
+ble=11
NC_001903.1 RefSeq gene 7836 8765 . + . ID=gen
+e9;Name=BB_B10;Dbxref=GeneID:1194406;gbkey=Gene;locus_tag=BB_B10
NC_001903.1 RefSeq CDS 7836 8765 . + 0 ID=cds8
+;Name=NP_046996.1;Parent=gene9;Note=similar to GP:1655797 percent ide
+ntity: 33.68%3B identified by sequence similarity%3B putative;Dbxref=
+Genbank:NP_046996.1,GeneID:1194406;gbkey=CDS;product=hypothetical pro
+tein;protein_id=NP_046996.1;transl_table=11
NC_001903.1 RefSeq gene 8781 9299 . + . ID=gen
+e10;Name=BB_B11;Dbxref=GeneID:1194405;gbkey=Gene;locus_tag=BB_B11;pse
+udo=true
NC_001903.1 RefSeq gene 9275 10036 . + . ID=ge
+ne11;Name=BB_B12;Dbxref=GeneID:1194404;gbkey=Gene;locus_tag=BB_B12
NC_001903.1 RefSeq CDS 9275 10036 . + 0 ID=cds
+9;Name=NP_046998.1;Parent=gene11;Note=similar to GP:2182756 percent i
+dentity: 46.00%3B identified by sequence similarity%3B putative;Dbxre
+f=Genbank:NP_046998.1,GeneID:1194404;gbkey=CDS;product=hypothetical p
+rotein;protein_id=NP_046998.1;transl_table=11
NC_001903.1 RefSeq gene 10104 10652 . + . ID=g
+ene12;Name=BB_B13;Dbxref=GeneID:1194403;gbkey=Gene;locus_tag=BB_B13
NC_001903.1 RefSeq CDS 10104 10652 . + 0 ID=cd
+s10;Name=NP_046999.1;Parent=gene12;Note=similar to GB:U03641 PID:4582
+18 percent identity: 42.59%3B identified by sequence similarity%3B pu
+tative;Dbxref=Genbank:NP_046999.1,GeneID:1194403;gbkey=CDS;product=hy
+pothetical protein;protein_id=NP_046999.1;transl_table=11
NC_001903.1 RefSeq gene 10920 11417 . - . ID=g
+ene13;Name=BB_B14;Dbxref=GeneID:1194402;gbkey=Gene;locus_tag=BB_B14
BB_B10
BB_B29
BB_B18
BB_B13
BB_B14
BB_B12
BB_B04
BB_B16
BB_B22
BB_B17
BB_B27
BB_B19
BB_B07
BB_B23
BB_B09
BB_B02
BB_B28
BB_B24
BB_B03
BB_B05
BB_B06
#!/usr/bin/perl
# Build a hash that maps each wanted locus tag to the annotation text of the
# record that immediately follows its line in a GFF file, then dump the hash.
#
# Usage: script.pl [GFF_FILE [LOCUS_TAG_FILE]]
# Both arguments are optional; they default to the original hard-coded paths.
use strict;
use warnings;
use Data::Dumper;

my $gff_path = @ARGV ? shift @ARGV
             : "/Users/bioinformatics/Desktop/NC_001903.gff.txt";
my $tags_path = @ARGV ? shift @ARGV
              : "/Users/bioinformatics/Desktop/cp26_dff.txt";

# Load the wanted locus tags (one per line) as hash keys, so that testing
# whether a tag is wanted is a single lookup instead of a scan over a list.
my %is_wanted;
open(my $tags_fh, "<", $tags_path)
    or die "Could not open $tags_path: $!";
while (my $tag = <$tags_fh>) {
    $tag =~ s/\s+\z//;          # drop the newline and any trailing CR/blanks
    next if $tag eq '';         # an empty line is not a tag
    $is_wanted{$tag} = 1;
}
close($tags_fh);

# Walk the GFF file once.  The value for a locus tag is taken from the line
# directly after the line carrying that locus_tag, so it is enough to
# remember the wanted tag (if any) seen on the previous record.
my %name_of;
my $pending_tag;
open(my $gff_fh, "<", $gff_path)
    or die "Could not open $gff_path: $!";
LINE: while (my $line = <$gff_fh>) {
    $line =~ s/\r?\n\z//;
    next LINE if $line =~ /^#/; # discard header, unneeded and will interfere

    # Captures everything between the last ';Name=' and the last ';protein_'.
    # Storing straight into the hash keeps key and value paired even when
    # the following record has no such text (the tag is simply left out).
    if (defined $pending_tag && $line =~ /.*;Name=(.*);protein_/) {
        $name_of{$pending_tag} = $1;
    }

    # Stop at ';' or whitespace so that trailing attributes such as
    # ';pseudo=true' do not become part of the key.
    my ($tag) = $line =~ /;locus_tag=([^;\s]+)/;
    $pending_tag = (defined $tag && $is_wanted{$tag}) ? $tag : undef;
}
close($gff_fh);

# Sorted keys make the dump reproducible from run to run.
$Data::Dumper::Sortkeys = 1;
print Dumper(\%name_of);
|