use strict;
use warnings;

# Extract the chain sequences (SEQRES records) and the atom coordinates
# (ATOM records) from a PDB file, then print every sequence as a FASTA entry.
#
# Usage: perl <this script> [pdb_file]
# The input defaults to 6U9D.pdb in the current directory.
my $pdb_file = @ARGV ? $ARGV[0] : '6U9D.pdb';

open my $pdb_fh, '<', $pdb_file
    or die "unable to open $pdb_file for reading $!";

# Three-letter residue code => one-letter FASTA code.
my %amino_acid_conversion = (
    ALA => 'A', ARG => 'R', ASN => 'N', ASP => 'D', CYS => 'C',
    GLN => 'Q', GLU => 'E', GLY => 'G', HIS => 'H', ILE => 'I',
    LEU => 'L', LYS => 'K', MET => 'M', PHE => 'F', PRO => 'P',
    SER => 'S', THR => 'T', TRP => 'W', TYR => 'Y', VAL => 'V',
);

my @atoms;  # 2D array of x,y,z of all atoms (many thousands)
my @seqs;   # raw sequences with the 3 letter designations,
            # converted to FASTA format at output time

my $cur_seq = '';   # residues collected so far for the chain being read
my $cur_ltr = '';   # chain identifier of the chain being read

# Return the (x, y, z) coordinates of one ATOM record, or an empty list when
# the line is not an ATOM record or its coordinate fields are not numeric.
#
# The PDB format is column based: x, y and z occupy columns 31-38, 39-46 and
# 47-54.  Splitting on whitespace is not reliable, because the residue number
# that precedes the coordinates is numeric too, and wide negative values can
# touch the neighbouring field without any blank in between.
sub atom_coordinates {
    my ($record) = @_;

    return unless $record =~ /^ATOM\s/;
    return unless length($record) >= 54;    # too short to hold x, y and z

    my @xyz;
    foreach my $offset (30, 38, 46) {
        my $field = substr $record, $offset, 8;
        return unless $field =~ /^\s*([-+]?(?:\d+(?:\.\d*)?|\.\d+))\s*$/;
        push @xyz, $1;
    }
    return @xyz;
}

# Convert a blank-separated string of three-letter residue codes into a
# string of one-letter codes.  Codes missing from %amino_acid_conversion
# become 'X' so that the sequence keeps its length and no undef is appended.
sub residues_to_fasta {
    my ($residues) = @_;

    return join '',
        map { exists $amino_acid_conversion{$_} ? $amino_acid_conversion{$_} : 'X' }
        split ' ', $residues;
}

# parse out data of interest from file
#
while (my $line = <$pdb_fh>) {
    $line =~ s/\r?\n\z//;    # accept Unix as well as DOS line endings

    if (my ($ltr, $seq) = ($line =~ /^SEQRES\s+\d+\s+(\w+)\s+\d+\s+([A-Z ]+)$/)) {
        $seq =~ s/\s+$//;
        if ($ltr eq $cur_ltr) {
            $cur_seq .= " $seq";    # same chain: keep extending it
        }
        else {
            push @seqs, $cur_seq if $cur_seq ne '';    # end of current sequence
            $cur_seq = $seq;                           # begin of the next sequence
            $cur_ltr = $ltr;
        }
    }
    elsif (my @xyz = atom_coordinates($line)) {
        push @atoms, [@xyz];
    }
}
push @seqs, $cur_seq if $cur_seq ne '';    # don't forget to finish the last seq!
close $pdb_fh;

#### output collected data ###

# make the fasta sequence segments
foreach my $seq (@seqs) {
    my $fasta = residues_to_fasta($seq);

    print ">Some Fasta Description Line\n";

    # use 60 char lines (fasta suggested max is 80); the 4-argument substr
    # removes the printed chunk, so the loop ends once the string is empty
    while (length $fasta) {
        print substr($fasta, 0, 60, ''), "\n";
    }
}

#print the data points
# I am not sure what needs to be done with them
# average of each coordinate?
#foreach my $row_ref (@atoms)  #uncomment to print
#{
#   print @$row_ref,"\n";
#}

# Average every coordinate column over all collected atoms.  The sums start
# at 0 so they are always defined, and the division is skipped when no atoms
# were collected (dividing by an empty @atoms would abort the script with
# "Illegal division by zero").
my $xsum = 0;
my $ysum = 0;
my $zsum = 0;

foreach my $row_ref (@atoms) {    # @atoms is a 2D array
    my ($x, $y, $z) = @$row_ref;
    $xsum += $x;
    $ysum += $y;
    $zsum += $z;
}

my $atom_count = @atoms;    # array in scalar context: number of rows
if ($atom_count) {
    print "avg x = ", $xsum / $atom_count, "\n";
    print "avg y = ", $ysum / $atom_count, "\n";
    print "avg z = ", $zsum / $atom_count, "\n";
}
else {
    warn "no atom coordinates were collected; skipping the averages\n";
}

__END__
These are 2 examples:
You will have to figure out what goes in the FASTA description line
And perhaps not all of these sequences are relevant?
Looks like a lot are duplicates.

>Some Fasta Description Line
MHHHHHHENLYFQGAPSFNVDPLEQPAEPSKLAKKLRAEPDMDTSFVGLTGGQIFNEMMS
RQNVDTVFGYPGGAILPVYDAIHNSDKFNFVLPKHEQGAGHMAEGYARASGKPGVVLVTS
GPGATNVVTPMADAFADGIPMVVFTGQVPTSAIGTDAFQEADVVGISRSCTKWNVMVKSV
EELPLRINEAFEIATSGRPGPVLVDLPKDVTAAILRNPIPTKTTLPSNALNQLTSRAQDE
FVMQSINKAADLINLAKKPVLYVGAGILNHADGPRLLKELSDRAQIPVTTTLQGLGSFDQ
EDPKSLDMLGMHGCATANLAVQNADLIIAVGARFDDRVTGNISKFAPEARRAAAEGRGGI
IHFEVSPKNINKVVQTQIAVEGDATTNLGKMMSKIFPVKERSEWFAQINKWKKEYPYAYM
EETPGSKIKPQTVIKKLSKVANDTGRHVIVTTGVGQHQMWAAQHWTWRNPHTFITSGGLG
TMGYGLPAAIGAQVAKPESLVIDIDGDASFNMTLTELSSAVQAGTPVKILILNNEEQGMV
TQWQSLFYEHRYSHTHQLNPDFIKLAEAMGLKGLRVKKQEELDAKLKEFVSTKGPVLLEV
EVDKKVPVLPMVAGGSGLDEFINFDPEVERQQTELRHKRTGGKH
>Some Fasta Description Line
MGSSHHHHHHSSGLVPRGSHMENLYFQGATRPPLPTLDTPSWNANSAVSSIIYETPAPSR
QPRKQHVLNCLVQNEPGVLSRVSGTLAARGFNIDSLVVCNTEVKDLSRMTIVLQGQDGVI
EQARRQIEDLVPVYAVLDYTNSEIIKRELVMARISLLGTEYFEDLLLHHHTSTNAGAADS
QELVAEIREKQFHPANLPASEVLRLKHEHLNDITNLTNNFGGRVVDISETSCIVELSAKP
TRISAFLKLVEPFGVLECARSGMMALPRTPLKTSTEEAADEDEKISEIVDISQLPPG

I have no idea what these numbers would mean?
avg x = 321.013155298296
avg y = 290.744642162734
avg z = 69.196842162731