#!/usr/bin/perl -w ################################################################ # Programmer: biograd # Program: TIGRFillTables.plx # Date begun: 8/17/01 # Description: This is a perl script made to automatically take # information from previously generated files # containing _Arabidopsis thaliana_ genomic data # from the TIGR database and place it into a local # mysql database. Since it is intended for automation, # a user name and password are hard-coded. # Input: -Files formatted by Dr. Johns, such as # /arabidopsis/bac_info_08102001.txt and # /arabidopsis/gene_info_08102001.txt # Effect: -Successful: information will be parsed from the # files and loaded into correct database areas. # # Outline: I. CONNECT # II. ACHROMS # A. Prepare statements to be used # B. Loop through each bacfile record # 1. CleanData() # 2. ParseAChroms() # 3. Check for matching record in db # a. If match, then update record # b. If no match, insert record # III. GENES, RELATEGAC, EXONS # A. Prepare statements for filling: # 1. Genes # 2. RelateGAC (genes to artificial chroms) # 3. Exons # B. Loop through each genefile record # 1. CleanData() # 2. ParseGenes() # 3. Check for matching gene record in db # a. If match, then update gene # b. If no match, insert gene info # 4. Check if an AC is listed in the gene info # a. If yes, check relateGAC table for an # exact match of gene with this AC. # 1) If match, increment count by 1 # 2) If no match, insert relation into # relateGAC table. # 5. ParseExon() # 6. Loop through dereferenced exon arrays # a. Check for matching exon record in db # 1) If match, update exon # 2) If no match, insert exon info # b. Check if sub_exon data is there # 1) If match, update sub-exon # 2) If no match, insert sub_exon # IV. DISCONNECT # A. Finish statements # B. Disconnect from database # V. SUBROUTINE DEFINITIONS # A. CleanData() # 1. Removes empty lines and newlines # 2. Returns array of good lines # B. ParseAChroms() # 1. Searches for information # 2. Plugs information into an array # 3. Returns array # C. ParseGenes() # 1. Searches for information # 2. Plugs information into an array # 3. Returns array # D. ParseExons() # 1. Searches for each exon's info # 2. Forms array for each exon # 3. Returns array of array references ################################################################ use strict; use DBI; my ($dbh, $sth_genes, $sth_achroms, $sth_rgac, $record, $sth_find_match, $sth_getcount, @count, $geneid, $current_bac, $sth_select_gene_id, $sth_updatecount, $sth_match_gene, $sth_update_genes, @record, $line, $sth_match_achrom, $sth_update_achroms, @newrecord, @gene, @achrom, @rgac, @temp, $temp, $n, @exondata, $sth_find_exon, $sth_find_subexon, $sth_insert_exon, $sth_insert_subexon, $sth_update_exon, $sth_update_subexon); $n = 0; ### CONNECT ###################################################### $dbh = DBI->connect( "DBI:mysql:arabidopsis", "", "", { PrintError => 0, RaiseError => 1}) or die "\nCouldn't connect to database: $DBI::errstr\n"; ### PREPARE SQL FOR ACHROMS TABLE #################################### # This statement is sent to the database to be parsed and planned # before any data is touched. This move saves time in the long run. # Question marks are where data will be "filled in" or bound during # each execution of this statement in the loop. $sth_achroms = $dbh -> prepare( " INSERT INTO achroms (type, ac_id, length, chr_num, chr_strt, chr_end, bac_strt, bac_end, orient) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?) "); $sth_match_achrom = $dbh -> prepare(" SELECT ac_id FROM achroms WHERE ac_id = ? "); # This statement is run incase an artificial chromosome is already # listed in the database but could have new data to be added # or changed. $sth_update_achroms = $dbh -> prepare(" UPDATE achroms SET type = ?, length = ?, chr_num = ?, chr_strt = ?, chr_end = ?, bac_strt = ?, bac_end = ?, orient = ? WHERE ac_id = ? "); ### OPEN ACHROMS DATAFILE ########################################### # These were my test files. "bac.ht" is a smaller file that holds # the head +130 and tail +130 of the "bac_info_08102001.txt" file. # open (INFILE, "bac_info_08102001.txt") or die "Couldn't open bacfile:$1"; open (INFILE, "bac.ht") or die "Couldn't open bacfile: $1"; ### FILL ACHROMS TABLE ########################################### # Take in whole records separated by five dashes instead of a # newline character. $/ = "-----"; while($record = ) { # Print out a counter to show how far the filling has progressed. # print "$n\n"; # $n++; @newrecord = (); # Send the record to have any blank lines removed. @newrecord = CleanData( $record ); # If there is an array returned from CleanData(), parse # it. This removes the chance of empty records causing # extra null records in the database. @achrom = (); if (@newrecord) { # Send the clean record to be parsed for Artificial # Chromosome data only. @achrom = ParseAChroms( \@newrecord ); $sth_match_achrom -> execute( $achrom[1] ); $temp = $sth_match_achrom -> fetchrow_array(); if ( $temp ) # if there's a match, update data { $sth_update_achroms -> execute( $achrom[0], $achrom[2], $achrom[3], $achrom[4], $achrom[5], $achrom[6], $achrom[7], $achrom[8], $achrom[1] ); } else # if no match, insert data { # Bind (fill in) the question marks in the prepared insert # statement. The bind data is from the returned @achrom elements. $sth_achroms -> execute( $achrom[0],$achrom[1],$achrom[2], $achrom[3], $achrom[4], $achrom[5], $achrom[6], $achrom[7], $achrom[8]); } } } close INFILE; ### END ACHROMS TABLE FILL ######################################### ### PREPARE SQL FOR GENES, RELATEGAC, AND EXONS TABLES ################ # Question marks are where data will be "filled in" or bound during # each execution of these statements in the loop. # This statement will insert the bind data into the genes table # in the fields listed. # GENES This statement will try to select the present tigr chromosomal # reference number for the current gene. If it is present, then the # UPDATE statment needs to be used instead of the INSERT. $sth_match_gene = $dbh -> prepare(" SELECT gene_id FROM genes WHERE tigr_id = ? "); # GENES This statement puts new data into a new record. $sth_genes = $dbh -> prepare( " INSERT INTO genes (chrom_num, tigr_id, bac_locus, pseudogene, chrom_locus, locus_start, locus_end, orientation, read_frame, exon_count, product, notes, ac_id) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" ); # GENES This is the update statement to be used when a gene is already # listed but new data about it might be provided. $sth_update_genes = $dbh -> prepare(" UPDATE IGNORE genes SET chrom_num = ?, bac_locus = ?, pseudogene = ?, chrom_locus = ?, locus_start = ?, locus_end = ?, orientation = ?, read_frame = ?, exon_count = ?, product = ?, notes = ?, ac_id = ? WHERE tigr_id = ? "); # GENES Just to make things easier, find the unique gene number that # goes to the current tigr gene reference. $sth_select_gene_id = $dbh -> prepare( " SELECT gene_id FROM genes WHERE chrom_locus = ? "); # RELATEGAC This statement will find whether a pre-existing relationship # of the kind specified by the bound data is there. The success # can be tested. $sth_find_match = $dbh -> prepare(" SELECT gene_id, ac_id FROM relateGAC WHERE gene_id = ? AND ac_id = ? "); # RELATEGAC This statement will insert related unique gene numbers given # by the database and the names of artificial chromosomes. # The "count" field is to be increased any time a duplicate # relationship is found. For the insert function, it will # always be "1". It will be increased if necessary with an update # statement within the loop. $sth_rgac = $dbh -> prepare( " INSERT INTO relateGAC (gene_id, ac_id, count) VALUES ( ?, ?, 1 ) "); # RELATEGAC This statement will select the count data from the # genes/artificial chromosome relational table ("relateGAC") # where the unique gene id number is the same as that in # the requested bound data. $sth_getcount = $dbh -> prepare(" SELECT count FROM relateGAC WHERE gene_id = ? "); # RELATEGAC This statement will allow the count field to be updated # to a given bound integer when the unique gene id number # matches the given bound data. $sth_updatecount = $dbh -> prepare(" UPDATE relateGAC SET count = ? WHERE gene_id = ? "); # EXONS $sth_find_exon = $dbh -> prepare(" SELECT gene_id, exon_num FROM exons WHERE gene_id = ? AND exon_num = ? "); $sth_find_subexon = $dbh -> prepare(" SELECT sub_type, sub_locus_start FROM subexons WHERE gene_id = ? AND exon_num = ? AND sub_type = ? "); $sth_update_exon = $dbh ->prepare(" UPDATE IGNORE exons SET type = ?, tigr_id = ?, locus_start = ?, locus_end = ? WHERE gene_id = ? AND exon_num = ?"); $sth_update_subexon = $dbh -> prepare(" UPDATE IGNORE subexons SET sub_locus_start = ?, sub_locus_end = ? WHERE gene_id = ? AND exon_num = ? AND sub_type = ? "); $sth_insert_exon = $dbh -> prepare(" INSERT INTO exons (gene_id, exon_num, type, tigr_id, locus_start, locus_end) VALUES ( ?,?,?,?,?,? ) "); $sth_insert_subexon = $dbh -> prepare(" INSERT INTO subexons (gene_id, exon_num, sub_type, sub_locus_start, sub_locus_end) VALUES ( ?,?,?,?,? ) "); ### OPEN GENE DATAFILE ######################################### # These are the test chromosome data files. They have the same # format as the bac files did above. # open (INFILE, "gene_info_08102001.txt") or die "Couldn't open chrfile:$1"; open (INFILE, "genes.ht") or die "Couldn't open infile: $1"; ### FILL GENES TABLE ######################################### # Reset the counter to zero so we can see how many gene records # are processed. Separator is set to five dashes. $n = 0; $/ = "-----"; while($record = ) { print "\nz$n\n"; $n++; @newrecord = (); @newrecord = CleanData( $record ); # Remove newlines from data. @gene=(); # If there is an array returned from CleanData(), parse # it. This removes the chance of empty records causing # extra null records in the database. if ( @newrecord ) { @gene = ParseGenes( \@newrecord ); $sth_match_gene -> execute( $gene[1] ); $temp = $sth_match_gene -> fetchrow_array(); if ( $temp ) { # Execute the update statement, since this tigr # reference number appears to be listed already. # The bind data are the returned elements from the # ParseGenes() subroutine. The ref# is not updated. $sth_update_genes -> execute ( $gene[0], $gene[2], $gene[3], $gene[4], $gene[5], $gene[6], $gene[7], $gene[8], $gene[9], $gene[10], $gene[11], $gene[12], $gene[1] ); } else { # Execute the previously made statement to insert the data # from this record into the genes table. $sth_genes -> execute( $gene[0], $gene[1], $gene[2], $gene[3], $gene[4], $gene[5], $gene[6], $gene[7], $gene[8], $gene[9], $gene[10], $gene[11], $gene[12] ); } } ### END GENES TABLE FILL ######################################### ### FILL RELATEGAC TABLE ######################################### # If there is a bac associated with this gene, insert the # association into the relateGAC table. The gene[12] element # is the bac name from the data. if( $gene[12] ) { $current_bac = $gene[12]; # Find the unique gene id number for the current tigr # gene by matching the locus name (element 4 of the gene # array from parsing). Usually, this would be the last # entered gene, but there may be duplicates. This method # should avoid duplication. $sth_select_gene_id -> execute ( $gene[4] ); @temp = (); @temp = $sth_select_gene_id -> fetchrow_array(); $geneid = $temp[0]; # If this statement is able to execute, continue. if ($sth_find_match-> execute( $geneid, $current_bac ) ) { # Get the data from the matching row. $temp = 0; $temp = $sth_find_match -> fetchrow_array(); # If any matching rows are found, update the count. # Do not insert another row. if ( $temp ) { @count = (); $sth_getcount -> execute( $geneid ); @count = $sth_getcount -> fetchrow_array(); $count[0]++; print "Gene id number is $geneid.\nCount is : $count[0]\n"; $sth_updatecount -> execute ( $count[0], $geneid ); } # If there are no matches (@temp is undefined), insert # the new row, with a count of "1" automatically entered # from the prepare statement, and the last used unique # gene id number. else { $sth_rgac -> execute( $sth_genes->{mysql_insertid} , $current_bac ); } } else { print "\n\$sth_find_match was unable to execute,\n $current_bac from gene $geneid was not processed : $DBI::errstr\n"; } # End if loop for insertion/count update routine } # End if loop for $gene[12] test (BAC name existance) ### END RELATEGAC TABLE FILL################################ ### FILL EXONS TABLE######################################## if ( @newrecord ) { foreach $line ( @newrecord ) { @exondata=(); next unless $line=~m/EXON\b/; if ( $line=~m/^\.{5}/ ) # line contains exon data { # Elements are ( exon_num, type, tigr_id, locus_start, locus_end ) @exondata=ParseExon( $line ); $sth_find_exon -> execute( $geneid, $exondata[0] ); if ( $sth_find_exon -> fetchrow_array() ) { $sth_update_exon -> execute( $exondata[1],$exondata[2], $exondata[3],$exondata[4], $geneid, $exondata[0] ); # print "Updated gene $geneid 's exon num $exondata[0]\n"; } else { $sth_insert_exon -> execute( $geneid, $exondata[0], $exondata[1], $exondata[2], $exondata[3], $exondata[4] ); # print "Inserted gene $geneid 's exon num $exondata[0]\n"; } } else { # Elements will be ( exon_num, type, locus_start, locus_end ) @exondata = ParseSubExon( $line ); $sth_find_subexon -> execute( $geneid, $exondata[0], $exondata[1] ); if ( $sth_find_subexon -> fetchrow_array() ) { $sth_update_subexon -> execute( $exondata[2], $exondata[3], $geneid, $exondata[0], $exondata[1]); # print "Updated gene $geneid 's exon $exondata[0] 's subexon (type =$ } else { $sth_insert_subexon -> execute( $geneid, $exondata[0], $exondata[1], $exondata[2], $exondata[3] ); # print "Inserted gene $geneid 's exon $exondata[0] 's subexon (type $ } } } # End foreach block for @newrecord } # End exon handling } # End this record's entire while loop close INFILE; ### END EXONS TABLE FILL ######################################### ### DISCONNECT NICELY ############################################ # Finishing these statements is not absolutely necessary, but # it avoids an error. Sometimes the statements are not used. # If the statement isn't executed, then it's memory won't be # deallocated until the program ends ( which causes an error). $sth_match_achrom -> finish(); $sth_update_achroms -> finish(); $sth_match_gene -> finish(); $sth_update_genes -> finish(); $sth_find_match -> finish(); $sth_getcount -> finish(); $sth_updatecount -> finish(); $sth_select_gene_id -> finish(); $sth_find_exon -> finish(); $sth_find_subexon -> finish(); $dbh->disconnect or die "Couldn't disconnect from database: $DBI::errstr\n"; exit; ### SUBROUTINES START HERE ####################################### sub CleanData { my ($record, @cleanrecord, $line, @record); $record = $_[0]; ### Separate the record by newlines. @record = split /\n/, $record; ### Remove empty lines if they exist. foreach $line ( @record ) { chomp( $line ); next if ( $line =~ m/^\s*$/ ); push @cleanrecord, $line; } return ( @cleanrecord ); } ### PARSE ARTIFICIAL CHROMOSOMES ############################ sub ParseAChroms { my (@achrom, $line, $type, $ac_id, $length, $chr_num, $chr_strt, $chr_end, $bac_strt, $bac_end, $orient); my $achromref = $_[0]; foreach $line (@$achromref) { # Get type. Right now, all are BAC's. if ($line =~ m/\b((B|Y)AC)\b/) { $type = $1; } # Get reference number. if ($line =~ m/BAC:\s*(.*)\s*$/) { $ac_id = $1; } # Get artificial chromosome length if ($line =~ m/LENGTH:\s*(.*)\s*$/) { $length = $1; } # Get associated chromosome number if ($line =~ m/OSOME:\s*(.*)\s*$/) { $chr_num = $1; } # Get associated chromosome start base. if ($line =~ m/CHR_START:\s*(.*)\s*$/) { $chr_strt = $1; } # Get associated chromosome ending base. if ($line =~ m/CHR_END:\s*(.*)\s*$/) { $chr_end = $1; } # Get artificial chromosome start base. if ($line =~ m/BAC_START:\s*(.*)\s*$/) { $bac_strt = $1; } # Get artificial chromosome ending base. if ($line =~ m/BAC_END:\s*(.*)\s*$/) { $bac_end = $1; } # Get associate chromosome's orientation on the ac. if ($line =~ m/ORIENTATION:\s*(\w)\w*\s*$/) { $orient = $1; } } push @achrom, $type, $ac_id, $length, $chr_num, $chr_strt, $chr_end, $bac_strt, $bac_end, $orient; return @achrom; } ########## PARSE GENES ########################################## sub ParseGenes { my ( $line, @gene, $chrome_num, $bac, $tigr_id, $bac_locus, $chromo_locus, $model_begin, $model_end, $orient, $read_frame, $num_exons, $prod, $notes, $pseudo); my $generef = $_[0]; foreach $line (@$generef) { # Get chromosome number if ($line =~ m/CHROMOSOME:\s*(\d*)\s*$/) { $chrome_num = $1; } # Get TIGR model's reference id if ($line =~ m/MODEL_NAME:\s*(.*)$/) { $tigr_id = $1; } # Get the locus on the bac where the gene was cloned. if ($line =~ m/BAC_LOCUS:\s*(.*)\s*$/) { $bac_locus = $1; } # Get full name if ($line =~ m/COMMON_NAME:\s*(.*)$/) { $prod = $1; } if ( $line =~ m/PSEUDOGENE:\s*(\w*)\s*$/) { $pseudo = $1; } # Get chromosome locus identifier if ($line =~ m/CHROMO_LOCUS:\s*(.*)\s*$/) { $chromo_locus = $1; } # Get starting place if ($line =~ m/MODEL_BEGIN:\s*(\d+)\s*$/) { $model_begin = $1; } # Get ending place if ($line =~ m/MODEL_END:\s*(\d+)\s*$/) { $model_end = $1; } # Get whether forward or reverse if ($line =~ m/ORIENTATION:\s*(\w)_(\d)\s*$/) { $orient = $1; $read_frame = $2; } # Get number of exons in the gene if ($line =~ m/NUM_EXONS:\s*(\d+)\s*$/) { $num_exons = $1; } # Get any notes attached to the gene if ($line =~ m/COMMENT:\s*(.*)$/) { $notes = $1; } # Get the bac where the gene was cloned. if ($line =~ m/BAC:\s*(.*)\s*$/) { $bac = $1; } } push @gene, $chrome_num, $tigr_id, $bac_locus, $pseudo, $chromo_locus, $model_begin, $model_end, $orient, $read_frame, $num_exons, $prod, $notes, $bac; return @gene; } ########## PARSE EXONS ########################################## sub ParseExon { my $line = $_[0]; my @exon=(); # Get exon's order number. $line=~m/EXON\s(\d+)/; push @exon, $1; # Get the exon type from inside the parentheses $line=~m/\((.*)\)/; push @exon, $1; # Get TIGR's model number $line=~m/\b(\d*\.e\d*)\b/; push @exon, $1; # Get start position $line=~m/pos:\s*\b(\d*)\b/; push @exon, $1; # Get end position $line=~m/\b(\d*)\s*$/; push @exon, $1; return (@exon); } ########## PARSE SUB EXONS ########################################## sub ParseSubExon { my $line = $_[0]; my @subexon=(); # Get parent exon's order number. $line=~m/EXON\s(\d+)/; push @subexon, $1; # Get the subexon type $line=~m/\d\s*:\s*(.*)\s*:\s*pos/; push @subexon, $1; # Get start position $line=~m/pos:\s*(\d*)\b/; push @subexon, $1; # Get end position $line=~m/\d-\s*(\d*)\s*$/; push @subexon, $1; return ( @subexon ); }