#!/usr/bin/perl -w

################################################################
# Programmer:   biograd
# Program:      TIGRFillTables.plx
# Date begun:   8/17/01
# Description:  This is a perl script made to automatically take
#               information from previously generated files
#               containing _Arabidopsis thaliana_ genomic data
#               from the TIGR database and place it into a local
#               mysql database. Since it is intended for automation,
#               a user name and password are hard-coded.
# Input:        -Files formatted by Dr. Johns, such as
#                /arabidopsis/bac_info_08102001.txt  and
#                /arabidopsis/gene_info_08102001.txt
# Effect:       -Successful: information will be parsed from the
#                files and loaded into correct database areas.
#                
# Outline:      I.   CONNECT
#               II.  ACHROMS
#                    A. Prepare statements to be used
#                    B. Loop through each bacfile record
#                       1. CleanData()
#                       2. ParseAChroms()
#                       3. Check for matching record in db
#                          a. If match, then update record
#                          b. If no match, insert record
#               III. GENES, RELATEGAC, EXONS
#                    A. Prepare statements for filling:
#                       1. Genes
#                       2. RelateGAC (genes to artificial chroms)
#                       3. Exons
#                    B. Loop through each genefile record
#                       1. CleanData()
#                       2. ParseGenes()
#                       3. Check for matching gene record in db
#                          a. If match, then update gene  
#                          b. If no match, insert gene info
#                       4. Check if an AC is listed in the gene info
#                          a. If yes, check relateGAC table for an
#                             exact match of gene with this AC.
#                             1) If match, increment count by 1
#                             2) If no match, insert relation into
#                                relateGAC table.
#                       5. ParseExon()
#                       6. Loop through dereferenced exon arrays
#                         a.  Check for matching exon record in db
#                              1) If match, update exon
#                              2) If no match, insert exon info
#			  b.  Check if sub_exon data is there
#			       1) If match, update sub-exon
#			       2) If no match, insert sub_exon
#               IV.  DISCONNECT
#                    A. Finish statements
#                    B. Disconnect from database
#               V.   SUBROUTINE DEFINITIONS
#                    A. CleanData()
#                       1. Removes empty lines and newlines
#                       2. Returns array of good lines
#                    B. ParseAChroms()
#                       1. Searches for information
#                       2. Plugs information into an array
#                       3. Returns array
#                    C. ParseGenes()
#                       1. Searches for information
#                       2. Plugs information into an array
#                       3. Returns array
#                    D. ParseExons()
#                       1. Searches for each exon's info
#                       2. Forms array for each exon
#                       3. Returns array of array references
################################################################

use strict;
use DBI;
my ($dbh, $sth_genes, $sth_achroms, $sth_rgac, $record,
    $sth_find_match, $sth_getcount, @count, $geneid,
    $current_bac, $sth_select_gene_id, $sth_updatecount,
    $sth_match_gene, $sth_update_genes, @record, $line,   
    $sth_match_achrom, $sth_update_achroms,
    @newrecord, @gene, @achrom, @rgac, @temp, $temp, $n,
    @exondata, $sth_find_exon, $sth_find_subexon,  
    $sth_insert_exon, $sth_insert_subexon,
    $sth_update_exon, $sth_update_subexon);

$n = 0;

### CONNECT ######################################################


  $dbh = DBI->connect( "DBI:mysql:arabidopsis", "", "",
         { PrintError => 0, RaiseError => 1})
         or die "\nCouldn't connect to database: $DBI::errstr\n";


### PREPARE SQL FOR ACHROMS TABLE ####################################

  # This statement is sent to the database to be parsed and planned
  # before any data is touched. This move saves time in the long run.
  # Question marks are where data will be "filled in" or bound during
  # each execution of this statement in the loop.
  $sth_achroms = $dbh -> prepare( "
                INSERT INTO achroms (type, ac_id, length,
                chr_num, chr_strt, chr_end, bac_strt, bac_end,
                orient) VALUES ( ?, ?, ?, ?, ?, ?, ?, ?,
                ?) ");

  $sth_match_achrom = $dbh -> prepare("
                SELECT ac_id
                FROM achroms
                WHERE ac_id = ? ");

  # This statement is run incase an artificial chromosome is already  
  # listed in the database but could have new data to be added
  # or changed.
  $sth_update_achroms = $dbh -> prepare("
                UPDATE achroms
                SET type = ?, length = ?, chr_num = ?, chr_strt = ?,
                chr_end = ?, bac_strt = ?, bac_end = ?, orient = ?
                WHERE ac_id = ? ");


### OPEN ACHROMS DATAFILE ###########################################

  # These were my test files. "bac.ht" is a smaller file that holds
  # the head +130 and tail +130 of the "bac_info_08102001.txt" file.
#  open (INFILE, "bac_info_08102001.txt") or die "Couldn't open bacfile:$1";
  open (INFILE, "bac.ht") or die "Couldn't open bacfile: $1";

### FILL ACHROMS TABLE ###########################################    

  # Take in whole records separated by five dashes instead of a
  # newline character.
  $/ = "-----";
  while($record = <INFILE>)
  {
    # Print out a counter to show how far the filling has progressed.
#    print "$n\n";
#    $n++;

    @newrecord = ();
    # Send the record to have any blank lines removed.
    @newrecord = CleanData( $record );

    # If there is an array returned from CleanData(), parse  
    # it. This removes the chance of empty records causing
    # extra null records in the database.
    @achrom = ();
    if (@newrecord)
    {
      # Send the clean record to be parsed for Artificial
      # Chromosome data only.
      @achrom = ParseAChroms( \@newrecord );

      $sth_match_achrom -> execute( $achrom[1] );
      $temp = $sth_match_achrom -> fetchrow_array();

      if ( $temp )                      # if there's a match, update data
      {
        $sth_update_achroms -> execute( $achrom[0], $achrom[2],
                $achrom[3], $achrom[4], $achrom[5], $achrom[6],
                $achrom[7], $achrom[8], $achrom[1] );
      }
      else                              # if no match, insert data
      {
        # Bind (fill in) the question marks in the prepared insert
        # statement. The bind data is from the returned @achrom elements.
        $sth_achroms -> execute( $achrom[0],$achrom[1],$achrom[2],
                     $achrom[3], $achrom[4], $achrom[5], $achrom[6],
                     $achrom[7], $achrom[8]);
      }
    }
  }
  close INFILE;

### END ACHROMS TABLE FILL #########################################

### PREPARE SQL FOR GENES, RELATEGAC, AND EXONS TABLES ################

  # Question marks are where data will be "filled in" or bound during
  # each execution of these statements in the loop.
  # This statement will insert the bind data into the genes table
  # in the fields listed.

  # GENES This statement will try to select the present tigr chromosomal
  # reference number for the current gene. If it is present, then the
  # UPDATE statment needs to be used instead of the INSERT.
  $sth_match_gene = $dbh -> prepare("
                SELECT gene_id
                FROM genes
                WHERE tigr_id = ? ");

  # GENES This statement puts new data into a new record.
  $sth_genes = $dbh -> prepare( "
                INSERT INTO genes
                (chrom_num, tigr_id, bac_locus, pseudogene,
                chrom_locus, locus_start, locus_end, orientation,
                read_frame, exon_count, product, notes, ac_id)
                VALUES ( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )" );

  # GENES This is the update statement to be used when a gene is already
  # listed but new data about it might be provided.
  $sth_update_genes = $dbh -> prepare("
                UPDATE IGNORE genes
                SET chrom_num = ?, bac_locus = ?, pseudogene = ?,
                chrom_locus = ?, locus_start = ?, locus_end = ?,
                orientation = ?, read_frame = ?, exon_count = ?,
                product = ?, notes = ?, ac_id = ?
                WHERE tigr_id = ? ");

  # GENES Just to make things easier, find the unique gene number that
  # goes to the current tigr gene reference.
  $sth_select_gene_id = $dbh -> prepare( "
                SELECT gene_id
                FROM genes
                WHERE chrom_locus = ? ");

  # RELATEGAC This statement will find whether a pre-existing relationship
  # of the kind specified by the bound data is there. The success
  # can be tested.
  $sth_find_match = $dbh -> prepare("
                SELECT gene_id, ac_id FROM relateGAC
                WHERE gene_id = ?  AND ac_id = ? ");

  # RELATEGAC This statement will insert related unique gene numbers given
  # by the database and the names of artificial chromosomes.
  # The "count" field is to be increased any time a duplicate
  # relationship is found. For the insert function, it will
  # always be "1". It will be increased if necessary with an update
  # statement within the loop.
  $sth_rgac = $dbh -> prepare( "
                INSERT INTO relateGAC (gene_id, ac_id, count)
                VALUES ( ?, ?, 1 ) ");

  # RELATEGAC This statement will select the count data from the
  # genes/artificial chromosome relational table ("relateGAC")   
  # where the unique gene id number is the same as that in
  # the requested bound data.
  $sth_getcount = $dbh -> prepare("
                SELECT count FROM relateGAC
                WHERE gene_id = ? ");

  # RELATEGAC This statement will allow the count field to be updated
  # to a given bound integer when the unique gene id number  
  # matches the given bound data.
  $sth_updatecount = $dbh -> prepare("
                UPDATE relateGAC
                SET count = ?   
                WHERE gene_id = ? ");

  # EXONS

  $sth_find_exon = $dbh -> prepare("
                SELECT gene_id, exon_num
                FROM exons   
                WHERE gene_id = ? AND exon_num = ? ");

  $sth_find_subexon = $dbh -> prepare("
                SELECT sub_type, sub_locus_start
                FROM subexons
                WHERE gene_id = ? AND exon_num = ?
                                  AND sub_type = ? ");

  $sth_update_exon = $dbh ->prepare("
                UPDATE IGNORE exons
                SET type = ?, tigr_id = ?, locus_start = ?,
                    locus_end = ?
                WHERE gene_id = ? AND exon_num = ?");

  $sth_update_subexon = $dbh -> prepare("
                UPDATE IGNORE subexons  
                SET sub_locus_start = ?,
                    sub_locus_end = ?
                WHERE gene_id = ? AND exon_num = ?
                                  AND sub_type = ? ");

  $sth_insert_exon = $dbh -> prepare("
                INSERT INTO exons
                (gene_id, exon_num, type, tigr_id,
                 locus_start, locus_end)
                VALUES ( ?,?,?,?,?,? ) ");

  $sth_insert_subexon = $dbh -> prepare("
                INSERT INTO subexons
                (gene_id, exon_num, sub_type,
                 sub_locus_start, sub_locus_end)
                VALUES ( ?,?,?,?,? ) "); 


### OPEN GENE DATAFILE  #########################################

  # These are the test chromosome data files. They have the same
  # format as the bac files did above.
#  open (INFILE, "gene_info_08102001.txt") or die "Couldn't open chrfile:$1";
    open (INFILE, "genes.ht") or die "Couldn't open infile: $1";

### FILL GENES TABLE #########################################

  # Reset the counter to zero so we can see how many gene records
  # are processed. Separator is set to five dashes.
  $n = 0;
  $/ = "-----";
  while($record = <INFILE>)
  {
    print "\nz$n\n";
    $n++;
    @newrecord = ();
    @newrecord = CleanData( $record );  # Remove newlines from data.
    @gene=();

    # If there is an array returned from CleanData(), parse
    # it. This removes the chance of empty records causing
    # extra null records in the database.
    if ( @newrecord )
    {
      @gene = ParseGenes( \@newrecord );

      $sth_match_gene -> execute( $gene[1] );
      $temp = $sth_match_gene -> fetchrow_array();

      if ( $temp )
      {
        # Execute the update statement, since this tigr
        # reference number appears to be listed already.
        # The bind data are the returned elements from the
        # ParseGenes() subroutine. The ref# is not updated.
        $sth_update_genes -> execute ( $gene[0], $gene[2],
                   $gene[3], $gene[4], $gene[5], $gene[6], 
                   $gene[7], $gene[8], $gene[9], $gene[10],
                   $gene[11], $gene[12], $gene[1] );
      }
      else
      {
        # Execute the previously made statement to insert the data
        # from this record into the genes table.
        $sth_genes -> execute( $gene[0], $gene[1], $gene[2],
                     $gene[3], $gene[4], $gene[5], $gene[6],
                     $gene[7], $gene[8], $gene[9], $gene[10],
                     $gene[11], $gene[12] );
      }
    }

### END GENES TABLE FILL #########################################

### FILL RELATEGAC TABLE #########################################

    # If there is a bac associated with this gene, insert the
    # association into the relateGAC table. The gene[12] element
    # is the bac name from the data.
    if( $gene[12] )
    {
      $current_bac = $gene[12];

      # Find the unique gene id number for the current tigr 
      # gene by matching the locus name (element 4 of the gene
      # array from parsing). Usually, this would be the last
      # entered gene, but there may be duplicates. This method
      # should avoid duplication.
      $sth_select_gene_id -> execute ( $gene[4] );
      @temp = ();
      @temp = $sth_select_gene_id -> fetchrow_array();
      $geneid = $temp[0];

      # If this statement is able to execute, continue.
      if ($sth_find_match-> execute( $geneid, $current_bac ) )  
      {
        # Get the data from the matching row.
        $temp = 0;
        $temp = $sth_find_match -> fetchrow_array();

        # If any matching rows are found, update the count. 
        # Do not insert another row.
        if ( $temp )
        {
         @count = ();
          $sth_getcount -> execute( $geneid );
          @count = $sth_getcount -> fetchrow_array();
          $count[0]++;
          print "Gene id number is $geneid.\nCount is : $count[0]\n";
          $sth_updatecount -> execute ( $count[0], $geneid );
        }
        # If there are no matches (@temp is undefined), insert  
        # the new row, with a count of "1" automatically entered
        # from the prepare statement, and the last used unique
        # gene id number.
        else
        {
          $sth_rgac -> execute( $sth_genes->{mysql_insertid} ,
                                $current_bac );
        }
      }  
      else
      {
        print "\n\$sth_find_match was unable to execute,\n
               $current_bac from gene $geneid
               was not processed : $DBI::errstr\n";

      } # End if loop for insertion/count update routine

    } # End if loop for $gene[12] test (BAC name existance)

### END RELATEGAC TABLE FILL################################

### FILL EXONS TABLE########################################
    if ( @newrecord )
    {    
      foreach $line ( @newrecord )
      {
        @exondata=();
        next unless $line=~m/EXON\b/;

        if ( $line=~m/^\.{5}/ )         # line contains exon data
        {
          # Elements are ( exon_num, type, tigr_id, locus_start, locus_end )
          @exondata=ParseExon( $line );

          $sth_find_exon -> execute( $geneid, $exondata[0] );
          if ( $sth_find_exon -> fetchrow_array() )
          {
            $sth_update_exon -> execute( $exondata[1],$exondata[2],
                                         $exondata[3],$exondata[4],
                                         $geneid,     $exondata[0] );
#           print "Updated gene $geneid 's exon num   $exondata[0]\n";
          }
          else
          {
            $sth_insert_exon -> execute( $geneid, $exondata[0],
                                         $exondata[1], $exondata[2],
                                         $exondata[3], $exondata[4] );
#           print "Inserted gene $geneid 's exon num   $exondata[0]\n";
          }
        }
        else
        {
          # Elements will be ( exon_num, type, locus_start, locus_end )
          @exondata = ParseSubExon( $line );
          $sth_find_subexon -> execute( $geneid, $exondata[0], $exondata[1] );

          if ( $sth_find_subexon -> fetchrow_array() )
          {
            $sth_update_subexon -> execute( $exondata[2], $exondata[3],
                                            $geneid, $exondata[0],
                                            $exondata[1]);
#           print "Updated gene $geneid 's exon $exondata[0] 's subexon (type =$          }
          else
          {
            $sth_insert_subexon -> execute( $geneid,      $exondata[0],
                                            $exondata[1], $exondata[2],
                                            $exondata[3] );
#           print "Inserted gene $geneid 's exon $exondata[0] 's subexon (type $          }
        }
      } # End foreach block for @newrecord
    } # End exon handling
  } # End this record's entire while loop

  close INFILE;

### END EXONS TABLE FILL #########################################

### DISCONNECT NICELY ############################################

  # Finishing these statements is not absolutely necessary, but
  # it avoids an error. Sometimes the statements are not used.
  # If the statement isn't executed, then it's memory won't be
  # deallocated until the program ends ( which causes an error).
  $sth_match_achrom   -> finish();
  $sth_update_achroms -> finish();
  $sth_match_gene     -> finish();
  $sth_update_genes   -> finish();
  $sth_find_match     -> finish();
  $sth_getcount       -> finish();
  $sth_updatecount    -> finish();
  $sth_select_gene_id -> finish();
  $sth_find_exon      -> finish();
  $sth_find_subexon   -> finish();

  $dbh->disconnect
        or die "Couldn't disconnect from database: $DBI::errstr\n";
  exit;


### SUBROUTINES START HERE #######################################

sub CleanData
{
    my ($record, @cleanrecord, $line, @record);
    $record = $_[0];

    ### Separate the record by newlines.
    @record = split /\n/, $record;

    ### Remove empty lines if they exist.
    foreach $line ( @record )
    {
      chomp( $line );
      next if ( $line =~ m/^\s*$/ );
      push @cleanrecord, $line;
    }
    return ( @cleanrecord );
}

### PARSE ARTIFICIAL CHROMOSOMES ############################
sub ParseAChroms
{
  my (@achrom, $line, $type, $ac_id, $length, $chr_num,
      $chr_strt, $chr_end, $bac_strt, $bac_end, $orient);

  my $achromref = $_[0];
  foreach $line (@$achromref)
  {
    # Get type. Right now, all are BAC's.
    if ($line =~ m/\b((B|Y)AC)\b/)
    {
      $type = $1;
    }
    
    # Get reference number.
    if ($line =~ m/BAC:\s*(.*)\s*$/)
    {
      $ac_id = $1;
    }

    # Get artificial chromosome length
    if ($line =~ m/LENGTH:\s*(.*)\s*$/)
    {
      $length = $1;
    }

    # Get associated chromosome number
    if ($line =~ m/OSOME:\s*(.*)\s*$/)
    {
      $chr_num = $1;
    }

    # Get associated chromosome start base.
    if ($line =~ m/CHR_START:\s*(.*)\s*$/)
    {
      $chr_strt = $1;
    }

    # Get associated chromosome ending base.
    if ($line =~ m/CHR_END:\s*(.*)\s*$/)
    {
      $chr_end = $1;
    }

    # Get artificial chromosome start base.
    if ($line =~ m/BAC_START:\s*(.*)\s*$/)
    {
      $bac_strt = $1;
    }

    # Get artificial chromosome ending base.
    if ($line =~ m/BAC_END:\s*(.*)\s*$/)
    {
      $bac_end = $1;
    }

    # Get associate chromosome's orientation on the ac.
    if ($line =~ m/ORIENTATION:\s*(\w)\w*\s*$/)
    {
      $orient = $1;
    }
  }
  push @achrom, $type, $ac_id, $length, $chr_num,
      $chr_strt, $chr_end, $bac_strt, $bac_end, $orient;

  return @achrom;
}

########## PARSE GENES ##########################################
sub ParseGenes
{
  my ( $line, @gene, $chrome_num, $bac, $tigr_id,
       $bac_locus, $chromo_locus, $model_begin, $model_end,
       $orient, $read_frame, $num_exons, $prod, $notes, $pseudo);

  my $generef = $_[0];

  foreach $line (@$generef)
  {  
    # Get chromosome number
    if ($line =~ m/CHROMOSOME:\s*(\d*)\s*$/)
    {
      $chrome_num = $1;
    }

    # Get TIGR model's reference id
    if ($line =~ m/MODEL_NAME:\s*(.*)$/)
    {
      $tigr_id = $1;
    }

    # Get the locus on the bac where the gene was cloned.
    if ($line =~ m/BAC_LOCUS:\s*(.*)\s*$/)
    {
      $bac_locus = $1;
    }

    # Get full name
    if ($line =~ m/COMMON_NAME:\s*(.*)$/)
    {
      $prod = $1;
    }

    if ( $line =~ m/PSEUDOGENE:\s*(\w*)\s*$/)
    {
      $pseudo = $1;
    }

    # Get chromosome locus identifier
    if ($line =~ m/CHROMO_LOCUS:\s*(.*)\s*$/)
    {
      $chromo_locus = $1;
    }

    # Get starting place
    if ($line =~ m/MODEL_BEGIN:\s*(\d+)\s*$/)
    {
      $model_begin = $1;
    }

    # Get ending place
    if ($line =~ m/MODEL_END:\s*(\d+)\s*$/)
    {
      $model_end = $1;
    }

    # Get whether forward or reverse
    if ($line =~ m/ORIENTATION:\s*(\w)_(\d)\s*$/)
    {
      $orient = $1;
      $read_frame = $2;
    }

    # Get number of exons in the gene
    if ($line =~ m/NUM_EXONS:\s*(\d+)\s*$/)
    {
      $num_exons = $1;
    }

    # Get any notes attached to the gene
    if ($line =~ m/COMMENT:\s*(.*)$/)
    {
      $notes = $1;
    }

    # Get the bac where the gene was cloned.
    if ($line =~ m/BAC:\s*(.*)\s*$/)
    {
      $bac = $1;
    }
  }
  push @gene, $chrome_num, $tigr_id, $bac_locus, $pseudo, $chromo_locus,
       $model_begin, $model_end, $orient, $read_frame, $num_exons, $prod,
       $notes, $bac;
  return @gene;
}

########## PARSE EXONS ##########################################

sub ParseExon
{
  my $line = $_[0];
  my @exon=();

  # Get exon's order number.
  $line=~m/EXON\s(\d+)/;
  push @exon, $1;

  # Get the exon type from inside the parentheses
  $line=~m/\((.*)\)/;
  push @exon, $1;

  # Get TIGR's model number
  $line=~m/\b(\d*\.e\d*)\b/;
  push @exon, $1;

  # Get start position
  $line=~m/pos:\s*\b(\d*)\b/;
  push @exon, $1;

  # Get end position
  $line=~m/\b(\d*)\s*$/;

  push @exon, $1;

  return (@exon);
}

########## PARSE SUB EXONS ##########################################

sub ParseSubExon
{
  my $line = $_[0];
  my @subexon=();

  # Get parent exon's order number.
  $line=~m/EXON\s(\d+)/;
  push @subexon, $1;

  # Get the subexon type
  $line=~m/\d\s*:\s*(.*)\s*:\s*pos/;
  push @subexon, $1;
  # Get start position
  $line=~m/pos:\s*(\d*)\b/;
  push @subexon, $1;

  # Get end position
  $line=~m/\d-\s*(\d*)\s*$/;
  push @subexon, $1;

  return ( @subexon );

}