bioinformatics has asked for the wisdom of the Perl Monks concerning the following question:
Main data file:#!/usr/bin/perl -w use strict; our (@window, @probe)=(); our @main = &open_file_main(); our @annot = &open_file_annot(); &outfile; # loop through the main data file OLC: foreach my $md (@main) { # remove newlines chomp $md; # pull out chromosome #, window start, end my ($main_chrom, $winl, $winr) = split(/\t/, $md); # put the window start, end into array for further processing @window = ($winl, $winr); # loop through the "annotation" file ILC: foreach my $ad (@annot) { # pull out the chromosome #, window start, end my ($an_chrom, undef, undef, $prol, $pror, undef, undef, undef +, $mess) = split(/\t/, $ad); # make sure the chromosomes match to save processor time, skip + if no match if ($main_chrom ne $an_chrom) {next ILC;} # get the gene name fromt the $mess variable my (undef, undef, $name) = split(/\;/, $mess); # load the window start sites for the individual probes @probe = ($prol, $pror); # call the range_finding sub to look for matches my $return = range_find(); if ($return eq 1) { # upon matching, print out the name of the gene along with + the original values # and print OUTPUT "$name\t $md\n"; next OLC; } else {next ILC;} } } close OUTPUT; exit; sub open_file_main { # open the file, pull in the data print " What is the name of the ChIPOTle file\?"; chomp (my $file = <STDIN>); open (FILE, $file) || die "Cannot open $!"; my @data = <FILE>; close FILE; return @data; } sub open_file_annot { # open the file, pull in the data print " What is the name of the annotation file\?"; chomp (my $file = <STDIN>); open (FILE, $file) || die "Cannot open $!"; my @data = <FILE>; close FILE; return @data; } sub outfile { # open the outputfile open (OUTPUT, ">output.txt")|| die "Cannot open $!"; } sub calc_range { # simple sub to create the full window from the start + and end sites my @peaks = @_; my @peak_range = ($peaks[0] .. $peaks[1]); return @peak_range; } sub range_find { # loop in loop to look for ANY overlap of the values; + will be a true/false return my @range1 = &calc_range(@window); my @range2 = &calc_range(@probe); my $test = pop @range2; my $test2 = pop @range1; OLC2: foreach my $value1 (@range1) { # look to see if the windows don't overlap if ($test lt $window[0]) {return 0;} # look to see if the windows don't overlap elsif ($test2 lt $probe[0]) {return 0;} ILC2: foreach my $value2 (@range2) { if ( $value1 eq $value2) {return 1;} else {next ILC2;} } } return 0; }
Annotation Data:chr10 726178 726428 1.121753867297440e-012 1.1607063976283 +87e-015 6 4.81000 chr10 4922028 4922428 2.163402534569952e-012 7.06701849934 +5696e-014 9 4.43000 chr10 5126478 5127178 8.348017333255820e-013 8.24324954124 +5406e-021 15 4.42000 chr10 14649778 14650028 2.090598548899472e-013 6.000728326 +245207e-018 6 5.04000 chr10 14651328 14651428 2.915017653920210e-012 1.890051416 +361198e-014 3 4.20000
chr1 NGS primary_transcript 4224 7502 -1 - . I +D=00003; accession=BC063682; Name=FLJ25222; ncbi_gene_id=374666; syno +nyms=-; description=CXYorf1-related protein; url=http://www.ncbi.nlm. +nih.gov/entrez/query.fcgi?db%3Dnucleotide%26cmd%3Dsearch%26term%3DBC0 +63682 chr1 NGS transcription_start_site 7502 7502 -1 - +. Parent=00003; accession=BC063682; Name=FLJ25222; ncbi_gene_id=37 +4666; synonyms=-; description=CXYorf1-related protein chr1 NGS primary_transcript 4268 7438 -2 - . I +D=00005; accession=BC073913; Name=MGC52000; ncbi_gene_id=375260; syno +nyms=CXYorf1|MGC104889|MGC111476|MGC117230|MGC90409; description=CXYo +rf1-related protein; url=http://www.ncbi.nlm.nih.gov/entrez/query.fcg +i?db%3Dnucleotide%26cmd%3Dsearch%26term%3DBC073913 chr1 NGS transcription_start_site 7438 7438 -2 - +. Parent=00005; accession=BC073913; Name=MGC52000; ncbi_gene_id=37 +5260; synonyms=CXYorf1|MGC104889|MGC111476|MGC117230|MGC90409; descri +ption=CXYorf1-related protein chr1 NGS primary_transcript 4268 14754 -3 - . +ID=00004; accession=BC048328; Name=MGC52000; ncbi_gene_id=375260; syn +onyms=CXYorf1|MGC104889|MGC111476|MGC117230|MGC90409; description=CXY +orf1-related protein; url=http://www.ncbi.nlm.nih.gov/entrez/query.fc +gi?db%3Dnucleotide%26cmd%3Dsearch%26term%3DBC048328 chr1 NGS transcription_start_site 14754 14754 -3 - + . Parent=00004; accession=BC048328; Name=MGC52000; ncbi_gene_id= +375260; synonyms=CXYorf1|MGC104889|MGC111476|MGC117230|MGC90409; desc +ription=CXYorf1-related protein chr1 NGS primary_transcript 4268 19697 -4 - . +ID=00006; accession=BC110996; Name=MGC52000; ncbi_gene_id=375260; syn +onyms=CXYorf1|MGC104889|MGC111476|MGC117230|MGC90409; description=CXY +orf1-related protein; url=http://www.ncbi.nlm.nih.gov/entrez/query.fc +gi?db%3Dnucleotide%26cmd%3Dsearch%26term%3DBC110996
|
---|
Replies are listed 'Best First'. | |
---|---|
Re: Faulty Control Structures?
by Narveson (Chaplain) on Jan 28, 2008 at 20:50 UTC | |
by bioinformatics (Friar) on Jan 28, 2008 at 23:22 UTC | |
by Narveson (Chaplain) on Jan 29, 2008 at 04:55 UTC | |
Re: Faulty Control Structures?
by Errto (Vicar) on Jan 28, 2008 at 20:33 UTC | |
by bioinformatics (Friar) on Jan 28, 2008 at 20:46 UTC | |
Re: Faulty Control Structures?
by BrowserUk (Pope) on Jan 29, 2008 at 06:49 UTC | |
Re: Faulty Control Structures?
by dragonchild (Archbishop) on Jan 28, 2008 at 22:46 UTC | |
by bioinformatics (Friar) on Jan 28, 2008 at 23:31 UTC | |
Re: Faulty Control Structures?
by apl (Monsignor) on Jan 28, 2008 at 20:31 UTC | |
by bioinformatics (Friar) on Jan 28, 2008 at 20:43 UTC | |
Re: Faulty Control Structures?
by GrandFather (Saint) on Jan 29, 2008 at 05:00 UTC |