#!/usr/bin/perl use strict; use warnings; my (@origin, $y); my $N_Sequences = 100; my @Alphabet = split(//,'ACGT'); my $P_Consensus = 0.85; # This is the probability of dominant letter # ====== Globals ========================== my @Probabilities; # Stores the probability of each character # ====== Program ========================== open (ORIGIN, "< origin8.txt"); # This file holds 200 sequences used for motif template chomp (@origin = ); close ORIGIN; for ($y=0; $y<=$#origin; $y++) { my @Motif = split(//,'$origin[$y]'); # This is a loop to get the motif template from origin8 open (OUT_NORM, ">short_sequences8_[$y].txt") or die "Unable to open file :$!"; for (my $i=0; $i < $N_Sequences; $i++) { for (my $j=0; $j < scalar(@Motif); $j++) { loadConsensusCharacter($Motif[$j]); addNoiseToDistribution(); convertToIntervals(); print OUT_NORM (getRandomCharacter(rand(1.0))); } print OUT_NORM "\n"; make_files(); } } exit(); # ====== Subroutines ======================= # sub loadConsensusCharacter { my ($char) = @_; my $Found = 'FALSE'; for (my $i=0; $i < scalar(@Alphabet); $i++) { if ( $char eq $Alphabet[$i]) { $Probabilities[$i] = 1.0; $Found = 'TRUE'; } else { $Probabilities[$i] = 0.0; } } if ($Found eq 'FALSE') { die("Panic: Motif-Character\"$char\" was not found in Alphabet. Aborting.\n"); } return(); } # ========================================== sub addNoiseToDistribution { my $P_NonConsensus = ( 1.0-$P_Consensus) / (scalar(@Alphabet) - 1); for (my $i=0; $i < scalar(@Probabilities); $i++) { if ( $Probabilities[$i] == 1.0 ) { $Probabilities[$i] = $P_Consensus; } else { $Probabilities[$i] = $P_NonConsensus; } } return(); } # ========================================== sub convertToIntervals { my $Sum = 0; for (my $i=1; $i < scalar(@Probabilities); $i++) { $Probabilities[$i] += $Probabilities[$i-1]; } return(); } # ========================================== sub getRandomCharacter { my ($RandomNumber) = @_; my $i=0; for ($i=0; $i < scalar(@Probabilities); $i++) { if ($Probabilities[$i] > $RandomNumber) { last; } } return($Alphabet[$i]); } # ========================================== sub make_files { my (@short, @long,$x,$r, $output_norm); open (SHORT, "< short_sequences8_[$y].txt"); chomp (@short = ); close SHORT; open (LONG, "< long_sequences.txt"); chomp (@long = ); close LONG; open (OUT_INITIAL, "> output8_[$y]1.txt"); open (OUT_REPLACED, "> output8_[$y]2.txt"); for ($x=0; $x<=$#short; $x++) { $r=2; print OUT_INITIAL ">SeqName$x\n$long[$x]\n"; print OUT_REPLACED "SeqName$x\n" . substr($long[$x], $r, length $short[$x]) . "\n";} close OUT_INITIAL; close OUT_REPLACED; } #### TTTATAAT TGTCAATG CGTTGATG CGTCCTAG GGCTTCCA ATTAGCCT GTCCTGAT TGTAAATC CGCTTATT TTGACATA CCTGATAT ATGAATCG CGTCCGAT TGGCCCAT ATCCTGAT TGCCCATT CCCTAACT AAAAAAAA TTTTTTTT CCCCCCCC GGGGGGGG AAAAAAAT AAAAAAAG AAAAAAAC AAAAAACC AAAAAATT AAAAAAGG AAAAAACT AAAAAACG AAAAAACA AAAAACAA AAAACAAA AAACAAAA AACAAAAA ACAAAAAA CAAAAAAA AAAAAATA AAAAATAA AAAATAAA AAATAAAA AATAAAAA ATAAAAAA TAAAAAAA AAAAAAGA AAAAAGAA AAAAGAAA AAAGAAAA AAGAAAAA AGAAAAAA GAAAAAAA AAAACCAA AACCAAAA CCAAAAAA AAAATTAA AATTAAAA TTAAAAAA AAAAACCC AAAACCCA AAACCCAA AACCCAAA ACCCAAAA CCCAAAAA AAAAATTT AAAATTTA AAATTTAA AATTTAAA ATTTAAAA TTTAAAAA AAAAAGGG AAAAGGGA AAAGGGAA AAGGGAAA AGGGAAAA GGGAAAAA AAAACCCC AAACCCCA AACCCCAA ACCCCAAA CCCCAAAA AAAATTTT AAATTTTA AATTTTAA ATTTTAAA TTTTAAAA AAAAGGGG AAAGGGGA AAGGGGAA AGGGGAAA GGGGAAAA AAACCCCC AACCCCCA ACCCCCAA CCCCCAAA AAATTTTT AATTTTTA ATTTTTAA TTTTTAAA AAAGGGGG AAGGGGGA AGGGGGAA GGGGGAAA AAGGGGGG AGGGGGGA GGGGGGAA AACCCCCC ACCCCCCA CCCCCCAA AATTTTTT ATTTTTTA TTTTTTAA ATTTTTTT TTTTTTTA ACCCCCCC CCCCCCCA AGGGGGGG GGGGGGGA ATTTTTTT TTTTTTTA ATAAAATA AATAAATA AAATAATA AAAATATA ACAAAACA AACAAACA AAACAACA AAAACACA AGAAAAGA AAGAAAGA AAAGAAGA AAAAGAGA ATAAAAGA ATAAAACA AGAAAATA AGAAAACA ACAAAAGA ACAAAATA ATTAAATA AATTAATA AAATTATA ACCAAACA AACCAACA AAACCACA AGGAAAGA AAGGAAGA AAAGGAGA ATTTAATA AATTTATA ACCCAACA AACCCACA AGGGAAGA AAGGGAGA ATTTAACA ATTTAAGA AATTTACA AATTTAGA ACCCAATA ACCCAAGA AACCCATA AACCCAGA AGGGAACA AGGGAATA AAGGGATA AAGGGACA TTGGGACA CCGGGACA AGAAGGGA TGCCCATA TAAAAAAT TGCCTATA CCGTAGTC ACTTGACT CTGATCCC TGTGACTA CCTGATCC CCTGAACC TGATCACG GGGTAACC CTTTTGAA TTGTATGA CCTGATAA CTGGTTAG CCCCGACC TTGGGGAC GGTTTGAC GCTTAGAC GTTACACC TTGTACCA TGGTACCA CCGTACAT CCCTTGCC GTGTTGGT ATCGATCG ACGTACGT TCAGTCAG GCTATACG GTCCATAC CCGTCCGT ATATATCC GTGTCCCC