Mercurial > repos > devteam > compute_motifs_frequency
comparison compute_motifs_frequency.pl @ 0:d66f925bfbeb draft
Uploaded tool tarball.
| author | devteam |
|---|---|
| date | Tue, 20 Aug 2013 09:31:57 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:d66f925bfbeb |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 | |
| 3 # a program to compute the frequency of each motif at each window in both upstream and downstream sequences flanking indels | |
| 4 # in a chromosome/genome. | |
| 5 # the first input is a TABULAR format file containing the motif names and sequences, such that the file consists of two | |
| 6 # columns: the left column represents the motif names and the right column represents the motif sequence, one line per motif. | |
| 7 # the second input is a TABULAR format file containing the upstream and downstream sequences flanking indels, one line per indel. | |
| 8 # the fourth input is an integer number representing the window size according to which the upstream and downstream sequences | |
| 9 # flanking each indel will be divided. | |
| 10 # the first output is a TABULAR format file containing the windows into which both upstream and downstream sequences flanking | |
| 11 # indels are divided. | |
| 12 # the second output is a TABULAR format file containing the motifs and their corresponding frequencies at each window in both | |
| 13 # upstream and downstream sequences flanking indels, one line per motif. | |
| 14 | |
| 15 use strict; | |
| 16 use warnings; | |
| 17 | |
| 18 #variable to handle the falnking sequences information | |
| 19 my $sequence = ""; | |
| 20 my $upstreamFlankingSequence = ""; | |
| 21 my $downstreamFlankingSequence = ""; | |
| 22 my $discardedSequenceLength = 0; | |
| 23 my $lengthOfDownstreamFlankingSequenceAfterTrimming = 0; | |
| 24 | |
| 25 #variable to handle the window information | |
| 26 my $window = ""; | |
| 27 my $windowStartIndex = 0; | |
| 28 my $windowNumber = 0; | |
| 29 my $totalWindowsNumber = 0; | |
| 30 my $totalNumberOfWindowsInUpstreamSequence = 0; | |
| 31 my $totalNumberOfWindowsInDownstreamSequence = 0; | |
| 32 my $totalWindowsNumberInBothFlankingSequences = 0; | |
| 33 my $totalWindowsNumberInMotifCountersTwoDimArray = 0; | |
| 34 my $upstreamAndDownstreamFlankingSequencesWindows = ""; | |
| 35 | |
| 36 #variable to handle the motif information | |
| 37 my $motif = ""; | |
| 38 my $motifSequence = ""; | |
| 39 my $motifNumber = 0; | |
| 40 my $totalMotifsNumber = 0; | |
| 41 | |
| 42 #arrays to sotre window and motif data | |
| 43 my @windowsArray = (); | |
| 44 my @motifNamesArray = (); | |
| 45 my @motifSequencesArray = (); | |
| 46 my @motifCountersTwoDimArray = (); | |
| 47 | |
| 48 #variables to store line counter values | |
| 49 my $lineCounter1 = 0; | |
| 50 my $lineCounter2 = 0; | |
| 51 | |
| 52 # check to make sure having correct files | |
| 53 my $usage = "usage: compute_motifs_frequency.pl [TABULAR.in] [TABULAR.in] [windowSize] [TABULAR.out] [TABULAR.out]\n"; | |
| 54 die $usage unless @ARGV == 5; | |
| 55 | |
| 56 #get the input and output arguments | |
| 57 my $motifsInputFile = $ARGV[0]; | |
| 58 my $indelFlankingSequencesInputFile = $ARGV[1]; | |
| 59 my $windowSize = $ARGV[2]; | |
| 60 my $indelFlankingSequencesWindowsOutputFile = $ARGV[3]; | |
| 61 my $motifFrequenciesOutputFile = $ARGV[4]; | |
| 62 | |
| 63 #open the input and output files | |
| 64 open (INPUT1, "<", $motifsInputFile) || die("Could not open file $motifsInputFile \n"); | |
| 65 open (INPUT2, "<", $indelFlankingSequencesInputFile) || die("Could not open file $indelFlankingSequencesInputFile \n"); | |
| 66 open (OUTPUT1, ">", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n"); | |
| 67 open (OUTPUT2, ">", $motifFrequenciesOutputFile) || die("Could not open file $motifFrequenciesOutputFile \n"); | |
| 68 | |
| 69 #store the motifs input file in the array @motifsData | |
| 70 my @motifsData = <INPUT1>; | |
| 71 | |
| 72 #iterated through the motifs (lines) of the motifs input file | |
| 73 foreach $motif (@motifsData){ | |
| 74 chomp ($motif); | |
| 75 #print ($motif . "\n"); | |
| 76 | |
| 77 #split the motif data into its name and its sequence | |
| 78 my @motifNameAndSequenceArray = split(/\t/, $motif); | |
| 79 | |
| 80 #store the name of the motif into the array @motifNamesArray | |
| 81 push @motifNamesArray, $motifNameAndSequenceArray[0]; | |
| 82 | |
| 83 #store the sequence of the motif into the array @motifSequencesArray | |
| 84 push @motifSequencesArray, $motifNameAndSequenceArray[1]; | |
| 85 } | |
| 86 | |
| 87 #compute the size of the motif names array | |
| 88 $totalMotifsNumber = @motifNamesArray; | |
| 89 | |
| 90 #store the input file in the array @sequencesData | |
| 91 my @sequencesData = <INPUT2>; | |
| 92 | |
| 93 #iterated through the sequences of the second input file in order to create windwos file | |
| 94 foreach $sequence (@sequencesData){ | |
| 95 chomp ($sequence); | |
| 96 $lineCounter1++; | |
| 97 | |
| 98 my @indelAndSequenceArray = split(/\t/, $sequence); | |
| 99 | |
| 100 #get the upstream falnking sequence | |
| 101 $upstreamFlankingSequence = $indelAndSequenceArray[3]; | |
| 102 | |
| 103 #if the window size is 0, then the whole upstream will be one window only | |
| 104 if ($windowSize == 0){ | |
| 105 $totalNumberOfWindowsInUpstreamSequence = 1; | |
| 106 $windowSize = length ($upstreamFlankingSequence); | |
| 107 } | |
| 108 else{ | |
| 109 #compute the total number of windows into which the upstream flanking sequence will be divided | |
| 110 $totalNumberOfWindowsInUpstreamSequence = length ($upstreamFlankingSequence) / $windowSize; | |
| 111 | |
| 112 #compute the length of the subsequence to be discared from the upstream flanking sequence if any | |
| 113 $discardedSequenceLength = length ($upstreamFlankingSequence) % $windowSize; | |
| 114 | |
| 115 #check if the sequence could be split into windows of equal sizes | |
| 116 if ($discardedSequenceLength != 0) { | |
| 117 #trim the upstream flanking sequence | |
| 118 $upstreamFlankingSequence = substr($upstreamFlankingSequence, $discardedSequenceLength); | |
| 119 } | |
| 120 } | |
| 121 | |
| 122 #split the upstream flanking sequence into windows | |
| 123 for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInUpstreamSequence; $windowNumber++){ | |
| 124 $windowStartIndex = $windowNumber * $windowSize; | |
| 125 print OUTPUT1 (substr($upstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t"); | |
| 126 } | |
| 127 | |
| 128 #add a column representing the indel | |
| 129 print OUTPUT1 ("indel" . "\t"); | |
| 130 | |
| 131 #get the downstream falnking sequence | |
| 132 $downstreamFlankingSequence = $indelAndSequenceArray[4]; | |
| 133 | |
| 134 #if the window size is 0, then the whole upstream will be one window only | |
| 135 if ($windowSize == 0){ | |
| 136 $totalNumberOfWindowsInDownstreamSequence = 1; | |
| 137 $windowSize = length ($downstreamFlankingSequence); | |
| 138 } | |
| 139 else{ | |
| 140 #compute the total number of windows into which the downstream flanking sequence will be divided | |
| 141 $totalNumberOfWindowsInDownstreamSequence = length ($downstreamFlankingSequence) / $windowSize; | |
| 142 | |
| 143 #compute the length of the subsequence to be discared from the upstream flanking sequence if any | |
| 144 $discardedSequenceLength = length ($downstreamFlankingSequence) % $windowSize; | |
| 145 | |
| 146 #check if the sequence could be split into windows of equal sizes | |
| 147 if ($discardedSequenceLength != 0) { | |
| 148 #compute the length of the sequence to be discarded | |
| 149 $lengthOfDownstreamFlankingSequenceAfterTrimming = length ($downstreamFlankingSequence) - $discardedSequenceLength; | |
| 150 | |
| 151 #trim the downstream flanking sequence | |
| 152 $downstreamFlankingSequence = substr($downstreamFlankingSequence, 0, $lengthOfDownstreamFlankingSequenceAfterTrimming); | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 #split the downstream flanking sequence into windows | |
| 157 for ($windowNumber = 0; $windowNumber < $totalNumberOfWindowsInDownstreamSequence; $windowNumber++){ | |
| 158 $windowStartIndex = $windowNumber * $windowSize; | |
| 159 print OUTPUT1 (substr($downstreamFlankingSequence, $windowStartIndex, $windowSize) . "\t"); | |
| 160 } | |
| 161 | |
| 162 print OUTPUT1 ("\n"); | |
| 163 } | |
| 164 | |
| 165 #compute the total number of windows on both upstream and downstream sequences flanking the indel | |
| 166 $totalWindowsNumberInBothFlankingSequences = $totalNumberOfWindowsInUpstreamSequence + $totalNumberOfWindowsInDownstreamSequence; | |
| 167 | |
| 168 #add an additional cell to store the name of the motif and another one for the indel itself | |
| 169 $totalWindowsNumberInMotifCountersTwoDimArray = $totalWindowsNumberInBothFlankingSequences + 1 + 1; | |
| 170 | |
| 171 #initialize the two dimensional array $motifCountersTwoDimArray. the first column will be initialized with motif names | |
| 172 for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ | |
| 173 | |
| 174 for ($windowNumber = 0; $windowNumber < $totalWindowsNumberInMotifCountersTwoDimArray; $windowNumber++){ | |
| 175 | |
| 176 if ($windowNumber == 0){ | |
| 177 $motifCountersTwoDimArray [$motifNumber] [0] = $motifNamesArray[$motifNumber]; | |
| 178 } | |
| 179 elsif ($windowNumber == $totalNumberOfWindowsInUpstreamSequence + 1){ | |
| 180 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = "indel"; | |
| 181 } | |
| 182 else{ | |
| 183 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] = 0; | |
| 184 } | |
| 185 } | |
| 186 } | |
| 187 | |
| 188 close(OUTPUT1); | |
| 189 | |
| 190 #open the file the contains the windows of the upstream and downstream flanking sequences, which is actually the first output file | |
| 191 open (INPUT3, "<", $indelFlankingSequencesWindowsOutputFile) || die("Could not open file $indelFlankingSequencesWindowsOutputFile \n"); | |
| 192 | |
| 193 #store the first output file containing the windows of both upstream and downstream flanking sequences in the array @windowsData | |
| 194 my @windowsData = <INPUT3>; | |
| 195 | |
| 196 #iterated through the lines of the first output file. Each line represents | |
| 197 #the windows of the upstream and downstream flanking sequences of an indel | |
| 198 foreach $upstreamAndDownstreamFlankingSequencesWindows (@windowsData){ | |
| 199 | |
| 200 chomp ($upstreamAndDownstreamFlankingSequencesWindows); | |
| 201 $lineCounter2++; | |
| 202 | |
| 203 #split both upstream and downstream flanking sequences into their windows | |
| 204 my @windowsArray = split(/\t/, $upstreamAndDownstreamFlankingSequencesWindows); | |
| 205 | |
| 206 $totalWindowsNumber = @windowsArray; | |
| 207 | |
| 208 #iterate through the windows to search for matched motifs and increment their corresponding counters accordingly | |
| 209 WINDOWS: | |
| 210 for ($windowNumber = 0; $windowNumber < $totalWindowsNumber; $windowNumber++){ | |
| 211 | |
| 212 #get the window | |
| 213 $window = $windowsArray[$windowNumber]; | |
| 214 | |
| 215 #if the window is the one that contains the indel, then skip the indel window | |
| 216 if ($window eq "indel") { | |
| 217 next WINDOWS; | |
| 218 } | |
| 219 else{ #iterated through the motif sequences to check their occurrences in the winodw | |
| 220 #and increment their corresponding counters accordingly | |
| 221 | |
| 222 for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ | |
| 223 #get the motif sequence | |
| 224 $motifSequence = $motifSequencesArray[$motifNumber]; | |
| 225 | |
| 226 #if the motif is found in the window, then increment its corresponding counter | |
| 227 if ($window =~ m/$motifSequence/i){ | |
| 228 $motifCountersTwoDimArray [$motifNumber] [$windowNumber + 1]++; | |
| 229 } | |
| 230 } | |
| 231 } | |
| 232 } | |
| 233 } | |
| 234 | |
| 235 #store the motif counters values in the second output file | |
| 236 for ($motifNumber = 0; $motifNumber < $totalMotifsNumber; $motifNumber++){ | |
| 237 | |
| 238 for ($windowNumber = 0; $windowNumber <= $totalWindowsNumber; $windowNumber++){ | |
| 239 | |
| 240 print OUTPUT2 $motifCountersTwoDimArray [$motifNumber] [$windowNumber] . "\t"; | |
| 241 #print ($motifCountersTwoDimArray [$motifNumber] [$windowNumber] . " "); | |
| 242 } | |
| 243 print OUTPUT2 "\n"; | |
| 244 #print ("\n"); | |
| 245 } | |
| 246 | |
| 247 #close the input and output files | |
| 248 close(OUTPUT2); | |
| 249 close(OUTPUT1); | |
| 250 close(INPUT3); | |
| 251 close(INPUT2); | |
| 252 close(INPUT1); |
