Mercurial > repos > pjbriggs > motif_tools
comparison TFBScluster_candidates.pl @ 3:b4e22d2d7fa7 draft
Add missing Perl scripts from previous uploads.
author | pjbriggs |
---|---|
date | Mon, 09 Apr 2018 04:45:06 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
2:96a4ee27b08a | 3:b4e22d2d7fa7 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # TFBScluster version 2.0 - cluster together TFBSs from distinct | |
4 # TFBSsearch generated libraries. | |
5 # | |
6 # (c) Ian Donaldson 2003 and Mike Chapman (TFBS overlap subs) | |
7 # | |
8 # This program is free software; you can redistribute it and/or | |
9 # modify it under the terms of the GNU General Public License | |
10 # as published by the Free Software Foundation; either version 2 | |
11 # of the License, or (at your option) any later version. | |
12 # | |
13 # This program is distributed in the hope that it will be useful, | |
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 # GNU General Public License for more details. | |
17 # | |
18 # You should have received a copy of the GNU General Public License | |
19 # along with this program; if not, write to the Free Software | |
20 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
21 | |
22 use strict; | |
23 use FileHandle; | |
24 #use integer; # Force floating point into integers | |
25 | |
26 $|=1; | |
27 | |
28 ########### | |
29 # Program to determine whether a TF site is in close proximity to | |
30 # one or more other TF sites. | |
31 # September 2003. Ian Donaldson. | |
32 # Revised 8 Sept. 2003 to not include the query TF site in the threshold. | |
33 # This is to allow one to determine whether a TF is near to another of the | |
34 # same type. | |
35 # Revised 9 Sept to alter the threshold size to only include the core of a | |
36 # pattern i.e. gata of nngatann. | |
37 # Revised 19 Sept to replace query and subject libraries with the statement | |
38 # of all interested libraries. | |
39 # Revised 22 Sept to scan output for overlapping patterns. | |
40 # NOTE: Any overlap in comparative record start with start pattern | |
41 # Revised 30 Sept to ignore duplication of TFBS in a group record caused by | |
42 # palindrome. By skipping if name and positions the same. | |
43 # Revised 6 Oct code for tree searching method to deal with overlapping TFBSs | |
44 ########### | |
45 | |
46 #### Command line usage | |
47 if(@ARGV != 7) | |
48 { | |
49 die ("USAGE: | |
50 $0 | |
51 TF libraries \(comma delimited NO SPACES\) | |
52 Number of flanking 'N's for subject files \(comma delimited NO SPACES\) | |
53 Minimum number of occurences \(comma delimited NO SPACES\) | |
54 TF IDs \(comma delimited NO SPACES\) | |
55 Single range value in bp \(+/-\) query start and end values | |
56 Include overlapping TFBSs \(include/exclude\) | |
57 Output file\n\n"); | |
58 } | |
59 | |
60 #### File handles | |
61 my $subject = new FileHandle; | |
62 my $combined = new FileHandle; | |
63 my $sorted = new FileHandle; | |
64 my $groups = new FileHandle; | |
65 my $filtered_groups = new FileHandle; | |
66 my $output = new FileHandle; | |
67 my $output2 = new FileHandle; | |
68 | |
69 #### Variables | |
70 my @subject_files = (); # Array containing the names of selected subject files | |
71 my @flanking_n = (); # Array containing the number of flanking 'n' for each pattern | |
72 my @min_occur = (); # Array containing the minimum occurences for the TF library | |
73 my @ids = (); # Array containing user defined IDs for the TF libraries | |
74 my @file_sizes = (); | |
75 my @sorted_file_sizes = (); | |
76 my $range = $ARGV[4]; | |
77 my $allow = $ARGV[5]; | |
78 my @regex_ids = (); | |
79 | |
80 ##################################################### | |
81 #### Deal with user arguments processed into an array | |
82 ##################################################### | |
83 | |
84 #### Convert TF file names string to an array | |
85 @subject_files = split(/,/, $ARGV[0]); | |
86 | |
87 #### Convert flanking 'N' numbers string into an array | |
88 @flanking_n = split(/,/, $ARGV[1]); | |
89 | |
90 #### Convert minimum occurences string into an array | |
91 @min_occur = split(/,/, $ARGV[2]); | |
92 | |
93 #### Convert minimum occurences string into an array | |
94 @ids = split(/,/, $ARGV[3]); | |
95 | |
96 foreach my $id_string (@ids) { | |
97 if($id_string !~ /^[\w\d_,]+$/) { | |
98 die("A non-letter/number character was detected in your label string!\n"); | |
99 } | |
100 } | |
101 | |
102 #### Record how large they are | |
103 for(my $i=0; $i < $#subject_files + 1; $i++) { | |
104 my $size = (-s $subject_files[$i]); # -s performed on an unopened file! | |
105 | |
106 push(@file_sizes, ["$subject_files[$i]", "$size", "$flanking_n[$i]", "$min_occur[$i]", "$ids[$i]"]); | |
107 } | |
108 | |
109 #### Sort file sizes array by file sizes | |
110 # ARRAY NOT SORTED BUT COPIED TO ALLOW SORTING AT A LATER DATE | |
111 # @sorted_file_sizes = sort{$a->[1] <=> $b->[1]} @file_sizes; | |
112 push(@sorted_file_sizes, @file_sizes); | |
113 | |
114 #### Summary file information | |
115 print STDOUT "TFBScluster analysis:\n", | |
116 "--------------------\n"; | |
117 | |
118 print STDOUT "TFBS library information:\n"; | |
119 | |
120 #### Show file summary | |
121 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { | |
122 print STDOUT "TFBS lib. ID = $sorted_file_sizes[$i][4].\n", | |
123 "Extended conservation = $sorted_file_sizes[$i][2].\n", | |
124 "Minimum occurrence = $sorted_file_sizes[$i][3].\n\n"; | |
125 } | |
126 | |
127 #print STDOUT "\n"; | |
128 | |
129 | |
130 ##################################################### | |
131 #### Information required by tree searching algorithm | |
132 ##################################################### | |
133 | |
134 #### Array containing the minimum number of each TF, also corresp names | |
135 my @tf_min = (); | |
136 my @tf_names = (); | |
137 | |
138 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { | |
139 push(@tf_min, $sorted_file_sizes[$i][3]); | |
140 } | |
141 | |
142 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { | |
143 push(@tf_names, $sorted_file_sizes[$i][4]); | |
144 } | |
145 | |
146 #### TEST | |
147 #print "ARRAY1 = @tf_min\n\n"; | |
148 | |
149 ##################################################################### | |
150 #### Open a file to store all the TF data from each selected library. | |
151 ##################################################################### | |
152 $combined->open(">TFcombined\.$$") or die("Could not open TFcombined file!\n"); | |
153 | |
154 #### Copy each TF file to another file and sort it. | |
155 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { | |
156 #### Save necessary parts of the current subject file to chr. specific arrays | |
157 $subject->open("<$sorted_file_sizes[$i][0]") or die("Could not open subject file!\n"); | |
158 | |
159 #### Message to user | |
160 #print "Adding data for TF file $i\.\t"; | |
161 | |
162 SUBLINE: while(defined(my $sub_line = <$subject>)) { | |
163 my ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score, | |
164 $sub_strand, $sub_frame, $sub_attribute) = ''; | |
165 | |
166 #### Skip line if GFF comment or blank line | |
167 if($sub_line =~ /(^\s|^#)/) | |
168 { | |
169 next SUBLINE; | |
170 } | |
171 | |
172 #### Split each line by TAB | |
173 ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score, | |
174 $sub_strand, $sub_frame, $sub_attribute) = split(/\t/, $sub_line, 9); | |
175 | |
176 #### Clean up attribute | |
177 #($sub_attribute) = $sub_attribute =~ /[ACTG\-]+/g; | |
178 chomp($sub_attribute); | |
179 | |
180 #### Adjust thresold of subject positions to reflect the core sequence | |
181 #### Possibly make an argument?! | |
182 $sub_start = $sub_start + $sorted_file_sizes[$i][2]; | |
183 $sub_end = $sub_end - $sorted_file_sizes[$i][2]; | |
184 | |
185 #### Determine chromosome | |
186 (my $sub_chr) = $sub_seqname; | |
187 | |
188 #### Add modified line to analysis library file | |
189 $combined->print("$sub_seqname\t$sub_source\t$sub_feature\t$sub_start\t$sub_end\t", | |
190 "$sub_score\t$sub_strand\t$sub_frame\t$sorted_file_sizes[$i][4]\n"); | |
191 # "$sub_score\t$sub_strand\t$sub_frame\t$sub_attribute", | |
192 # "\_\_$sorted_file_sizes[$i][4]\n"); | |
193 } | |
194 | |
195 #### Message | |
196 #print STDOUT "\[DONE\]\n"; | |
197 | |
198 $subject->close; | |
199 } | |
200 | |
201 #### Spacer on screen | |
202 #print STDOUT "\n"; | |
203 | |
204 $combined->close; | |
205 | |
206 | |
207 ############################# | |
208 #### Sort the TFcombined file | |
209 ############################# | |
210 #print STDOUT "Sorting TFcombined file.\t"; | |
211 | |
212 #system("sort +0.3 -0.5 +3n -4n TFcombined\.$$ > TFcombined_sorted_temp\.$$"); | |
213 system("sort +0 -1 +3n -4n TFcombined\.$$ > TFcombined_sorted\.$$"); | |
214 | |
215 #print "HELLO\n"; | |
216 | |
217 ################################### | |
218 #### Convert all chr01 back to chr1 | |
219 ################################### | |
220 #system("/home/donaldson/bin/TFBScluster/nozero_before_1-9.pl TFcombined_sorted_temp\.$$ TFcombined_sorted\.$$"); | |
221 | |
222 #print STDOUT "\[DONE\]\n\n"; | |
223 | |
224 | |
225 ##################################### | |
226 #### Sort the sorted file into groups | |
227 ##################################### | |
228 #### Work thru each line of the combined TF file. Store record of all TFs downstream | |
229 #### WITHIN the predefined distance | |
230 #print STDOUT "Organising the sorted file into groups.\t"; | |
231 | |
232 my $last = 0; | |
233 | |
234 $sorted->open("<TFcombined_sorted\.$$") or die("Could not open sorted TFcombined file!\n"); | |
235 | |
236 #### Rewind combined TF file to start | |
237 seek($sorted, 0, 0); | |
238 | |
239 COMBLINE: while(defined(my $comb_line = <$sorted>)) { | |
240 #### Get info about the line | |
241 my @comb_line_array = split(/\t/, $comb_line); | |
242 my $comb_seqname = $comb_line_array[0]; | |
243 (my $comb_chr) = $comb_seqname; | |
244 | |
245 my $comb_start = $comb_line_array[3]; | |
246 | |
247 #### Store the start of the next line | |
248 my $next_line_pos = tell($sorted); | |
249 | |
250 #### Variable to hold lines | |
251 my $group_holder = ''; | |
252 | |
253 $group_holder = $comb_line; | |
254 | |
255 #### Read thru the next lines until the end position is not within the specified | |
256 #### range of the start line start | |
257 my $count_hit = 0; | |
258 | |
259 GROUPLINE: while(defined(my $group_line = <$sorted>)) { | |
260 my @group_line_array = split(/\t/, $group_line); | |
261 my $group_seqname = $group_line_array[0]; | |
262 (my $group_chr) = $group_seqname; | |
263 | |
264 my $group_end = $group_line_array[4]; | |
265 | |
266 #### CHR | |
267 #if( (($group_end - $comb_start + 1) < $range ) and ($comb_chr eq $group_chr) ) { | |
268 if( (($group_end - $comb_start + 1) <= $range ) and ($comb_chr eq $group_chr) ) { | |
269 $group_holder .= $group_line; | |
270 $count_hit++; | |
271 } | |
272 | |
273 else { | |
274 last GROUPLINE; | |
275 } | |
276 } | |
277 | |
278 if($count_hit > 0) { | |
279 #Make the record | |
280 $groups->open(">>TFgroups\.$$") or die("Could not open TF groups file!\n"); | |
281 | |
282 $groups->print("$group_holder\/\/\n"); | |
283 | |
284 $groups->close; | |
285 | |
286 #### Move to the end of the last line | |
287 seek($sorted, $next_line_pos , 0); | |
288 | |
289 next COMBLINE; | |
290 } | |
291 | |
292 else { | |
293 #### Move to the end of the last line | |
294 seek($sorted, $next_line_pos , 0); | |
295 | |
296 next COMBLINE; | |
297 } | |
298 } | |
299 | |
300 $sorted->close; | |
301 | |
302 #print STDOUT "\[DONE\]\n\n"; | |
303 | |
304 | |
305 ################################################################################### | |
306 #### Look through the groups file to find records matching the user defined params | |
307 ################################################################################### | |
308 my $record = ''; | |
309 my $target = 0; | |
310 my $count_pass = 0; | |
311 | |
312 #### Another user message | |
313 print STDOUT "You have chosen to search for groups containing at least:\n"; | |
314 | |
315 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { | |
316 print STDOUT "$sorted_file_sizes[$i][3] $sorted_file_sizes[$i][4] site(s).\n"; | |
317 | |
318 #### Increment the desired number of matches for a group to be selected | |
319 $target++; | |
320 } | |
321 | |
322 #### Another user message | |
323 #print STDOUT "\nOutput will be written to \[$ARGV[6]\].\n"; | |
324 print STDOUT "\nCombining overlapping clusters:\n"; | |
325 | |
326 #### Open an output file | |
327 $output->open(">$ARGV[6]\_v1") or die("Could not open output file!\n"); | |
328 | |
329 #### Open the TFgroups files | |
330 $groups->open("<TFgroups\.$$") or die("Could not open TFgroups file!\n"); | |
331 | |
332 #### Open the filtered record test file | |
333 $filtered_groups->open(">filtered_groups\.$$") or die("Could not open filtered groups file!\n"); | |
334 | |
335 #### Take each record of the groups file | |
336 RECORD: while($record = GetNewRecord($groups)) { | |
337 #### What about if the positions overlap? | |
338 my @record_array = (); | |
339 my $last_record_start = 0; | |
340 my $last_record_end = 0; | |
341 my $last_record_attribute = 0; | |
342 | |
343 my $save_filtered_group = ''; | |
344 | |
345 #### Take each line of the record beginning with chr... | |
346 RECORDLINE: while($record =~ /(\w.+\n)/mg) { | |
347 my $record_line = $1; | |
348 | |
349 my @record_line_array = (); | |
350 @record_line_array = split(/\t/, $record_line); | |
351 | |
352 my $record_seqname = $record_line_array[0]; | |
353 my $record_start = $record_line_array[3]; | |
354 my $record_end = $record_line_array[4]; | |
355 my $record_strand = $record_line_array[6]; | |
356 my $record_attribute = $record_line_array[8]; | |
357 chomp($record_attribute); | |
358 | |
359 #print "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n"; | |
360 #exit; | |
361 | |
362 #### If the last motif exactly overlaps and is same ID - skip adding to array | |
363 if( ($record_start == $last_record_start) and | |
364 ($record_end == $last_record_end) and | |
365 ($record_attribute eq $last_record_attribute) ) { | |
366 | |
367 $last_record_start = $record_start; | |
368 $last_record_end = $record_end; | |
369 $last_record_attribute = $record_attribute; | |
370 | |
371 next RECORDLINE; | |
372 } | |
373 | |
374 $last_record_start = $record_start; | |
375 $last_record_end = $record_end; | |
376 $last_record_attribute = $record_attribute; | |
377 | |
378 #### File 2D array | |
379 push(@record_array, ["$record_seqname", "$record_start", "$record_end", "$record_strand", "$record_attribute"]); | |
380 | |
381 #### Test file | |
382 $save_filtered_group .= "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n"; | |
383 } | |
384 | |
385 #### Test file record marker | |
386 $save_filtered_group .= "//\n"; | |
387 | |
388 ###################################################################### | |
389 #### Make sure the record contains the minimum number of specified TFs | |
390 ###################################################################### | |
391 | |
392 #### Counter to see whether all params matched | |
393 my $pass = 0; | |
394 @regex_ids = (); # Array to hold regexs as they are used | |
395 my @regex_totals = (); # Array to hold info on regex totals in the record | |
396 | |
397 #### Work thru each of the input parameter lists | |
398 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) { | |
399 #### Site name for regex | |
400 my $regex = $sorted_file_sizes[$i][4]; | |
401 | |
402 push(@regex_ids, $regex); | |
403 | |
404 #### Min number of hits for regex | |
405 my $min_regex = $sorted_file_sizes[$i][3]; | |
406 | |
407 #### Search for current regex and tally | |
408 my $regex_hits = 0; | |
409 | |
410 #### Work thru each of the non-repeating record lines array | |
411 for(my $k=0; $k < $#record_array + 1; $k++) { | |
412 my $line_regex = $record_array[$k][4]; | |
413 | |
414 if($regex eq $line_regex) { | |
415 $regex_hits++; | |
416 } | |
417 } | |
418 | |
419 #### Were the min number of regex hits found? | |
420 if($regex_hits >= $min_regex) { | |
421 $pass++; | |
422 } | |
423 } | |
424 | |
425 ##################################################################### | |
426 #### If there are the minimum number of TFBS then check they are not | |
427 #### sufficiently overlapping to reduce the numbers below the minimum | |
428 ##################################################################### | |
429 my $good_cluster = ''; | |
430 | |
431 if($pass == $target) { | |
432 #### Test file | |
433 $filtered_groups->print("$save_filtered_group"); | |
434 | |
435 # Declarations | |
436 my ($end); | |
437 my (@starts, @ends, @choices); | |
438 | |
439 # Assign start and end positions | |
440 | |
441 grep { $starts[$_] = $record_array[$_][1]; $ends[$_] = $record_array[$_][2] } | |
442 (0 .. $#record_array); | |
443 | |
444 $end = -1; # First choice does not refer to a transcription factor | |
445 | |
446 # Loop through all transcription factors | |
447 | |
448 I_LOOP: | |
449 for my $i (0 .. @starts) { | |
450 my $next_factor = 0; | |
451 | |
452 # Now loop through all possible following transcription factors | |
453 # Note that $starts[0] and @ends[0] refer to the first transcription factor | |
454 # whereas $choices[1] will refer to the first transcription factor. | |
455 | |
456 for my $j ( $i .. $#starts) { | |
457 if ($starts[$j] > $end) { | |
458 $next_factor = ($j + 1); | |
459 last; | |
460 } | |
461 } | |
462 | |
463 push @{ $choices[$i] }, $next_factor; | |
464 | |
465 # If no factor follows, we have a terminating factor and progress to the next. | |
466 # Must first modify $end to be the end of the next transcription factor. | |
467 | |
468 unless ($next_factor) { | |
469 $end = $ends[$i]; | |
470 next I_LOOP; | |
471 } | |
472 | |
473 # Now need to check the factors overlapping with $next_factor and add them | |
474 # as possibilities. Note that in some circumstances, this may result in a | |
475 # redundant path. This will not give spurious results, however. | |
476 | |
477 for my $k ( $next_factor .. $#starts ) { | |
478 if ($starts[$k] <= $ends[$next_factor - 1]) { | |
479 push @{ $choices[$i] }, ($k + 1); | |
480 } | |
481 } | |
482 | |
483 # Finally, modify $end to be the end of the next transcription factor | |
484 | |
485 $end = $ends[$i]; | |
486 | |
487 # And go to next loop | |
488 } | |
489 | |
490 | |
491 #### Print out @choices array to file | |
492 foreach (@choices) { | |
493 $filtered_groups->print("@{ $_ } \n"); | |
494 } | |
495 | |
496 $filtered_groups->print("//\n"); | |
497 | |
498 | |
499 ##################################################### | |
500 #### Information required by tree searching algorithm | |
501 #### Only for records that contain min number of TFBS | |
502 ##################################################### | |
503 | |
504 #### Array relating each TF to their minimum values | |
505 my @tf_relate = (); | |
506 | |
507 #$tf_relate[0] = 0; | |
508 | |
509 #### Get TF ID for current record | |
510 for(my $i=0; $i < $#record_array+1; $i++) { | |
511 #my $next_i = $i + 1; | |
512 | |
513 my $current_attrib = $record_array[$i][4]; | |
514 | |
515 #### Scan @sorted_files_sizes array for matching TF ID and save | |
516 #### row number. This will relate directly to @tf_min | |
517 for(my $f=0; $f < $#sorted_file_sizes + 1; $f++) { | |
518 | |
519 if($current_attrib =~ /$sorted_file_sizes[$f][4]/) { | |
520 #$tf_relate[$next_i] = "$f"; | |
521 $tf_relate[$i] = "$f"; | |
522 } | |
523 } | |
524 } | |
525 | |
526 #### TEST | |
527 # print "ARRAY2 = @tf_relate\n\n"; | |
528 # for(0..$#tf_relate) { | |
529 # print "[$_] $tf_relate[$_]\n"; | |
530 # } | |
531 | |
532 #### DUMMY ARRAYS | |
533 #@choices = (); | |
534 #@choices = ( | |
535 # [ 1,2,3 ], | |
536 # [ 4,5,6 ], | |
537 # [ 4,5,6 ], | |
538 # [ 4,5,6 ], | |
539 # [ 7 ], | |
540 # [ 7 ], | |
541 # [ 7 ], | |
542 # [ 8,9,10 ], | |
543 # [ 0 ], | |
544 # [ 0 ], | |
545 # [ 0 ] | |
546 # ); | |
547 | |
548 #@tf_min = (); | |
549 #@tf_min = ( 3, 1, 1); | |
550 | |
551 #@tf_relate = (); | |
552 #@tf_relate = (0,0,0,1,1,1,2,2,2,2); | |
553 | |
554 | |
555 | |
556 #### References for test_cluster sub | |
557 my $choices_to_pass = \@choices; #### Tree decisions | |
558 my $required_to_pass = \@tf_min; #### Min TFBS numbers | |
559 my $order = \@tf_relate; #### Relate TFBSs to min numbers | |
560 | |
561 | |
562 ######################################## | |
563 #### Run tree searching algorithm (Mike) | |
564 ######################################## | |
565 if($allow eq 'exclude') { | |
566 #$good_cluster = tf_cluster_tree($choices_to_pass, $required_to_pass, $order); | |
567 $good_cluster = tf_cluster_tree(\@choices, \@tf_min, \@tf_relate); | |
568 | |
569 #print "GOODCLUSTER = $good_cluster\n"; | |
570 } | |
571 | |
572 else { | |
573 $good_cluster = 1; | |
574 } | |
575 } | |
576 | |
577 | |
578 #### If all parameters are matched create a summary of the record | |
579 #### Work thru each line of the record string start end and TF ID to an array | |
580 | |
581 #### Carry on if overlapping not a problem #### | |
582 if( ($pass == $target) and ($good_cluster == 1) ){ | |
583 my $regex_chr = $record_array[0][0]; | |
584 my $regex_start = $record_array[0][1]; | |
585 my $regex_end = $record_array[$#record_array][2]; | |
586 my $joined_ids = join("-", @regex_ids); | |
587 | |
588 $output->print("$regex_chr\t", | |
589 "TFBScluster\t", | |
590 "CNS\t", | |
591 "$regex_start\t", | |
592 "$regex_end\t", | |
593 ".\t", | |
594 "+\t", | |
595 ".\t", | |
596 "$joined_ids\n"); | |
597 } | |
598 } | |
599 | |
600 $groups->close; | |
601 $filtered_groups->close; | |
602 $output->close; | |
603 | |
604 #### Space on screen | |
605 #print STDOUT "\n"; | |
606 | |
607 #### Read each line of output file and combine lines that overlap | |
608 my $version = 1; | |
609 | |
610 #### Remain in loop until last command given | |
611 while(1) { | |
612 my $last_seqname = ''; | |
613 my $last_chr = ''; | |
614 my $last_start = ''; | |
615 my $last_end = ''; | |
616 my $changes = 0; | |
617 my $outline_count = 0; | |
618 | |
619 my ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score, | |
620 $out_strand, $out_frame, $out_attribute, $out_chr) = ''; | |
621 | |
622 $output->open("<$ARGV[6]\_v$version") or die("Could not open output file!\n"); | |
623 | |
624 #### If the output file is empty then exit loop and finish | |
625 if(-z $output) { | |
626 $output->close; | |
627 | |
628 system("rm $ARGV[6]\_v$version"); | |
629 | |
630 exit; | |
631 #die("\nNo clusters were found!\n"); | |
632 } | |
633 | |
634 $version++; | |
635 | |
636 $output2->open(">$ARGV[6]\_v$version") or die("Could not open output2 file!\n"); | |
637 | |
638 OUTLINE: while(defined(my $out_line = <$output>)) { | |
639 #### Skip line if GFF comment or blank line | |
640 if($out_line =~ /^\s/) | |
641 { | |
642 next OUTLINE; | |
643 } | |
644 | |
645 #### Tally lines read | |
646 $outline_count++; | |
647 | |
648 ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score, | |
649 $out_strand, $out_frame, $out_attribute) = split(/\t/, $out_line, 9); | |
650 | |
651 $out_chr = $out_seqname; | |
652 | |
653 #### Handle the first line | |
654 if($outline_count == 1) { | |
655 $last_seqname = $out_seqname; | |
656 $last_chr = $out_chr; | |
657 $last_start = $out_start; | |
658 $last_end = $out_end; | |
659 | |
660 next OUTLINE; | |
661 } | |
662 | |
663 #### Remaining lines | |
664 | |
665 #### If the patterns are on different chromosomes | |
666 #### CHR | |
667 if($last_chr ne $out_chr) { | |
668 #### Print the last line to the file and save the current | |
669 $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t", | |
670 "$out_score\t$out_strand\t$out_frame\t$out_attribute"); | |
671 | |
672 $last_seqname = $out_seqname; | |
673 $last_chr = $out_chr; | |
674 $last_start = $out_start; | |
675 $last_end = $out_end; | |
676 | |
677 next OUTLINE; | |
678 } | |
679 | |
680 #### If they overlap change current line start to the previous | |
681 if( ($last_end > $out_start) and ($last_end <= $out_end) ) { | |
682 $last_end = $out_end; | |
683 | |
684 #### Record the number of changes | |
685 $changes++; | |
686 } | |
687 | |
688 #### If not just print to output | |
689 else { | |
690 $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t", | |
691 "$out_score\t$out_strand\t$out_frame\t$out_attribute"); | |
692 | |
693 $last_seqname = $out_seqname; | |
694 $last_chr = $out_chr; | |
695 $last_start = $out_start; | |
696 $last_end = $out_end; | |
697 } | |
698 } | |
699 | |
700 #### Print last line to outfile | |
701 $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t", | |
702 "$out_score\t$out_strand\t$out_frame\t$out_attribute"); | |
703 | |
704 #### Message | |
705 my $previous = $version - 1; | |
706 | |
707 print STDOUT "Records in output file version $previous \($outline_count patterns\).\n"; | |
708 | |
709 | |
710 $output->close; | |
711 $output2->close; | |
712 | |
713 if($changes == 0) { | |
714 my $joined_ids = join("-", @regex_ids); | |
715 #### Copy last version to file name without v number | |
716 #system("cp $ARGV[6]\_v$version $ARGV[6]"); | |
717 | |
718 #### Open final output file and convert the attribute | |
719 open(FINAL_VER, "<$ARGV[6]\_v$version") or die("Could not open final ver GFF!\n"); | |
720 open(FINAL, ">$ARGV[6]") or die("Could not open final GFF!\n"); | |
721 | |
722 while(defined(my $final_line = <FINAL_VER>)) { | |
723 | |
724 my ($final_seqname, $final_source, $final_feature, $final_start, $final_end, | |
725 $final_score, $final_strand, $final_frame, $final_attribute) = | |
726 split(/\t/, $final_line, 9); | |
727 | |
728 my $pattern_length = ($final_end - $final_start) + 1; | |
729 | |
730 print FINAL "$final_seqname\t$final_source\t$final_feature\t$final_start\t", | |
731 "$final_end\t$final_score\t$final_strand\t$final_frame\t", | |
732 "$joined_ids\_len$pattern_length\n"; | |
733 | |
734 } | |
735 | |
736 close(FINAL_VER); | |
737 close(FINAL); | |
738 | |
739 last; | |
740 } | |
741 } | |
742 | |
743 #### Spacer for summary file | |
744 print STDOUT "\n"; | |
745 | |
746 #### Remove intermediate files | |
747 system("rm ./TFcombined\.$$ ./TFcombined_sorted\.$$ ./TFgroups\.$$ ./filtered_groups\.$$ $ARGV[6]\_v*"); | |
748 | |
749 exit; | |
750 | |
751 | |
752 ############ | |
753 #Subroutines | |
754 ############ | |
755 sub GetNewRecord | |
756 # Load record from a library file, delimited by // | |
757 { | |
758 my $fh = shift (@_); | |
759 my $record = ''; | |
760 my $saveinputsep = $/; | |
761 $/ = "//\n"; | |
762 | |
763 $record = <$fh>; | |
764 | |
765 $/ = $saveinputsep; | |
766 return $record; | |
767 } | |
768 | |
769 sub tf_cluster_tree { | |
770 | |
771 my @path; | |
772 my $node_count; | |
773 my $node = 0; | |
774 my $success = 0; | |
775 my $choice = -1; | |
776 my @node_contains; | |
777 my @node_to_choice; | |
778 my @count; | |
779 my @branches; | |
780 $branches[0] = -1; | |
781 my $path_ref; | |
782 my $branch_ref; | |
783 my $node_contains; | |
784 my $node_to_choice; | |
785 my ($choice_ref, | |
786 $required_ref, | |
787 $tf_ref) = @_; | |
788 my @choices = @$choice_ref; | |
789 my @required = @$required_ref; | |
790 my @tfs = @$tf_ref; | |
791 ($node, | |
792 $choice, | |
793 $node_count, | |
794 $choice_ref, | |
795 $path_ref, | |
796 $branch_ref, | |
797 $node_to_choice, | |
798 $node_contains) = next_node ($node, | |
799 $choice, | |
800 $node_count, | |
801 $choice_ref, | |
802 \@path, | |
803 \@branches, | |
804 $tf_ref); | |
805 @choices = @$choice_ref; | |
806 @path = @$path_ref; | |
807 @branches = @$branch_ref; | |
808 $node_contains[$node] = $node_contains; | |
809 $node_to_choice[$node] = $node_to_choice; | |
810 | |
811 BLOCK_A: | |
812 | |
813 while (1) { | |
814 no strict "vars"; | |
815 | |
816 if ( node_terminating($choice, $choice_ref) ) { | |
817 push @path, $node; | |
818 @count = undef; | |
819 grep { $count[ $node_contains[$_] ]++ } @path; | |
820 #print "Count is @count.\n"; | |
821 my $score = grep { $count[$_] >= $required[$_] } | |
822 (0 .. $#count); | |
823 #print "Path is @path\n"; | |
824 if ($score == scalar @required) { | |
825 $success = 1; | |
826 last BLOCK_A; | |
827 } | |
828 pop @path; | |
829 ($node, | |
830 $choice, | |
831 $path_ref) | |
832 = last_unexplored_node(\@path, $choice, $choice_ref, | |
833 \@node_to_choice, \@branches); | |
834 last BLOCK_A if ($node == -1); | |
835 @path = @$path_ref; | |
836 } | |
837 if ( node_fully_explored($choice, $node, $choice_ref, \@branches) ) { | |
838 ($node, | |
839 $choice, | |
840 $path_ref) | |
841 = last_unexplored_node(\@path, $choice, $choice_ref, | |
842 \@node_to_choice, \@branches); | |
843 @path = @$path_ref; | |
844 last BLOCK_A if ($node == -1); | |
845 } | |
846 ($node, | |
847 $choice, | |
848 $node_count, | |
849 $choice_ref, | |
850 $path_ref, | |
851 $branch_ref, | |
852 $node_to_choice, | |
853 $node_contains,) = next_node ($node, | |
854 $choice, | |
855 $node_count, | |
856 $choice_ref, | |
857 \@path, | |
858 \@branches, | |
859 $tf_ref); | |
860 @choices = @$choice_ref; | |
861 @path = @$path_ref; | |
862 @branches = @$branch_ref; | |
863 $node_contains[$node] = $node_contains; | |
864 $node_to_choice[$node] = $node_to_choice; | |
865 } | |
866 return $success; | |
867 } | |
868 | |
869 | |
870 sub next_node { | |
871 | |
872 my ($node, | |
873 my $choice, | |
874 my $node_count, | |
875 my $choice_ref, | |
876 my $path_ref, | |
877 my $branch_ref, | |
878 my $tf_ref) = @_; | |
879 my @choices = @$choice_ref; | |
880 my @path = @$path_ref; | |
881 my @branches = @$branch_ref; | |
882 my @tfs = @$tf_ref; | |
883 my $new_choice = $choices[$choice][0]; | |
884 push @{ $choices[$choice] }, shift @{ $choices[$choice] }; | |
885 $choice = $new_choice; | |
886 my $node_to_choice = $choice; | |
887 push @path, $node if $node; | |
888 $branches[$node]++; | |
889 $node = $node_count++; | |
890 my $node_contains = $tfs[$choice - 1] if $choice; | |
891 return ( | |
892 $node, | |
893 $choice, | |
894 $node_count, | |
895 \@choices, | |
896 \@path, | |
897 \@branches, | |
898 $node_to_choice, | |
899 $node_contains); | |
900 } | |
901 | |
902 sub node_fully_explored { | |
903 no strict "vars"; | |
904 | |
905 my $choice = shift; | |
906 my $node = shift; | |
907 my $choices_ref = shift; | |
908 my $branch_ref = shift; | |
909 my @choices = @$choices_ref; | |
910 my @branches = @$branch_ref; | |
911 if ($branches[$node] == (scalar @{ $choices[$choice] })) { | |
912 return 1 } | |
913 else { return 0 } | |
914 } | |
915 | |
916 sub last_unexplored_node { | |
917 no strict "vars"; | |
918 | |
919 my $path_ref = shift; | |
920 my $choice = shift; | |
921 my $choice_ref = shift; | |
922 my $node_to_choice_ref = shift; | |
923 my $branch_ref = shift; | |
924 my @path = @$path_ref; | |
925 my @node_to_choice = @$node_to_choice_ref; | |
926 my @branches = @$branch_ref; | |
927 my $node; | |
928 | |
929 do { | |
930 $node = pop @path; | |
931 $choice = $node_to_choice[$node]; | |
932 if ( $node == 0 and node_fully_explored($choice, $node, | |
933 $choice_ref, \@branches) ) { | |
934 $node = -1; | |
935 last; | |
936 } | |
937 } while ( node_fully_explored($choice, $node, $choice_ref, | |
938 \@branches) ); | |
939 return ($node, $choice, \@path); | |
940 } | |
941 | |
942 sub node_terminating { | |
943 | |
944 my $choice = shift; | |
945 my $choices_ref = shift; | |
946 my @choices = @$choices_ref; | |
947 if ($choices[$choice][0]) { return 0 } | |
948 else { return 1 } | |
949 | |
950 } | |
951 | |
952 | |
953 | |
954 | |
955 | |
956 | |
957 | |
958 | |
959 | |
960 | |
961 | |
962 | |
963 | |
964 |