comparison TFBScluster_candidates.pl @ 3:b4e22d2d7fa7 draft

Add missing Perl scripts from previous uploads.
author pjbriggs
date Mon, 09 Apr 2018 04:45:06 -0400
parents
children
comparison
equal deleted inserted replaced
2:96a4ee27b08a 3:b4e22d2d7fa7
1 #!/usr/bin/perl
2
3 # TFBScluster version 2.0 - cluster together TFBSs from distinct
4 # TFBSsearch generated libraries.
5 #
6 # (c) Ian Donaldson 2003 and Mike Chapman (TFBS overlap subs)
7 #
8 # This program is free software; you can redistribute it and/or
9 # modify it under the terms of the GNU General Public License
10 # as published by the Free Software Foundation; either version 2
11 # of the License, or (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program; if not, write to the Free Software
20 # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21
22 use strict;
23 use FileHandle;
24 #use integer; # Force floating point into integers
25
26 $|=1;
27
28 ###########
29 # Program to determine whether a TF site is in close proximity to
30 # one or more other TF sites.
31 # September 2003. Ian Donaldson.
32 # Revised 8 Sept. 2003 to not include the query TF site in the threshold.
33 # This is to allow one to determine whether a TF is near to another of the
34 # same type.
35 # Revised 9 Sept to alter the threshold size to only include the core of a
36 # pattern i.e. gata of nngatann.
37 # Revised 19 Sept to replace query and subject libraries with the statement
38 # of all interested libraries.
39 # Revised 22 Sept to scan output for overlapping patterns.
40 # NOTE: Any overlap in comparative record start with start pattern
41 # Revised 30 Sept to ignore duplication of TFBS in a group record caused by
42 # palindrome. By skipping if name and positions the same.
43 # Revised 6 Oct code for tree searching method to deal with overlapping TFBSs
44 ###########
45
46 #### Command line usage
47 if(@ARGV != 7)
48 {
49 die ("USAGE:
50 $0
51 TF libraries \(comma delimited NO SPACES\)
52 Number of flanking 'N's for subject files \(comma delimited NO SPACES\)
53 Minimum number of occurences \(comma delimited NO SPACES\)
54 TF IDs \(comma delimited NO SPACES\)
55 Single range value in bp \(+/-\) query start and end values
56 Include overlapping TFBSs \(include/exclude\)
57 Output file\n\n");
58 }
59
60 #### File handles
61 my $subject = new FileHandle;
62 my $combined = new FileHandle;
63 my $sorted = new FileHandle;
64 my $groups = new FileHandle;
65 my $filtered_groups = new FileHandle;
66 my $output = new FileHandle;
67 my $output2 = new FileHandle;
68
69 #### Variables
70 my @subject_files = (); # Array containing the names of selected subject files
71 my @flanking_n = (); # Array containing the number of flanking 'n' for each pattern
72 my @min_occur = (); # Array containing the minimum occurences for the TF library
73 my @ids = (); # Array containing user defined IDs for the TF libraries
74 my @file_sizes = ();
75 my @sorted_file_sizes = ();
76 my $range = $ARGV[4];
77 my $allow = $ARGV[5];
78 my @regex_ids = ();
79
80 #####################################################
81 #### Deal with user arguments processed into an array
82 #####################################################
83
84 #### Convert TF file names string to an array
85 @subject_files = split(/,/, $ARGV[0]);
86
87 #### Convert flanking 'N' numbers string into an array
88 @flanking_n = split(/,/, $ARGV[1]);
89
90 #### Convert minimum occurences string into an array
91 @min_occur = split(/,/, $ARGV[2]);
92
93 #### Convert minimum occurences string into an array
94 @ids = split(/,/, $ARGV[3]);
95
96 foreach my $id_string (@ids) {
97 if($id_string !~ /^[\w\d_,]+$/) {
98 die("A non-letter/number character was detected in your label string!\n");
99 }
100 }
101
102 #### Record how large they are
103 for(my $i=0; $i < $#subject_files + 1; $i++) {
104 my $size = (-s $subject_files[$i]); # -s performed on an unopened file!
105
106 push(@file_sizes, ["$subject_files[$i]", "$size", "$flanking_n[$i]", "$min_occur[$i]", "$ids[$i]"]);
107 }
108
109 #### Sort file sizes array by file sizes
110 # ARRAY NOT SORTED BUT COPIED TO ALLOW SORTING AT A LATER DATE
111 # @sorted_file_sizes = sort{$a->[1] <=> $b->[1]} @file_sizes;
112 push(@sorted_file_sizes, @file_sizes);
113
114 #### Summary file information
115 print STDOUT "TFBScluster analysis:\n",
116 "--------------------\n";
117
118 print STDOUT "TFBS library information:\n";
119
120 #### Show file summary
121 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
122 print STDOUT "TFBS lib. ID = $sorted_file_sizes[$i][4].\n",
123 "Extended conservation = $sorted_file_sizes[$i][2].\n",
124 "Minimum occurrence = $sorted_file_sizes[$i][3].\n\n";
125 }
126
127 #print STDOUT "\n";
128
129
130 #####################################################
131 #### Information required by tree searching algorithm
132 #####################################################
133
134 #### Array containing the minimum number of each TF, also corresp names
135 my @tf_min = ();
136 my @tf_names = ();
137
138 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
139 push(@tf_min, $sorted_file_sizes[$i][3]);
140 }
141
142 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
143 push(@tf_names, $sorted_file_sizes[$i][4]);
144 }
145
146 #### TEST
147 #print "ARRAY1 = @tf_min\n\n";
148
149 #####################################################################
150 #### Open a file to store all the TF data from each selected library.
151 #####################################################################
152 $combined->open(">TFcombined\.$$") or die("Could not open TFcombined file!\n");
153
154 #### Copy each TF file to another file and sort it.
155 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
156 #### Save necessary parts of the current subject file to chr. specific arrays
157 $subject->open("<$sorted_file_sizes[$i][0]") or die("Could not open subject file!\n");
158
159 #### Message to user
160 #print "Adding data for TF file $i\.\t";
161
162 SUBLINE: while(defined(my $sub_line = <$subject>)) {
163 my ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score,
164 $sub_strand, $sub_frame, $sub_attribute) = '';
165
166 #### Skip line if GFF comment or blank line
167 if($sub_line =~ /(^\s|^#)/)
168 {
169 next SUBLINE;
170 }
171
172 #### Split each line by TAB
173 ($sub_seqname, $sub_source, $sub_feature, $sub_start, $sub_end, $sub_score,
174 $sub_strand, $sub_frame, $sub_attribute) = split(/\t/, $sub_line, 9);
175
176 #### Clean up attribute
177 #($sub_attribute) = $sub_attribute =~ /[ACTG\-]+/g;
178 chomp($sub_attribute);
179
180 #### Adjust thresold of subject positions to reflect the core sequence
181 #### Possibly make an argument?!
182 $sub_start = $sub_start + $sorted_file_sizes[$i][2];
183 $sub_end = $sub_end - $sorted_file_sizes[$i][2];
184
185 #### Determine chromosome
186 (my $sub_chr) = $sub_seqname;
187
188 #### Add modified line to analysis library file
189 $combined->print("$sub_seqname\t$sub_source\t$sub_feature\t$sub_start\t$sub_end\t",
190 "$sub_score\t$sub_strand\t$sub_frame\t$sorted_file_sizes[$i][4]\n");
191 # "$sub_score\t$sub_strand\t$sub_frame\t$sub_attribute",
192 # "\_\_$sorted_file_sizes[$i][4]\n");
193 }
194
195 #### Message
196 #print STDOUT "\[DONE\]\n";
197
198 $subject->close;
199 }
200
201 #### Spacer on screen
202 #print STDOUT "\n";
203
204 $combined->close;
205
206
207 #############################
208 #### Sort the TFcombined file
209 #############################
210 #print STDOUT "Sorting TFcombined file.\t";
211
212 #system("sort +0.3 -0.5 +3n -4n TFcombined\.$$ > TFcombined_sorted_temp\.$$");
213 system("sort +0 -1 +3n -4n TFcombined\.$$ > TFcombined_sorted\.$$");
214
215 #print "HELLO\n";
216
217 ###################################
218 #### Convert all chr01 back to chr1
219 ###################################
220 #system("/home/donaldson/bin/TFBScluster/nozero_before_1-9.pl TFcombined_sorted_temp\.$$ TFcombined_sorted\.$$");
221
222 #print STDOUT "\[DONE\]\n\n";
223
224
225 #####################################
226 #### Sort the sorted file into groups
227 #####################################
228 #### Work thru each line of the combined TF file. Store record of all TFs downstream
229 #### WITHIN the predefined distance
230 #print STDOUT "Organising the sorted file into groups.\t";
231
232 my $last = 0;
233
234 $sorted->open("<TFcombined_sorted\.$$") or die("Could not open sorted TFcombined file!\n");
235
236 #### Rewind combined TF file to start
237 seek($sorted, 0, 0);
238
239 COMBLINE: while(defined(my $comb_line = <$sorted>)) {
240 #### Get info about the line
241 my @comb_line_array = split(/\t/, $comb_line);
242 my $comb_seqname = $comb_line_array[0];
243 (my $comb_chr) = $comb_seqname;
244
245 my $comb_start = $comb_line_array[3];
246
247 #### Store the start of the next line
248 my $next_line_pos = tell($sorted);
249
250 #### Variable to hold lines
251 my $group_holder = '';
252
253 $group_holder = $comb_line;
254
255 #### Read thru the next lines until the end position is not within the specified
256 #### range of the start line start
257 my $count_hit = 0;
258
259 GROUPLINE: while(defined(my $group_line = <$sorted>)) {
260 my @group_line_array = split(/\t/, $group_line);
261 my $group_seqname = $group_line_array[0];
262 (my $group_chr) = $group_seqname;
263
264 my $group_end = $group_line_array[4];
265
266 #### CHR
267 #if( (($group_end - $comb_start + 1) < $range ) and ($comb_chr eq $group_chr) ) {
268 if( (($group_end - $comb_start + 1) <= $range ) and ($comb_chr eq $group_chr) ) {
269 $group_holder .= $group_line;
270 $count_hit++;
271 }
272
273 else {
274 last GROUPLINE;
275 }
276 }
277
278 if($count_hit > 0) {
279 #Make the record
280 $groups->open(">>TFgroups\.$$") or die("Could not open TF groups file!\n");
281
282 $groups->print("$group_holder\/\/\n");
283
284 $groups->close;
285
286 #### Move to the end of the last line
287 seek($sorted, $next_line_pos , 0);
288
289 next COMBLINE;
290 }
291
292 else {
293 #### Move to the end of the last line
294 seek($sorted, $next_line_pos , 0);
295
296 next COMBLINE;
297 }
298 }
299
300 $sorted->close;
301
302 #print STDOUT "\[DONE\]\n\n";
303
304
305 ###################################################################################
306 #### Look through the groups file to find records matching the user defined params
307 ###################################################################################
308 my $record = '';
309 my $target = 0;
310 my $count_pass = 0;
311
312 #### Another user message
313 print STDOUT "You have chosen to search for groups containing at least:\n";
314
315 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
316 print STDOUT "$sorted_file_sizes[$i][3] $sorted_file_sizes[$i][4] site(s).\n";
317
318 #### Increment the desired number of matches for a group to be selected
319 $target++;
320 }
321
322 #### Another user message
323 #print STDOUT "\nOutput will be written to \[$ARGV[6]\].\n";
324 print STDOUT "\nCombining overlapping clusters:\n";
325
326 #### Open an output file
327 $output->open(">$ARGV[6]\_v1") or die("Could not open output file!\n");
328
329 #### Open the TFgroups files
330 $groups->open("<TFgroups\.$$") or die("Could not open TFgroups file!\n");
331
332 #### Open the filtered record test file
333 $filtered_groups->open(">filtered_groups\.$$") or die("Could not open filtered groups file!\n");
334
335 #### Take each record of the groups file
336 RECORD: while($record = GetNewRecord($groups)) {
337 #### What about if the positions overlap?
338 my @record_array = ();
339 my $last_record_start = 0;
340 my $last_record_end = 0;
341 my $last_record_attribute = 0;
342
343 my $save_filtered_group = '';
344
345 #### Take each line of the record beginning with chr...
346 RECORDLINE: while($record =~ /(\w.+\n)/mg) {
347 my $record_line = $1;
348
349 my @record_line_array = ();
350 @record_line_array = split(/\t/, $record_line);
351
352 my $record_seqname = $record_line_array[0];
353 my $record_start = $record_line_array[3];
354 my $record_end = $record_line_array[4];
355 my $record_strand = $record_line_array[6];
356 my $record_attribute = $record_line_array[8];
357 chomp($record_attribute);
358
359 #print "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n";
360 #exit;
361
362 #### If the last motif exactly overlaps and is same ID - skip adding to array
363 if( ($record_start == $last_record_start) and
364 ($record_end == $last_record_end) and
365 ($record_attribute eq $last_record_attribute) ) {
366
367 $last_record_start = $record_start;
368 $last_record_end = $record_end;
369 $last_record_attribute = $record_attribute;
370
371 next RECORDLINE;
372 }
373
374 $last_record_start = $record_start;
375 $last_record_end = $record_end;
376 $last_record_attribute = $record_attribute;
377
378 #### File 2D array
379 push(@record_array, ["$record_seqname", "$record_start", "$record_end", "$record_strand", "$record_attribute"]);
380
381 #### Test file
382 $save_filtered_group .= "$record_seqname\t$record_start\t$record_end\t$record_strand\t$record_attribute\n";
383 }
384
385 #### Test file record marker
386 $save_filtered_group .= "//\n";
387
388 ######################################################################
389 #### Make sure the record contains the minimum number of specified TFs
390 ######################################################################
391
392 #### Counter to see whether all params matched
393 my $pass = 0;
394 @regex_ids = (); # Array to hold regexs as they are used
395 my @regex_totals = (); # Array to hold info on regex totals in the record
396
397 #### Work thru each of the input parameter lists
398 for(my $i=0; $i < $#sorted_file_sizes + 1; $i++) {
399 #### Site name for regex
400 my $regex = $sorted_file_sizes[$i][4];
401
402 push(@regex_ids, $regex);
403
404 #### Min number of hits for regex
405 my $min_regex = $sorted_file_sizes[$i][3];
406
407 #### Search for current regex and tally
408 my $regex_hits = 0;
409
410 #### Work thru each of the non-repeating record lines array
411 for(my $k=0; $k < $#record_array + 1; $k++) {
412 my $line_regex = $record_array[$k][4];
413
414 if($regex eq $line_regex) {
415 $regex_hits++;
416 }
417 }
418
419 #### Were the min number of regex hits found?
420 if($regex_hits >= $min_regex) {
421 $pass++;
422 }
423 }
424
425 #####################################################################
426 #### If there are the minimum number of TFBS then check they are not
427 #### sufficiently overlapping to reduce the numbers below the minimum
428 #####################################################################
429 my $good_cluster = '';
430
431 if($pass == $target) {
432 #### Test file
433 $filtered_groups->print("$save_filtered_group");
434
435 # Declarations
436 my ($end);
437 my (@starts, @ends, @choices);
438
439 # Assign start and end positions
440
441 grep { $starts[$_] = $record_array[$_][1]; $ends[$_] = $record_array[$_][2] }
442 (0 .. $#record_array);
443
444 $end = -1; # First choice does not refer to a transcription factor
445
446 # Loop through all transcription factors
447
448 I_LOOP:
449 for my $i (0 .. @starts) {
450 my $next_factor = 0;
451
452 # Now loop through all possible following transcription factors
453 # Note that $starts[0] and @ends[0] refer to the first transcription factor
454 # whereas $choices[1] will refer to the first transcription factor.
455
456 for my $j ( $i .. $#starts) {
457 if ($starts[$j] > $end) {
458 $next_factor = ($j + 1);
459 last;
460 }
461 }
462
463 push @{ $choices[$i] }, $next_factor;
464
465 # If no factor follows, we have a terminating factor and progress to the next.
466 # Must first modify $end to be the end of the next transcription factor.
467
468 unless ($next_factor) {
469 $end = $ends[$i];
470 next I_LOOP;
471 }
472
473 # Now need to check the factors overlapping with $next_factor and add them
474 # as possibilities. Note that in some circumstances, this may result in a
475 # redundant path. This will not give spurious results, however.
476
477 for my $k ( $next_factor .. $#starts ) {
478 if ($starts[$k] <= $ends[$next_factor - 1]) {
479 push @{ $choices[$i] }, ($k + 1);
480 }
481 }
482
483 # Finally, modify $end to be the end of the next transcription factor
484
485 $end = $ends[$i];
486
487 # And go to next loop
488 }
489
490
491 #### Print out @choices array to file
492 foreach (@choices) {
493 $filtered_groups->print("@{ $_ } \n");
494 }
495
496 $filtered_groups->print("//\n");
497
498
499 #####################################################
500 #### Information required by tree searching algorithm
501 #### Only for records that contain min number of TFBS
502 #####################################################
503
504 #### Array relating each TF to their minimum values
505 my @tf_relate = ();
506
507 #$tf_relate[0] = 0;
508
509 #### Get TF ID for current record
510 for(my $i=0; $i < $#record_array+1; $i++) {
511 #my $next_i = $i + 1;
512
513 my $current_attrib = $record_array[$i][4];
514
515 #### Scan @sorted_files_sizes array for matching TF ID and save
516 #### row number. This will relate directly to @tf_min
517 for(my $f=0; $f < $#sorted_file_sizes + 1; $f++) {
518
519 if($current_attrib =~ /$sorted_file_sizes[$f][4]/) {
520 #$tf_relate[$next_i] = "$f";
521 $tf_relate[$i] = "$f";
522 }
523 }
524 }
525
526 #### TEST
527 # print "ARRAY2 = @tf_relate\n\n";
528 # for(0..$#tf_relate) {
529 # print "[$_] $tf_relate[$_]\n";
530 # }
531
532 #### DUMMY ARRAYS
533 #@choices = ();
534 #@choices = (
535 # [ 1,2,3 ],
536 # [ 4,5,6 ],
537 # [ 4,5,6 ],
538 # [ 4,5,6 ],
539 # [ 7 ],
540 # [ 7 ],
541 # [ 7 ],
542 # [ 8,9,10 ],
543 # [ 0 ],
544 # [ 0 ],
545 # [ 0 ]
546 # );
547
548 #@tf_min = ();
549 #@tf_min = ( 3, 1, 1);
550
551 #@tf_relate = ();
552 #@tf_relate = (0,0,0,1,1,1,2,2,2,2);
553
554
555
556 #### References for test_cluster sub
557 my $choices_to_pass = \@choices; #### Tree decisions
558 my $required_to_pass = \@tf_min; #### Min TFBS numbers
559 my $order = \@tf_relate; #### Relate TFBSs to min numbers
560
561
562 ########################################
563 #### Run tree searching algorithm (Mike)
564 ########################################
565 if($allow eq 'exclude') {
566 #$good_cluster = tf_cluster_tree($choices_to_pass, $required_to_pass, $order);
567 $good_cluster = tf_cluster_tree(\@choices, \@tf_min, \@tf_relate);
568
569 #print "GOODCLUSTER = $good_cluster\n";
570 }
571
572 else {
573 $good_cluster = 1;
574 }
575 }
576
577
578 #### If all parameters are matched create a summary of the record
579 #### Work thru each line of the record string start end and TF ID to an array
580
581 #### Carry on if overlapping not a problem ####
582 if( ($pass == $target) and ($good_cluster == 1) ){
583 my $regex_chr = $record_array[0][0];
584 my $regex_start = $record_array[0][1];
585 my $regex_end = $record_array[$#record_array][2];
586 my $joined_ids = join("-", @regex_ids);
587
588 $output->print("$regex_chr\t",
589 "TFBScluster\t",
590 "CNS\t",
591 "$regex_start\t",
592 "$regex_end\t",
593 ".\t",
594 "+\t",
595 ".\t",
596 "$joined_ids\n");
597 }
598 }
599
600 $groups->close;
601 $filtered_groups->close;
602 $output->close;
603
604 #### Space on screen
605 #print STDOUT "\n";
606
607 #### Read each line of output file and combine lines that overlap
608 my $version = 1;
609
610 #### Remain in loop until last command given
611 while(1) {
612 my $last_seqname = '';
613 my $last_chr = '';
614 my $last_start = '';
615 my $last_end = '';
616 my $changes = 0;
617 my $outline_count = 0;
618
619 my ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score,
620 $out_strand, $out_frame, $out_attribute, $out_chr) = '';
621
622 $output->open("<$ARGV[6]\_v$version") or die("Could not open output file!\n");
623
624 #### If the output file is empty then exit loop and finish
625 if(-z $output) {
626 $output->close;
627
628 system("rm $ARGV[6]\_v$version");
629
630 exit;
631 #die("\nNo clusters were found!\n");
632 }
633
634 $version++;
635
636 $output2->open(">$ARGV[6]\_v$version") or die("Could not open output2 file!\n");
637
638 OUTLINE: while(defined(my $out_line = <$output>)) {
639 #### Skip line if GFF comment or blank line
640 if($out_line =~ /^\s/)
641 {
642 next OUTLINE;
643 }
644
645 #### Tally lines read
646 $outline_count++;
647
648 ($out_seqname, $out_source, $out_feature, $out_start, $out_end, $out_score,
649 $out_strand, $out_frame, $out_attribute) = split(/\t/, $out_line, 9);
650
651 $out_chr = $out_seqname;
652
653 #### Handle the first line
654 if($outline_count == 1) {
655 $last_seqname = $out_seqname;
656 $last_chr = $out_chr;
657 $last_start = $out_start;
658 $last_end = $out_end;
659
660 next OUTLINE;
661 }
662
663 #### Remaining lines
664
665 #### If the patterns are on different chromosomes
666 #### CHR
667 if($last_chr ne $out_chr) {
668 #### Print the last line to the file and save the current
669 $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t",
670 "$out_score\t$out_strand\t$out_frame\t$out_attribute");
671
672 $last_seqname = $out_seqname;
673 $last_chr = $out_chr;
674 $last_start = $out_start;
675 $last_end = $out_end;
676
677 next OUTLINE;
678 }
679
680 #### If they overlap change current line start to the previous
681 if( ($last_end > $out_start) and ($last_end <= $out_end) ) {
682 $last_end = $out_end;
683
684 #### Record the number of changes
685 $changes++;
686 }
687
688 #### If not just print to output
689 else {
690 $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t",
691 "$out_score\t$out_strand\t$out_frame\t$out_attribute");
692
693 $last_seqname = $out_seqname;
694 $last_chr = $out_chr;
695 $last_start = $out_start;
696 $last_end = $out_end;
697 }
698 }
699
700 #### Print last line to outfile
701 $output2->print("$last_seqname\t$out_source\t$out_feature\t$last_start\t$last_end\t",
702 "$out_score\t$out_strand\t$out_frame\t$out_attribute");
703
704 #### Message
705 my $previous = $version - 1;
706
707 print STDOUT "Records in output file version $previous \($outline_count patterns\).\n";
708
709
710 $output->close;
711 $output2->close;
712
713 if($changes == 0) {
714 my $joined_ids = join("-", @regex_ids);
715 #### Copy last version to file name without v number
716 #system("cp $ARGV[6]\_v$version $ARGV[6]");
717
718 #### Open final output file and convert the attribute
719 open(FINAL_VER, "<$ARGV[6]\_v$version") or die("Could not open final ver GFF!\n");
720 open(FINAL, ">$ARGV[6]") or die("Could not open final GFF!\n");
721
722 while(defined(my $final_line = <FINAL_VER>)) {
723
724 my ($final_seqname, $final_source, $final_feature, $final_start, $final_end,
725 $final_score, $final_strand, $final_frame, $final_attribute) =
726 split(/\t/, $final_line, 9);
727
728 my $pattern_length = ($final_end - $final_start) + 1;
729
730 print FINAL "$final_seqname\t$final_source\t$final_feature\t$final_start\t",
731 "$final_end\t$final_score\t$final_strand\t$final_frame\t",
732 "$joined_ids\_len$pattern_length\n";
733
734 }
735
736 close(FINAL_VER);
737 close(FINAL);
738
739 last;
740 }
741 }
742
743 #### Spacer for summary file
744 print STDOUT "\n";
745
746 #### Remove intermediate files
747 system("rm ./TFcombined\.$$ ./TFcombined_sorted\.$$ ./TFgroups\.$$ ./filtered_groups\.$$ $ARGV[6]\_v*");
748
749 exit;
750
751
752 ############
753 #Subroutines
754 ############
755 sub GetNewRecord
756 # Load record from a library file, delimited by //
757 {
758 my $fh = shift (@_);
759 my $record = '';
760 my $saveinputsep = $/;
761 $/ = "//\n";
762
763 $record = <$fh>;
764
765 $/ = $saveinputsep;
766 return $record;
767 }
768
769 sub tf_cluster_tree {
770
771 my @path;
772 my $node_count;
773 my $node = 0;
774 my $success = 0;
775 my $choice = -1;
776 my @node_contains;
777 my @node_to_choice;
778 my @count;
779 my @branches;
780 $branches[0] = -1;
781 my $path_ref;
782 my $branch_ref;
783 my $node_contains;
784 my $node_to_choice;
785 my ($choice_ref,
786 $required_ref,
787 $tf_ref) = @_;
788 my @choices = @$choice_ref;
789 my @required = @$required_ref;
790 my @tfs = @$tf_ref;
791 ($node,
792 $choice,
793 $node_count,
794 $choice_ref,
795 $path_ref,
796 $branch_ref,
797 $node_to_choice,
798 $node_contains) = next_node ($node,
799 $choice,
800 $node_count,
801 $choice_ref,
802 \@path,
803 \@branches,
804 $tf_ref);
805 @choices = @$choice_ref;
806 @path = @$path_ref;
807 @branches = @$branch_ref;
808 $node_contains[$node] = $node_contains;
809 $node_to_choice[$node] = $node_to_choice;
810
811 BLOCK_A:
812
813 while (1) {
814 no strict "vars";
815
816 if ( node_terminating($choice, $choice_ref) ) {
817 push @path, $node;
818 @count = undef;
819 grep { $count[ $node_contains[$_] ]++ } @path;
820 #print "Count is @count.\n";
821 my $score = grep { $count[$_] >= $required[$_] }
822 (0 .. $#count);
823 #print "Path is @path\n";
824 if ($score == scalar @required) {
825 $success = 1;
826 last BLOCK_A;
827 }
828 pop @path;
829 ($node,
830 $choice,
831 $path_ref)
832 = last_unexplored_node(\@path, $choice, $choice_ref,
833 \@node_to_choice, \@branches);
834 last BLOCK_A if ($node == -1);
835 @path = @$path_ref;
836 }
837 if ( node_fully_explored($choice, $node, $choice_ref, \@branches) ) {
838 ($node,
839 $choice,
840 $path_ref)
841 = last_unexplored_node(\@path, $choice, $choice_ref,
842 \@node_to_choice, \@branches);
843 @path = @$path_ref;
844 last BLOCK_A if ($node == -1);
845 }
846 ($node,
847 $choice,
848 $node_count,
849 $choice_ref,
850 $path_ref,
851 $branch_ref,
852 $node_to_choice,
853 $node_contains,) = next_node ($node,
854 $choice,
855 $node_count,
856 $choice_ref,
857 \@path,
858 \@branches,
859 $tf_ref);
860 @choices = @$choice_ref;
861 @path = @$path_ref;
862 @branches = @$branch_ref;
863 $node_contains[$node] = $node_contains;
864 $node_to_choice[$node] = $node_to_choice;
865 }
866 return $success;
867 }
868
869
870 sub next_node {
871
872 my ($node,
873 my $choice,
874 my $node_count,
875 my $choice_ref,
876 my $path_ref,
877 my $branch_ref,
878 my $tf_ref) = @_;
879 my @choices = @$choice_ref;
880 my @path = @$path_ref;
881 my @branches = @$branch_ref;
882 my @tfs = @$tf_ref;
883 my $new_choice = $choices[$choice][0];
884 push @{ $choices[$choice] }, shift @{ $choices[$choice] };
885 $choice = $new_choice;
886 my $node_to_choice = $choice;
887 push @path, $node if $node;
888 $branches[$node]++;
889 $node = $node_count++;
890 my $node_contains = $tfs[$choice - 1] if $choice;
891 return (
892 $node,
893 $choice,
894 $node_count,
895 \@choices,
896 \@path,
897 \@branches,
898 $node_to_choice,
899 $node_contains);
900 }
901
902 sub node_fully_explored {
903 no strict "vars";
904
905 my $choice = shift;
906 my $node = shift;
907 my $choices_ref = shift;
908 my $branch_ref = shift;
909 my @choices = @$choices_ref;
910 my @branches = @$branch_ref;
911 if ($branches[$node] == (scalar @{ $choices[$choice] })) {
912 return 1 }
913 else { return 0 }
914 }
915
916 sub last_unexplored_node {
917 no strict "vars";
918
919 my $path_ref = shift;
920 my $choice = shift;
921 my $choice_ref = shift;
922 my $node_to_choice_ref = shift;
923 my $branch_ref = shift;
924 my @path = @$path_ref;
925 my @node_to_choice = @$node_to_choice_ref;
926 my @branches = @$branch_ref;
927 my $node;
928
929 do {
930 $node = pop @path;
931 $choice = $node_to_choice[$node];
932 if ( $node == 0 and node_fully_explored($choice, $node,
933 $choice_ref, \@branches) ) {
934 $node = -1;
935 last;
936 }
937 } while ( node_fully_explored($choice, $node, $choice_ref,
938 \@branches) );
939 return ($node, $choice, \@path);
940 }
941
942 sub node_terminating {
943
944 my $choice = shift;
945 my $choices_ref = shift;
946 my @choices = @$choices_ref;
947 if ($choices[$choice][0]) { return 0 }
948 else { return 1 }
949
950 }
951
952
953
954
955
956
957
958
959
960
961
962
963
964