Mercurial > repos > mvdbeek > damidseq_core
comparison find_peaks @ 0:a9c5f3773770 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/damidseq_findpeaks commit 87c99a97cb2ce55640afdd2e55c8b3ae5ad99324
| author | mvdbeek | 
|---|---|
| date | Fri, 20 Apr 2018 04:31:09 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:a9c5f3773770 | 
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 | |
| 3 # Copyright © 2012-2015, Owen Marshall | |
| 4 | |
| 5 # This program is free software; you can redistribute it and/or modify | |
| 6 # it under the terms of the GNU General Public License as published by | |
| 7 # the Free Software Foundation; either version 2 of the License, or (at | |
| 8 # your option) any later version. | |
| 9 # | |
| 10 # This program is distributed in the hope that it will be useful, but | |
| 11 # WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
| 13 # General Public License for more details. | |
| 14 # | |
| 15 # You should have received a copy of the GNU General Public License | |
| 16 # along with this program; if not, write to the Free Software | |
| 17 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 | |
| 18 # USA | |
| 19 | |
| 20 use strict; | |
| 21 use File::Basename; | |
| 22 use 5.010; | |
| 23 $|++; | |
| 24 | |
| 25 my $version = "1.0.1"; | |
| 26 | |
| 27 print STDERR "\nfind_peaks v$version\nCopyright © 2012-15, Owen Marshall\n\n"; | |
| 28 | |
| 29 my %vars = ( | |
| 30 'fdr' => 0.01, | |
| 31 'min_count' => 2, | |
| 32 'n' => 100, | |
| 33 'frac' => 0, | |
| 34 'min_quant' => 0.95, | |
| 35 'step' => 0.01, | |
| 36 'unified_peaks' => 'max', | |
| 37 ); | |
| 38 | |
| 39 my %vars_details = ( | |
| 40 'fdr' => 'False discovery rate value', | |
| 41 'min_count' => 'Minimum number of fragments to consider as a peak', | |
| 42 'n' => 'Number of iterations', | |
| 43 'frac' => 'Number of random fragments to consider per iteration', | |
| 44 'min_quant' => 'Minimum quantile for considering peaks', | |
| 45 'step' => 'Stepping for quantiles', | |
| 46 'unified_peaks' => "Method for calling peak overlaps (two options):\n\r'min': call minimum overlapping peak area\n\r'max': call maximum overlap as peak", | |
| 47 ); | |
| 48 | |
| 49 | |
| 50 my @in_files; | |
| 51 process_cli(); | |
| 52 | |
| 53 # Time and date | |
| 54 my ($sec,$min,$hour,$mday,$mon,$year) = localtime(); | |
| 55 my $date = sprintf("%04d-%02d-%02d.%02d-%02d-%02d",$year+1900,$mon+1,$mday,$hour,$min,$sec); | |
| 56 | |
| 57 help() unless @in_files; | |
| 58 | |
| 59 foreach my $fn (@in_files) { | |
| 60 my @in; | |
| 61 my @unified_peaks; | |
| 62 my @sig_peaks; | |
| 63 my @peakmins; | |
| 64 | |
| 65 my %peaks; | |
| 66 my %peak_count; | |
| 67 my %peak_count_real; | |
| 68 my %log_scores; | |
| 69 my %regression; | |
| 70 my %peak_fdr_cutoff; | |
| 71 my %fdr; | |
| 72 | |
| 73 # Output file names | |
| 74 my ($name,$dir,$ext) = fileparse($fn, qr/\.[^.]*/); | |
| 75 | |
| 76 # filenames | |
| 77 my $fn_base_date = "peak_analysis.".$name.".$date"; | |
| 78 my $base_dir = "$fn_base_date/"; | |
| 79 my $out = "$base_dir"."$name-FDR$vars{'fdr'}"; | |
| 80 my $out_peak_unified_track = $out.".peaks.gff"; | |
| 81 my $out_peaks = $out."_FDR-data"; | |
| 82 | |
| 83 # Load gff data files | |
| 84 load_gff(FILE=>$fn, IN_REF=>\@in); | |
| 85 | |
| 86 my $probes = @in; | |
| 87 | |
| 88 find_quants(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins); | |
| 89 find_randomised_peaks(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks); | |
| 90 | |
| 91 # Make directory | |
| 92 mkdir($base_dir); | |
| 93 | |
| 94 # Open peaks file for writing | |
| 95 open(OUTP, ">$out_peaks")|| die "Cannot open peak file for writing: $!\n"; | |
| 96 print OUTP "FDR peak call v$version\n\n"; | |
| 97 print OUTP "Input file: $fn\n"; | |
| 98 | |
| 99 calculate_regressions(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAKS=>\%peaks, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression); | |
| 100 | |
| 101 call_peaks_unified_redux(ITER=>1, REAL=>1, AREF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAKS=>\%peaks); | |
| 102 | |
| 103 calculate_fdr(IN_REF=>\@in, PEAKMINS_REF=>\@peakmins, PEAK_COUNT=>\%peak_count, PEAK_COUNT_REAL=>\%peak_count_real, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr, LOG_SCORES=>\%log_scores, REGRESSION=>\%regression); | |
| 104 | |
| 105 find_significant_peaks(PEAKMINS=>\@peakmins, SIG_PEAKS=>\@sig_peaks, PEAKS=>\%peaks, PEAK_FDR_CUTOFF=>\%peak_fdr_cutoff, FDR=>\%fdr); | |
| 106 | |
| 107 make_unified_peaks(SIG_PEAKS=>\@sig_peaks, UNIFIED_PEAKS=>\@unified_peaks, OUT=>$out_peak_unified_track, TYPE=>$vars{'unified_peaks'}); | |
| 108 | |
| 109 print STDERR "$#unified_peaks peaks found.\n\n"; | |
| 110 | |
| 111 close OUTP; | |
| 112 } | |
| 113 | |
| 114 print STDERR "All done.\n\n"; | |
| 115 exit 0; | |
| 116 | |
| 117 | |
| 118 ###################### | |
| 119 # Subroutines start here | |
| 120 # | |
| 121 | |
| 122 sub find_significant_peaks { | |
| 123 my (%opts) = @_; | |
| 124 | |
| 125 my $peakmins = $opts{PEAKMINS}; | |
| 126 my $peaks = $opts{PEAKS}; | |
| 127 my $sig_peaks = $opts{SIG_PEAKS}; | |
| 128 my $fdr = $opts{FDR}; | |
| 129 my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF}; | |
| 130 | |
| 131 # Generate significant peaks and unify peaks | |
| 132 print STDERR "Selecting significant peaks ... \n"; | |
| 133 | |
| 134 foreach my $pm (@$peakmins) { | |
| 135 for my $i (0 .. $#{$$peaks{$pm}}) { | |
| 136 my ($chr, $pstart, $pend, $mean_pscore, $pscore, $count, $size) = @{ $$peaks{$pm}[$i] }; | |
| 137 if ($count >= $$peak_fdr_cutoff{$pm}) { | |
| 138 push (@$sig_peaks, [ @{$$peaks{$pm}[$i]}, $$fdr{$pm}[$count] ]); | |
| 139 } | |
| 140 } | |
| 141 } | |
| 142 | |
| 143 print OUTP "\nNumber of peaks: $#$sig_peaks\n"; | |
| 144 } | |
| 145 | |
| 146 sub calculate_fdr { | |
| 147 my (%opts) = @_; | |
| 148 my $in_ref = $opts{IN_REF}; | |
| 149 my $peakmins = $opts{PEAKMINS_REF}; | |
| 150 my $peak_count = $opts{PEAK_COUNT}; | |
| 151 my $log_scores = $opts{LOG_SCORES}; | |
| 152 my $regression = $opts{REGRESSION}; | |
| 153 my $fdr = $opts{FDR}; | |
| 154 my $peak_fdr_cutoff = $opts{PEAK_FDR_CUTOFF}; | |
| 155 my $peak_count_real = $opts{PEAK_COUNT_REAL}; | |
| 156 | |
| 157 foreach my $pm (@$peakmins) { | |
| 158 # get regression variables | |
| 159 my ($m,$b) = @{$$regression{$pm}} if $$regression{$pm}[0]; | |
| 160 | |
| 161 for my $i (0 .. $#{$$peak_count_real{$pm}}) { | |
| 162 next unless $$peak_count_real{$pm}[$i]; | |
| 163 my $expect = 10**($m*$i + $b); | |
| 164 | |
| 165 my $real_count = $$peak_count_real{$pm}[$i]; | |
| 166 my $fdr_conservative = $expect/$real_count; | |
| 167 $$fdr{$pm}[$i]= $fdr_conservative; | |
| 168 } | |
| 169 } | |
| 170 | |
| 171 # print FDR rates | |
| 172 print OUTP "\n"; | |
| 173 | |
| 174 foreach my $pm (@$peakmins) { | |
| 175 print OUTP "Peak min = $pm\n"; | |
| 176 for my $c (0 .. $#{$$fdr{$pm}}) { | |
| 177 next unless defined($$fdr{$pm}[$c]); | |
| 178 print OUTP "Peak size: $c\tCount: $$peak_count_real{$pm}[$c]\tFDR: $$fdr{$pm}[$c]\n"; | |
| 179 $$peak_fdr_cutoff{$pm} = $c if (($$fdr{$pm}[$c]<$vars{'fdr'}) && (!$$peak_fdr_cutoff{$pm})); | |
| 180 } | |
| 181 $$peak_fdr_cutoff{$pm}||= 10**10; # clumsy hack to prevent errors | |
| 182 print OUTP "\n"; | |
| 183 } | |
| 184 | |
| 185 foreach my $pm (@$peakmins) { | |
| 186 print OUTP "Peak min $pm: peak cutoff size for alpha = $vars{'fdr'} was $$peak_fdr_cutoff{$pm}\n\n"; | |
| 187 } | |
| 188 } | |
| 189 | |
| 190 sub calculate_regressions { | |
| 191 my (%opts) = @_; | |
| 192 my $in_ref = $opts{IN_REF}; | |
| 193 my $peakmins = $opts{PEAKMINS_REF}; | |
| 194 my $peaks = $opts{PEAKS}; | |
| 195 my $peak_count = $opts{PEAK_COUNT}; | |
| 196 my $log_scores = $opts{LOG_SCORES}; | |
| 197 my $regression = $opts{REGRESSION}; | |
| 198 | |
| 199 my $in_num = @$in_ref; | |
| 200 | |
| 201 foreach my $pm (@$peakmins) { | |
| 202 print OUTP "Peak min = $pm\n"; | |
| 203 for my $c (0 .. $#{$$peak_count{$pm}}) { | |
| 204 my $peak_count_avg = $$peak_count{$pm}[$c]/$vars{'n'} if $$peak_count{$pm}[$c]; | |
| 205 next unless $peak_count_avg; | |
| 206 | |
| 207 if ($vars{'frac'}) { | |
| 208 $peak_count_avg = $peak_count_avg * $in_num/$vars{'frac'}; | |
| 209 } | |
| 210 $$log_scores{$pm}[$c] = log($peak_count_avg)/log(10); | |
| 211 print OUTP "Peak size: $c\tCount:$peak_count_avg\n"; | |
| 212 } | |
| 213 | |
| 214 # calculate exponential decay rates | |
| 215 # y= a+bx for log(y) | |
| 216 my ($sumx, $sumy, $sumxsq, $sumxy); | |
| 217 my $n=0; | |
| 218 for my $i (0 .. $#{$$peak_count{$pm}}) { | |
| 219 next unless $$peak_count{$pm}[$i]; | |
| 220 $n++; | |
| 221 $sumx += $i; | |
| 222 $sumy += $$log_scores{$pm}[$i]; | |
| 223 $sumxsq += $i ** 2; | |
| 224 $sumxy += $i * $$log_scores{$pm}[$i]; | |
| 225 } | |
| 226 | |
| 227 next unless $n > 1; | |
| 228 | |
| 229 my $mean_x = $sumx/$n; | |
| 230 my $mean_y = $sumy/$n; | |
| 231 my $mean_xsq = $sumxsq/$n; | |
| 232 my $mean_xy = $sumxy/$n; | |
| 233 | |
| 234 my $b = ($mean_xy - ($mean_x * $mean_y)) / ($mean_xsq - ($mean_x **2)); | |
| 235 my $a = $mean_y - ($b * $mean_x); | |
| 236 | |
| 237 # store values | |
| 238 $$regression{$pm}=[$b, $a]; | |
| 239 print OUTP "regression: log(y) = $b(x) + $a\n"; | |
| 240 | |
| 241 for my $i (0 .. $#{$$peak_count{$pm}}) { | |
| 242 next unless $$peak_count{$pm}[$i]; | |
| 243 my ($b,$a) = @{$$regression{$pm}}; | |
| 244 my $logval = ($b*$i + $a); | |
| 245 my $val = 10**$logval; | |
| 246 print OUTP "lin regress: $i\t$$log_scores{$pm}[$i]\t$logval\t$val\n"; | |
| 247 } | |
| 248 | |
| 249 print OUTP "\n"; | |
| 250 } | |
| 251 } | |
| 252 | |
| 253 sub find_randomised_peaks { | |
| 254 my (%opts) = @_; | |
| 255 my $in_ref = $opts{IN_REF}; | |
| 256 my $peakmins_ref = $opts{PEAKMINS_REF}; | |
| 257 my $peaks = $opts{PEAKS}; | |
| 258 my $peak_count = $opts{PEAK_COUNT}; | |
| 259 | |
| 260 print STDERR "Duplicating ... \n"; | |
| 261 my @inr = @$in_ref; | |
| 262 | |
| 263 # Call peaks on input file | |
| 264 print STDERR "Calling peaks on input file ...\n"; | |
| 265 | |
| 266 for my $iter (1 .. $vars{'n'}) { | |
| 267 #print STDERR "Iteration $iter ... \r"; | |
| 268 print STDERR "Iteration $iter: [shuffling] \r"; | |
| 269 | |
| 270 if ($vars{'frac'}) { | |
| 271 # only use a fraction of the array per rep | |
| 272 my @a = inside_out(\@inr, $vars{'frac'}); | |
| 273 call_peaks_unified_redux(ITER=>$iter, AREF=>\@a, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count); | |
| 274 } else { | |
| 275 shuffle(\@inr); | |
| 276 call_peaks_unified_redux(ITER=>$iter, AREF=>\@inr, PEAKMINS_REF=>$peakmins_ref, PEAK_COUNT=>$peak_count); | |
| 277 } | |
| 278 } | |
| 279 | |
| 280 } | |
| 281 | |
| 282 sub call_peaks_unified_redux { | |
| 283 my (%opts) = @_; | |
| 284 my $iter = $opts{ITER}; | |
| 285 my $real = $opts{REAL}; | |
| 286 my $a = $opts{AREF}; | |
| 287 my $peakmins_ref = $opts{PEAKMINS_REF}; | |
| 288 my $peak_count_real = $opts{PEAK_COUNT_REAL}; | |
| 289 my $peaks = $opts{PEAKS}; | |
| 290 my $peak_count = $opts{PEAK_COUNT}; | |
| 291 | |
| 292 my ($pstart, $pend, $inpeak, $pscore, $count); | |
| 293 $pstart=$pend=$pscore=$inpeak=$count=0; | |
| 294 | |
| 295 my @tmp_peak; | |
| 296 my $total = $#$a; | |
| 297 | |
| 298 if ($real) { | |
| 299 print STDERR "Calling real peaks ... \r"; | |
| 300 } else { | |
| 301 print STDERR "Iteration $iter: [processing ...] \r"; | |
| 302 } | |
| 303 | |
| 304 my $old_chr=""; | |
| 305 foreach my $pm (@$peakmins_ref) { | |
| 306 for my $i (0 .. $total) { | |
| 307 my ($chr, $start, $end, $score) = @{ @$a[$i] }; | |
| 308 next unless $score; | |
| 309 | |
| 310 if ($real) { | |
| 311 unless ($chr eq $old_chr) { | |
| 312 # Next chromosome | |
| 313 # (Peaks can't carry over chromosomes, but we don't use this shortcut when randomly shuffling) | |
| 314 $pstart=$pend=$pscore=$inpeak=$count=0; | |
| 315 @tmp_peak = () if $real; | |
| 316 } | |
| 317 } | |
| 318 $old_chr = $chr if $real; | |
| 319 | |
| 320 unless ($inpeak) { | |
| 321 next unless $score >= $pm; | |
| 322 # record new peak | |
| 323 $pstart = $start; | |
| 324 $pend = $end; | |
| 325 $pscore = $score * ($end-$start)/1000; | |
| 326 $count++; | |
| 327 push @tmp_peak, $score if $real; | |
| 328 $inpeak = 1; | |
| 329 } else { | |
| 330 if ($score >= $pm) { | |
| 331 # still in peak | |
| 332 $count++; | |
| 333 | |
| 334 # Fragment score to deal with scoring peaks made from uneven sized fragments | |
| 335 my $fragment_score = $score * ($end-$start)/1000; | |
| 336 | |
| 337 push @tmp_peak, $score if $real; | |
| 338 $pscore += $fragment_score; | |
| 339 $pend = $end; | |
| 340 } else { | |
| 341 # out of a peak | |
| 342 if ($count >= $vars{'min_count'}) { | |
| 343 # record peak | |
| 344 if ($real) { | |
| 345 $$peak_count_real{$pm}[$count]++; | |
| 346 my $mean_pscore = sprintf('%0.2f',($pscore/(($pend-$pstart)/1000))); | |
| 347 push (@{$$peaks{$pm}},[($chr, $pstart, $pend, $mean_pscore, $pscore, $count, ($pend-$pstart))]); | |
| 348 } else { | |
| 349 $$peak_count{$pm}[$count]++; | |
| 350 } | |
| 351 } | |
| 352 | |
| 353 # reset | |
| 354 $pstart=$pend=$pscore=$inpeak=$count=0; | |
| 355 @tmp_peak = () if $real; | |
| 356 } | |
| 357 } | |
| 358 } | |
| 359 } | |
| 360 } | |
| 361 | |
| 362 | |
| 363 sub shuffle { | |
| 364 # Fisher-Yates shuffle (Knuth shuffle) | |
| 365 my ($array) = @_; | |
| 366 my $i = @$array; | |
| 367 while ( --$i ) { | |
| 368 my $j = int rand( $i+1 ); | |
| 369 @$array[$i,$j] = @$array[$j,$i]; | |
| 370 } | |
| 371 } | |
| 372 | |
| 373 sub inside_out { | |
| 374 my ($array, $frac) = @_; | |
| 375 my @a; | |
| 376 for my $i (0 .. $frac-1) { | |
| 377 my $j = int rand( $i+1 ); | |
| 378 if ($j != $i) { | |
| 379 $a[$i] = $a[$j] | |
| 380 } | |
| 381 $a[$j] = @$array[$i] | |
| 382 } | |
| 383 return @a; | |
| 384 } | |
| 385 | |
| 386 sub load_gff { | |
| 387 my (%opts) = @_; | |
| 388 my $fn = $opts{FILE}; | |
| 389 my $in_ref = $opts{IN_REF}; | |
| 390 | |
| 391 print STDERR "Reading input file: $fn ...\n"; | |
| 392 open (IN, "<$fn") || die "Unable to read $fn: $!\n"; | |
| 393 | |
| 394 my $i; | |
| 395 while (<IN>) { | |
| 396 $i++; | |
| 397 print STDERR "Read $i lines ...\r" if $i%10000 == 0; | |
| 398 chomp; | |
| 399 | |
| 400 my @line = split('\t'); | |
| 401 | |
| 402 my ($chr, $start, $end, $score); | |
| 403 if ($#line == 3) { | |
| 404 # bedgraph | |
| 405 ($chr, $start, $end, $score) = @line; | |
| 406 } else { | |
| 407 # GFF | |
| 408 ($chr, $start, $end, $score) = @line[0,3,4,5]; | |
| 409 } | |
| 410 | |
| 411 next unless $start; | |
| 412 | |
| 413 $score = 0 if $score eq "NA"; | |
| 414 | |
| 415 push (@$in_ref, [$chr, $start, $end, $score]); | |
| 416 } | |
| 417 | |
| 418 close (IN); | |
| 419 | |
| 420 print STDERR "Sorting ... \n"; | |
| 421 @$in_ref = sort { $a->[1] <=> $b->[1] } @$in_ref; | |
| 422 @$in_ref = sort { $a->[0] cmp $b->[0] } @$in_ref; | |
| 423 } | |
| 424 | |
| 425 sub find_quants { | |
| 426 my (%opts) = @_; | |
| 427 | |
| 428 my $in_ref = $opts{IN_REF}; | |
| 429 my $peakmins_ref = $opts{PEAKMINS_REF}; | |
| 430 | |
| 431 my %seg; | |
| 432 my @frags; | |
| 433 | |
| 434 my $total_coverage; | |
| 435 | |
| 436 foreach my $l (@$in_ref) { | |
| 437 my ($chr, $start, $end, $score) = @$l; | |
| 438 next unless $score; | |
| 439 $score = 0 if $score eq "NA"; | |
| 440 $total_coverage += $end-$start; | |
| 441 push @frags, $score; | |
| 442 } | |
| 443 | |
| 444 @frags = sort {$a <=> $b} @frags; | |
| 445 | |
| 446 print STDERR "Total coverage was $total_coverage bp\n"; | |
| 447 | |
| 448 my @quants; | |
| 449 for (my $q=0;$q<=1;$q+=$vars{'step'}) { | |
| 450 push @quants, [$q, int($q * @frags)] if $q > $vars{'min_quant'}; | |
| 451 } | |
| 452 | |
| 453 foreach (@quants) { | |
| 454 my $cut_off = @{$_}[0]; | |
| 455 my $score = $frags[@{$_}[1]]; | |
| 456 printf(" Quantile %0.2f: %0.2f\n",$cut_off,$score); | |
| 457 $seg{$cut_off} = $score; | |
| 458 } | |
| 459 | |
| 460 foreach my $c (sort {$a <=> $b} keys %seg) { | |
| 461 push (@$peakmins_ref, $seg{$c}); | |
| 462 } | |
| 463 } | |
| 464 | |
| 465 sub make_unified_peaks { | |
| 466 my (%opts) = @_; | |
| 467 my $ref = $opts{SIG_PEAKS}; | |
| 468 my $out_peak_unified_track = $opts{OUT}; | |
| 469 my $unified_peaks = $opts{UNIFIED_PEAKS}; | |
| 470 my $type = $opts{TYPE}; | |
| 471 my $total = @$ref; | |
| 472 | |
| 473 # Unify overlapping peaks, and make significant peaks file | |
| 474 my $skipped_peaks; | |
| 475 print STDERR "Combining significant peaks ...\n"; | |
| 476 | |
| 477 # unroll chromosomes for speed | |
| 478 foreach my $chr (uniq( map @{$_}[0], @$ref )) { | |
| 479 my @c = grep {@{$_}[0] eq $chr} @$ref; | |
| 480 | |
| 481 my @unified_peaks_chr; | |
| 482 foreach my $ar (@c) { | |
| 483 if (state $i++ % 100 == 0) { | |
| 484 my $pc = sprintf("%0.2f",($i*100)/$total); | |
| 485 print STDERR "$pc\% processed ...\r"; | |
| 486 } | |
| 487 | |
| 488 my ($chra, $start, $end, $score, $total_score, $count, $peaklen, $fdr) = @$ar; | |
| 489 | |
| 490 # next if @unified_peaks_chr already overlaps | |
| 491 next if grep { | |
| 492 @{$_}[3] < $end | |
| 493 && @{$_}[4] > $start | |
| 494 } @unified_peaks_chr; | |
| 495 | |
| 496 # Grab all elements that overlap | |
| 497 my @test = grep { | |
| 498 @{$_}[1] < $end | |
| 499 && @{$_}[2] > $start | |
| 500 } @c; | |
| 501 | |
| 502 for my $j (0 .. $#test) { | |
| 503 my ($chr1, $start1, $end1, $score1, $total_score1, $count1, $peaklen1, $fdr1) = @{$test[$j]}; | |
| 504 | |
| 505 next unless $start1 < $end; | |
| 506 next unless $end1 > $start; | |
| 507 | |
| 508 if ($type eq 'min') { | |
| 509 $start = max($start, $start1); | |
| 510 $end = min($end, $end1); | |
| 511 } else { | |
| 512 $start = min($start, $start1); | |
| 513 $end = max($end, $end1); | |
| 514 } | |
| 515 | |
| 516 $score = max($score, $score1); | |
| 517 $fdr = min($fdr, $fdr1); | |
| 518 } | |
| 519 | |
| 520 push @unified_peaks_chr, [($chr, '.', '.', $start, $end, $score, '.', '.', "FDR=$fdr")]; | |
| 521 } | |
| 522 | |
| 523 @$unified_peaks = (@$unified_peaks, @unified_peaks_chr); | |
| 524 } | |
| 525 | |
| 526 $total = $#$unified_peaks; | |
| 527 | |
| 528 print STDERR "Sorting unified peaks ...\n"; | |
| 529 @$unified_peaks = sort { $a->[3] <=> $b->[3] } @$unified_peaks; | |
| 530 @$unified_peaks = sort { $a->[0] cmp $b->[0] } @$unified_peaks; | |
| 531 | |
| 532 print STDERR "Writing unified peaks file ...\n"; | |
| 533 open(PEAKOUTUNI, ">$out_peak_unified_track") || die "Unable to open peak output track for writing: $!\n\n"; | |
| 534 for my $j (0 .. $#$unified_peaks) { | |
| 535 print PEAKOUTUNI join("\t", @{$$unified_peaks[$j]}), "\n"; | |
| 536 } | |
| 537 } | |
| 538 | |
| 539 sub max { | |
| 540 my ($max, @vars) = @_; | |
| 541 my $index=0; | |
| 542 $max||=0; | |
| 543 for my $i (0..$#vars) { | |
| 544 ($max, $index) = ($vars[$i], $i+1) if $vars[$i] > $max; | |
| 545 } | |
| 546 return $max; | |
| 547 } | |
| 548 | |
| 549 sub min { | |
| 550 my ($min, @vars) = @_; | |
| 551 my $index=0; | |
| 552 $min||=0; | |
| 553 for my $i (0..$#vars) { | |
| 554 ($min, $index) = ($vars[$i],$i+1) if $vars[$i] < $min; | |
| 555 } | |
| 556 return $min; | |
| 557 } | |
| 558 | |
| 559 sub uniq { | |
| 560 my %seen; | |
| 561 return grep { !$seen{$_}++ } @_; | |
| 562 } | |
| 563 | |
| 564 | |
| 565 | |
| 566 sub process_cli { | |
| 567 foreach (@ARGV) { | |
| 568 if (/--(.*)=(.*)/) { | |
| 569 unless (defined($vars{$1})) { | |
| 570 print STDERR "Did not understand $_ ...\n"; | |
| 571 help(); | |
| 572 } | |
| 573 my ($v, $opt) = ($1,$2); | |
| 574 $vars{$v} = $opt; | |
| 575 next; | |
| 576 } elsif (/--h[elp]*/) { | |
| 577 help(); | |
| 578 } elsif (/--(.*)/) { | |
| 579 print STDERR "Please add a parameter to $_ ...\n\n"; | |
| 580 exit 1; | |
| 581 } | |
| 582 push @in_files, $_; | |
| 583 } | |
| 584 } | |
| 585 | |
| 586 | |
| 587 sub help { | |
| 588 print STDOUT <<EOT; | |
| 589 Simple FDR random permutation peak caller | |
| 590 Usage: [options] [files in bedgraph or GFF format] | |
| 591 | |
| 592 Options: | |
| 593 EOT | |
| 594 | |
| 595 my $opt_len = 0; | |
| 596 foreach (keys %vars) { | |
| 597 my $l = length($_); | |
| 598 $opt_len = $l if $l > $opt_len; | |
| 599 } | |
| 600 | |
| 601 $opt_len+=2; | |
| 602 | |
| 603 my $cols= `tput cols` || 80; | |
| 604 | |
| 605 my ($v, $val, $def, $def_format); | |
| 606 my $help_format = "format STDOUT =\n" | |
| 607 .' '.'^'.'<'x$opt_len . ' '. '^' . '<'x($cols-$opt_len-4) . "\n" | |
| 608 .'$v, $def_format'."\n" | |
| 609 .' '.'^'.'<'x$opt_len . ' '. '^' . '<'x($cols-$opt_len-6) . "~~\n" | |
| 610 .'$v, $def_format'."\n" | |
| 611 .".\n"; | |
| 612 | |
| 613 eval $help_format; | |
| 614 die $@ if $@; | |
| 615 | |
| 616 foreach my $k (sort (keys %vars)) { | |
| 617 ($v, $val, $def) = ($k, $vars{$k}, $vars_details{$k}); | |
| 618 $def||=""; | |
| 619 $def_format = $val ? "$def\n\r[Current value: $val]" : $def; | |
| 620 $v = "--$v"; | |
| 621 # format = | |
| 622 # ^<<<<<<<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< | |
| 623 #$v, $def_format | |
| 624 # ^<<<<<<<<<<<<<<<<<<<< ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< ~~ | |
| 625 #$v, $def_format | |
| 626 #. | |
| 627 | |
| 628 write(); | |
| 629 | |
| 630 } | |
| 631 print STDOUT "\n"; | |
| 632 exit 1; | |
| 633 } | 
