comparison rm_spurious_events.pl @ 50:0b9aab6aaebf draft

Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
author rnateam
date Tue, 26 Jan 2016 04:38:27 -0500
parents
children
comparison
equal deleted inserted replaced
49:303f6402a035 50:0b9aab6aaebf
1 #!/usr/local/perl/bin/perl
2 use feature ':5.10';
3 use strict 'vars';
4 use warnings;
5 use Getopt::Long;
6 use Pod::Usage;
7 use List::Util qw/ min max /;
8 use POSIX qw/ceil floor/;
9 use File::Temp qw(tempdir);
10 use File::Basename;
11
12 =head1 NAME
13
14 filterSquashedReads.pl --frac_max FLOAT
15
16 =head1 SYNOPSIS
17
18 this script reads sites after pcr duplicate removal.
19 for all crosslinking sites sharing start and stop coordinates, the maximum number
20 of squashed reads is determined.
21 alignments having less than frac_max * max reads are discarded
22
23 assumes bed entries to be sorted chr,strand,start,stop,score with score descending
24
25 Options:
26
27 --frac_max filter out alignments supported by less reads than this fraction of the maximum number of reads per position
28 -debug enable debug output
29 -help brief help message
30 -man full documentation
31
32 =head1 DESCRIPTION
33
34 =cut
35
36 ###############################################################################
37 # parse command line options
38 ###############################################################################
39 my $frac_max = 0.1;
40 my $help;
41 my $man;
42 my $debug;
43 my $result = GetOptions ( "frac_max=f" => \$frac_max,
44 "help" => \$help,
45 "man" => \$man,
46 "debug" => \$debug);
47 pod2usage(-exitstatus => 1, -verbose => 1) if $help;
48 pod2usage(-exitstatus => 0, -verbose => 2) if $man;
49 ($result) or pod2usage(2);
50
51 ###############################################################################
52 # main
53 ###############################################################################
54 my $current_chr = '';
55 my $current_start = -1;
56 my $current_stop = -1;
57 my $current_strand = '';
58 my $current_max = -1;
59 my $current_threshold = -1;
60
61 while (<>) {
62 my ($chr, $start, $stop, $id, $count, $strand) = split("\t");
63 # my ($count, undef, $start, $chr, $strand, $stop) = split("\t");
64
65 if ($current_start != $start or
66 $current_stop != $stop or
67 $current_chr ne $chr or
68 $current_strand ne $strand) {
69 # if this is the first occourence of these coordinates
70 # this must be the new maximum
71 # save new state
72 $current_start = $start;
73 $current_stop = $stop;
74 $current_chr = $chr;
75 $current_strand = $strand;
76 # print record and record maximum
77 $current_max = $count;
78 $current_threshold = $count*$frac_max;
79 print $_;
80 $debug and say "new threshold ${current_threshold} @ $chr $start $stop $strand $count";
81 } elsif ($count >= $current_threshold) {
82 # if it is not the first occourence, evaluate threshold and print if valid
83 print $_;
84 } else {
85 $debug and say "below threshold @ $chr $start $stop $strand $count";
86 }
87 }