Mercurial > repos > rnateam > bctools
comparison rm_spurious_events.py @ 50:0b9aab6aaebf draft
Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
author | rnateam |
---|---|
date | Tue, 26 Jan 2016 04:38:27 -0500 |
parents | de4ea3aa1090 |
children |
comparison
equal
deleted
inserted
replaced
49:303f6402a035 | 50:0b9aab6aaebf |
---|---|
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 | |
3 import argparse | |
4 import logging | |
5 from subprocess import check_call | |
6 import os | |
2 | 7 |
3 tool_description = """ | 8 tool_description = """ |
4 Remove spurious events originating from errors in random sequence tags. | 9 Remove spurious events originating from errors in random sequence tags. |
5 | 10 |
6 This script compares all events sharing the same coordinates. Among each group | 11 This script compares all events sharing the same coordinates. Among each group |
7 of events the maximum number of PCR duplicates is determined. All events that | 12 of events the maximum number of PCR duplicates is determined. All events that |
8 are supported by less than 10 percent of this maximum count are removed. | 13 are supported by less than 10 percent of this maximum count are removed. |
9 | |
10 By default output is written to stdout. | |
11 | 14 |
12 Input: | 15 Input: |
13 * bed6 file containing crosslinking events with score field set to number of PCR | 16 * bed6 file containing crosslinking events with score field set to number of PCR |
14 duplicates | 17 duplicates |
15 | 18 |
17 * bed6 file with spurious crosslinking events removed, sorted by fields chrom, | 20 * bed6 file with spurious crosslinking events removed, sorted by fields chrom, |
18 start, stop, strand | 21 start, stop, strand |
19 | 22 |
20 Example usage: | 23 Example usage: |
21 - remove spurious events from spurious.bed and write results to file cleaned.bed | 24 - remove spurious events from spurious.bed and write results to file cleaned.bed |
22 rm_spurious_events.py spurious.bed --out cleaned.bed | 25 rm_spurious_events.py spurious.bed --oufile cleaned.bed |
23 """ | 26 """ |
24 | 27 |
25 epilog = """ | 28 epilog = """ |
26 Author: Daniel Maticzka | 29 Author: Daniel Maticzka |
27 Copyright: 2015 | 30 Copyright: 2015 |
28 License: Apache | 31 License: Apache |
29 Email: maticzkd@informatik.uni-freiburg.de | 32 Email: maticzkd@informatik.uni-freiburg.de |
30 Status: Testing | 33 Status: Testing |
31 """ | 34 """ |
32 | 35 |
33 import argparse | |
34 import logging | |
35 from sys import stdout | |
36 import pandas as pd | |
37 | |
38 | 36 |
39 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, | 37 class DefaultsRawDescriptionHelpFormatter(argparse.ArgumentDefaultsHelpFormatter, |
40 argparse.RawDescriptionHelpFormatter): | 38 argparse.RawDescriptionHelpFormatter): |
41 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter | 39 # To join the behaviour of RawDescriptionHelpFormatter with that of ArgumentDefaultsHelpFormatter |
42 pass | 40 pass |
43 | 41 |
44 # avoid ugly python IOError when stdout output is piped into another program | |
45 # and then truncated (such as piping to head) | |
46 from signal import signal, SIGPIPE, SIG_DFL | |
47 signal(SIGPIPE, SIG_DFL) | |
48 | 42 |
49 # parse command line arguments | 43 def main(): |
50 parser = argparse.ArgumentParser(description=tool_description, | 44 # parse command line arguments |
51 epilog=epilog, | 45 parser = argparse.ArgumentParser(description=tool_description, |
52 formatter_class=DefaultsRawDescriptionHelpFormatter) | 46 epilog=epilog, |
53 # positional arguments | 47 formatter_class=DefaultsRawDescriptionHelpFormatter) |
54 parser.add_argument( | 48 # positional arguments |
55 "events", | 49 parser.add_argument( |
56 help="Path to bed6 file containing alignments.") | 50 "events", |
57 # optional arguments | 51 help="Path to bed6 file containing alignments.") |
58 parser.add_argument( | 52 # optional arguments |
59 "-o", "--outfile", | 53 parser.add_argument( |
60 help="Write results to this file.") | 54 "-o", "--outfile", |
61 parser.add_argument( | 55 required=True, |
62 "-t", "--threshold", | 56 help="Write results to this file.") |
63 type=float, | 57 parser.add_argument( |
64 default=0.1, | 58 "-t", "--threshold", |
65 help="Threshold for spurious event removal." | 59 type=float, |
66 ) | 60 default=0.1, |
67 # misc arguments | 61 help="Threshold for spurious event removal." |
68 parser.add_argument( | 62 ) |
69 "-v", "--verbose", | 63 # misc arguments |
70 help="Be verbose.", | 64 parser.add_argument( |
71 action="store_true") | 65 "-v", "--verbose", |
72 parser.add_argument( | 66 help="Be verbose.", |
73 "-d", "--debug", | 67 action="store_true") |
74 help="Print lots of debugging information", | 68 parser.add_argument( |
75 action="store_true") | 69 "-d", "--debug", |
76 parser.add_argument( | 70 help="Print lots of debugging information", |
77 '--version', | 71 action="store_true") |
78 action='version', | 72 parser.add_argument( |
79 version='0.1.0') | 73 '--version', |
74 action='version', | |
75 version='0.1.0') | |
80 | 76 |
81 args = parser.parse_args() | 77 args = parser.parse_args() |
82 | 78 |
83 if args.debug: | 79 if args.debug: |
84 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") | 80 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") |
85 elif args.verbose: | 81 elif args.verbose: |
86 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") | 82 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s") |
87 else: | 83 else: |
88 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") | 84 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s") |
89 logging.info("Parsed arguments:") | 85 logging.info("Parsed arguments:") |
90 logging.info(" alignments: '{}'".format(args.events)) | 86 logging.info(" alignments: '{}'".format(args.events)) |
91 logging.info(" threshold: '{}'".format(args.threshold)) | 87 logging.info(" threshold: '{}'".format(args.threshold)) |
92 if args.outfile: | 88 if args.outfile: |
93 logging.info(" outfile: enabled writing to file") | 89 logging.info(" outfile: enabled writing to file") |
94 logging.info(" outfile: '{}'".format(args.outfile)) | 90 logging.info(" outfile: '{}'".format(args.outfile)) |
95 logging.info("") | 91 logging.info("") |
96 | 92 |
97 # check threshold parameter value | 93 # check threshold parameter value |
98 if args.threshold < 0 or args.threshold > 1: | 94 if args.threshold < 0 or args.threshold > 1: |
99 raise ValueError("Threshold must be in [0,1].") | 95 raise ValueError("Threshold must be in [0,1].") |
100 | 96 |
101 # load alignments | 97 if not os.path.isfile(args.events): |
102 alns = pd.read_csv( | 98 raise Exception("ERROR: file '{}' not found.") |
103 args.events, | |
104 sep="\t", | |
105 names=["chrom", "start", "stop", "read_id", "score", "strand"]) | |
106 | 99 |
107 # remove all alignments that not enough PCR duplicates with respect to | 100 # prepare barcode library |
108 # the group maximum | 101 syscall = "cat " + args.events + " | sort -k1,1V -k6,6 -k2,2n -k3,3 -k5,5nr | perl " + os.path.dirname(os.path.realpath(__file__)) + "/rm_spurious_events.pl --frac_max " + str(args.threshold) + "| sort -k1,1V -k2,2n -k3,3n -k6,6 -k4,4 -k5,5nr > " + args.outfile |
109 grouped = alns.groupby(['chrom', 'start', 'stop', 'strand'], group_keys=False) | 102 check_call(syscall, shell=True) |
110 alns_cleaned = grouped.apply(lambda g: g[g["score"] >= args.threshold * g["score"].max()]) | |
111 | 103 |
112 # write coordinates of crosslinking event alignments | 104 |
113 alns_cleaned_out = (open(args.outfile, "w") if args.outfile is not None else stdout) | 105 if __name__ == "__main__": |
114 alns_cleaned.to_csv( | 106 main() |
115 alns_cleaned_out, | |
116 columns=['chrom', 'start', 'stop', 'read_id', 'score', 'strand'], | |
117 sep="\t", index=False, header=False) | |
118 alns_cleaned_out.close() |