annotate merge_pcr_duplicates.py @ 50:0b9aab6aaebf draft

Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
author rnateam
date Tue, 26 Jan 2016 04:38:27 -0500
parents 570a7de9f151
children 4bedd35bcdff
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
1 #!/usr/bin/env python
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
2
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
3 import argparse
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
4 import logging
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
5 from sys import stdout
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
6 import pandas as pd
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
7 from subprocess import check_call
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
8 from shutil import rmtree
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
9 from tempfile import mkdtemp
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
10 from os.path import isfile
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
11 # avoid ugly python IOError when stdout output is piped into another program
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
12 # and then truncated (such as piping to head)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
13 from signal import signal, SIGPIPE, SIG_DFL
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
14 signal(SIGPIPE, SIG_DFL)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
15
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
16 tool_description = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
17 Merge PCR duplicates according to random barcode library.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
18
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
19 Barcodes containing uncalled base 'N' are removed. By default output is written
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
20 to stdout.
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
21
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
22 Input:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
23 * bed6 file containing alignments with fastq read-id in name field
8
17ef0e0dae68 Uploaded
rnateam
parents: 2
diff changeset
24 * fastq library of random barcodes
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
25
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
26 Output:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
27 * bed6 file with random barcode in name field and number of PCR duplicates as
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
28 score, sorted by fields chrom, start, stop, strand, name
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
29
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
30 Example usage:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
31 - read PCR duplicates from file duplicates.bed and write merged results to file
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
32 merged.bed:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
33 merge_pcr_duplicates.py duplicates.bed bclibrary.fa --out merged.bed
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
34 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
35
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
36 epilog = """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
37 Author: Daniel Maticzka
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
38 Copyright: 2015
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
39 License: Apache
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
40 Email: maticzkd@informatik.uni-freiburg.de
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
41 Status: Testing
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
42 """
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
43
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
44 # parse command line arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
45 parser = argparse.ArgumentParser(description=tool_description,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
46 epilog=epilog,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
47 formatter_class=argparse.RawDescriptionHelpFormatter)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
48 # positional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
49 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
50 "alignments",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
51 help="Path to bed6 file containing alignments.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
52 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
53 "bclib",
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
54 help="Path to fastq barcode library.")
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
55 # optional arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
56 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
57 "-o", "--outfile",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
58 help="Write results to this file.")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
59 # misc arguments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
60 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
61 "-v", "--verbose",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
62 help="Be verbose.",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
63 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
64 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
65 "-d", "--debug",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
66 help="Print lots of debugging information",
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
67 action="store_true")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
68 parser.add_argument(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
69 '--version',
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
70 action='version',
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
71 version='0.2.0')
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
72
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
73 args = parser.parse_args()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
74
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
75 if args.debug:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
76 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
77 elif args.verbose:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
78 logging.basicConfig(level=logging.INFO, format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
79 else:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
80 logging.basicConfig(format="%(filename)s - %(levelname)s - %(message)s")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
81 logging.info("Parsed arguments:")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
82 logging.info(" alignments: '{}'".format(args.alignments))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
83 logging.info(" bclib: '{}'".format(args.bclib))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
84 if args.outfile:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
85 logging.info(" outfile: enabled writing to file")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
86 logging.info(" outfile: '{}'".format(args.outfile))
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
87 logging.info("")
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
88
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
89 # see if alignments are empty and the tool can quit
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
90 n_alns = sum(1 for line in open(args.alignments))
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
91 if n_alns == 0:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
92 logging.warning("WARNING: Working on empty set of alignments, writing empty output.")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
93 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
94 eventalnout.close()
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
95 exit(0)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
96
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
97 # check input filenames
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
98 if not isfile(args.bclib):
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
99 raise Exception("ERROR: barcode library '{}' not found.")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
100 if not isfile(args.alignments):
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
101 raise Exception("ERROR: alignments '{}' not found.")
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
102
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
103 try:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
104 tmpdir = mkdtemp()
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
105 logging.debug("tmpdir: " + tmpdir)
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
106
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
107 # prepare barcode library
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
108 syscall1 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort -k1,1 > " + tmpdir + "/bclib.csv"
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
109 check_call(syscall1, shell=True)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
110
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
111 # prepare alinments
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
112 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv"
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
113 check_call(syscall2, shell=True)
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
114
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
115 # join barcode library and alignments
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
116 syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv"
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
117 check_call(syscall3, shell=True)
14
570a7de9f151 read from bam; fix header issue
rnateam
parents: 8
diff changeset
118
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
119 # get alignments combined with barcodes
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
120 bcalib = pd.read_csv(
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
121 tmpdir + "/bcalib.csv",
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
122 sep="\t",
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
123 names=["chrom", "start", "stop", "bc", "score", "strand"])
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
124 finally:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
125 logging.debug("removed tmpdir: " + tmpdir)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
126 rmtree(tmpdir)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
127
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
128 # fail if alignments given but combined library is empty
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
129 if bcalib.empty:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
130 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
131
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
132 # warn if not all alignments could be assigned a barcode
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
133 n_bcalib = len(bcalib.index)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
134 if n_bcalib < n_alns:
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
135 logging.warning(
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
136 "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns))
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
137
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
138 # remove entries with barcodes that has uncalled base N
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
139 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
140 n_bcalib_cleaned = len(bcalib_cleaned)
50
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
141 # if n_bcalib_cleaned < n_bcalib:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
142 # msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
143 # n_bcalib - n_bcalib_cleaned, n_bcalib)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
144 # if n_bcalib_cleaned < (0.8 * n_bcalib):
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
145 # logging.warning(msg)
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
146 # else:
0b9aab6aaebf Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
rnateam
parents: 14
diff changeset
147 # logging.info(msg)
2
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
148
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
149 # count and merge pcr duplicates
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
150 # grouping sorts by keys, so the ouput will be properly sorted
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
151 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
152 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
153
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
154 # write coordinates of crosslinking event alignments
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
155 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
156 merged.to_csv(
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
157 eventalnout,
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
158 columns=['chrom', 'start', 'stop', 'bc', 'ndupes', 'strand'],
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
159 sep="\t", index=False, header=False)
de4ea3aa1090 Uploaded
rnateam
parents:
diff changeset
160 eventalnout.close()