comparison merge_pcr_duplicates.py @ 50:0b9aab6aaebf draft

Uploaded 16cfcafe8b42055c5dd64e62c42b82b455027a40
author rnateam
date Tue, 26 Jan 2016 04:38:27 -0500
parents 570a7de9f151
children 4bedd35bcdff
comparison
equal deleted inserted replaced
49:303f6402a035 50:0b9aab6aaebf
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2
3 import argparse
4 import logging
5 from sys import stdout
6 import pandas as pd
7 from subprocess import check_call
8 from shutil import rmtree
9 from tempfile import mkdtemp
10 from os.path import isfile
11 # avoid ugly python IOError when stdout output is piped into another program
12 # and then truncated (such as piping to head)
13 from signal import signal, SIGPIPE, SIG_DFL
14 signal(SIGPIPE, SIG_DFL)
2 15
3 tool_description = """ 16 tool_description = """
4 Merge PCR duplicates according to random barcode library. 17 Merge PCR duplicates according to random barcode library.
5 18
6 Barcodes containing uncalled base 'N' are removed. By default output is written 19 Barcodes containing uncalled base 'N' are removed. By default output is written
26 License: Apache 39 License: Apache
27 Email: maticzkd@informatik.uni-freiburg.de 40 Email: maticzkd@informatik.uni-freiburg.de
28 Status: Testing 41 Status: Testing
29 """ 42 """
30 43
31 import argparse
32 import logging
33 from sys import stdout
34 from Bio import SeqIO
35 import pandas as pd
36
37 # avoid ugly python IOError when stdout output is piped into another program
38 # and then truncated (such as piping to head)
39 from signal import signal, SIGPIPE, SIG_DFL
40 signal(SIGPIPE, SIG_DFL)
41
42
43 def fasta_tuple_generator(fasta_iterator):
44 "Yields id, sequence tuples given an iterator over Biopython SeqIO objects."
45 for record in input_seq_iterator:
46 yield (record.id, str(record.seq))
47
48
49 # parse command line arguments 44 # parse command line arguments
50 parser = argparse.ArgumentParser(description=tool_description, 45 parser = argparse.ArgumentParser(description=tool_description,
51 epilog=epilog, 46 epilog=epilog,
52 formatter_class=argparse.RawDescriptionHelpFormatter) 47 formatter_class=argparse.RawDescriptionHelpFormatter)
53 # positional arguments 48 # positional arguments
54 parser.add_argument( 49 parser.add_argument(
55 "alignments", 50 "alignments",
56 help="Path to bed6 file containing alignments.") 51 help="Path to bed6 file containing alignments.")
57 parser.add_argument( 52 parser.add_argument(
58 "bclib", 53 "bclib",
59 help="Path to fasta barcode library.") 54 help="Path to fastq barcode library.")
60 # optional arguments 55 # optional arguments
61 parser.add_argument( 56 parser.add_argument(
62 "-o", "--outfile", 57 "-o", "--outfile",
63 help="Write results to this file.") 58 help="Write results to this file.")
64 parser.add_argument(
65 "--fasta-library",
66 dest="fasta_library",
67 action="store_true",
68 help="Read random barcode library as fasta format.")
69 # misc arguments 59 # misc arguments
70 parser.add_argument( 60 parser.add_argument(
71 "-v", "--verbose", 61 "-v", "--verbose",
72 help="Be verbose.", 62 help="Be verbose.",
73 action="store_true") 63 action="store_true")
76 help="Print lots of debugging information", 66 help="Print lots of debugging information",
77 action="store_true") 67 action="store_true")
78 parser.add_argument( 68 parser.add_argument(
79 '--version', 69 '--version',
80 action='version', 70 action='version',
81 version='0.1.0') 71 version='0.2.0')
82 72
83 args = parser.parse_args() 73 args = parser.parse_args()
84 74
85 if args.debug: 75 if args.debug:
86 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s") 76 logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(filename)s - %(levelname)s - %(message)s")
94 if args.outfile: 84 if args.outfile:
95 logging.info(" outfile: enabled writing to file") 85 logging.info(" outfile: enabled writing to file")
96 logging.info(" outfile: '{}'".format(args.outfile)) 86 logging.info(" outfile: '{}'".format(args.outfile))
97 logging.info("") 87 logging.info("")
98 88
99 # load barcode library into dictionary 89 # see if alignments are empty and the tool can quit
100 input_handle = open(args.bclib, "rU") 90 n_alns = sum(1 for line in open(args.alignments))
101 if args.fasta_library: 91 if n_alns == 0:
102 input_seq_iterator = SeqIO.parse(input_handle, "fasta") 92 logging.warning("WARNING: Working on empty set of alignments, writing empty output.")
103 else: 93 eventalnout = (open(args.outfile, "w") if args.outfile is not None else stdout)
104 input_seq_iterator = SeqIO.parse(input_handle, "fastq") 94 eventalnout.close()
105 bcs = pd.DataFrame.from_records( 95 exit(0)
106 data=fasta_tuple_generator(input_seq_iterator),
107 columns=["read_id", "bc"])
108 96
109 # load alignments 97 # check input filenames
110 alns = pd.read_csv( 98 if not isfile(args.bclib):
111 args.alignments, 99 raise Exception("ERROR: barcode library '{}' not found.")
112 sep="\t", 100 if not isfile(args.alignments):
113 names=["chrom", "start", "stop", "read_id", "score", "strand"]) 101 raise Exception("ERROR: alignments '{}' not found.")
114 102
115 # keep id parts up to first whitespace 103 try:
116 alns["read_id"] = alns["read_id"].str.split(' ').str.get(0) 104 tmpdir = mkdtemp()
105 logging.debug("tmpdir: " + tmpdir)
117 106
118 # combine barcode library and alignments 107 # prepare barcode library
119 bcalib = pd.merge( 108 syscall1 = "cat " + args.bclib + " | awk 'BEGIN{OFS=\"\\t\"}NR%4==1{gsub(/^@/,\"\"); id=$1}NR%4==2{bc=$1}NR%4==3{print id,bc}' | sort -k1,1 > " + tmpdir + "/bclib.csv"
120 bcs, alns, 109 check_call(syscall1, shell=True)
121 on="read_id", 110
122 how="inner", 111 # prepare alinments
123 sort=False) 112 syscall2 = "cat " + args.alignments + " | awk -F \"\\t\" 'BEGIN{OFS=\"\\t\"}{split($4, a, \" \"); $4 = a[1]; print}'| sort -k4,4 > " + tmpdir + "/alns.csv"
113 check_call(syscall2, shell=True)
114
115 # join barcode library and alignments
116 syscall3 = "join -1 1 -2 4 " + tmpdir + "/bclib.csv " + tmpdir + "/alns.csv " + " | awk 'BEGIN{OFS=\"\\t\"}{print $3,$4,$5,$2,$6,$7}' > " + tmpdir + "/bcalib.csv"
117 check_call(syscall3, shell=True)
118
119 # get alignments combined with barcodes
120 bcalib = pd.read_csv(
121 tmpdir + "/bcalib.csv",
122 sep="\t",
123 names=["chrom", "start", "stop", "bc", "score", "strand"])
124 finally:
125 logging.debug("removed tmpdir: " + tmpdir)
126 rmtree(tmpdir)
127
128 # fail if alignments given but combined library is empty
124 if bcalib.empty: 129 if bcalib.empty:
125 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.") 130 raise Exception("ERROR: no common entries for alignments and barcode library found. Please check your input files.")
126 n_alns = len(alns.index) 131
132 # warn if not all alignments could be assigned a barcode
127 n_bcalib = len(bcalib.index) 133 n_bcalib = len(bcalib.index)
128 if n_bcalib < n_alns: 134 if n_bcalib < n_alns:
129 logging.warning( 135 logging.warning(
130 "{} of {} alignments could not be associated with a random barcode.".format( 136 "{} of {} alignments could not be associated with a random barcode.".format(n_alns - n_bcalib, n_alns))
131 n_alns - n_bcalib, n_alns))
132 137
133 # remove entries with barcodes that has uncalled base N 138 # remove entries with barcodes that has uncalled base N
134 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index) 139 bcalib_cleaned = bcalib.drop(bcalib[bcalib.bc.str.contains("N")].index)
135 n_bcalib_cleaned = len(bcalib_cleaned) 140 n_bcalib_cleaned = len(bcalib_cleaned)
136 if n_bcalib_cleaned < n_bcalib: 141 # if n_bcalib_cleaned < n_bcalib:
137 msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format( 142 # msg = "{} of {} alignments had random barcodes containing uncalled bases and were dropped.".format(
138 n_bcalib - n_bcalib_cleaned, n_bcalib) 143 # n_bcalib - n_bcalib_cleaned, n_bcalib)
139 if n_bcalib_cleaned < (0.8 * n_bcalib): 144 # if n_bcalib_cleaned < (0.8 * n_bcalib):
140 logging.warning(msg) 145 # logging.warning(msg)
141 else: 146 # else:
142 logging.info(msg) 147 # logging.info(msg)
143 148
144 # count and merge pcr duplicates 149 # count and merge pcr duplicates
145 # grouping sorts by keys, so the ouput will be properly sorted 150 # grouping sorts by keys, so the ouput will be properly sorted
146 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index() 151 merged = bcalib_cleaned.groupby(['chrom', 'start', 'stop', 'strand', 'bc']).size().reset_index()
147 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True) 152 merged.rename(columns={0: 'ndupes'}, copy=False, inplace=True)