comparison barcode_collapse_pe.py @ 0:38a8018e57a5 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/barcode_collapse commit cb3ef4df8ea5a705039b32f6959d7513462cfcfc
author bgruening
date Sat, 11 Jun 2016 10:46:34 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:38a8018e57a5
1 __author__ = 'gpratt'
2 """
3
4 barcode_collapse.py read in a .bam file where the
5 first 9 nt of the read name
6 are the barcode and merge reads mapped to the same position that have the same barcode
7
8 """
9
10 from collections import Counter
11 import itertools
12 from optparse import OptionParser
13 import sys
14 import pysam
15
16
17 def stranded_read_start(read):
18 if read.is_reverse:
19 return read.positions[-1]
20 else:
21 return read.pos
22
23
24 def output_metrics(metrics_file, total_count, removed_count):
25 with open(metrics_file, 'w') as metrics:
26 metrics.write("\t".join(["randomer", "total_count", "removed_count"]) + "\n")
27 for barcode in total_count.keys():
28 metrics.write("\t".join(map(str, [barcode, total_count[barcode], removed_count[barcode]])) + "\n")
29
30
31 def barcode_collapse(in_bam, out_bam):
32 number_of_unmapped_mate_pairs = 0
33 different_chroms = 0
34 removed_count = Counter()
35 total_count = Counter()
36 result_dict = {}
37
38 with pysam.Samfile(in_bam, 'r') as samfile1:
39 with pysam.Samfile(in_bam, 'r') as samfile2:
40 samfile_read1 = itertools.islice(samfile1, 0, None, 2)
41 samfile_read2 = itertools.islice(samfile2, 1, None, 2)
42 for read1, read2 in itertools.izip(samfile_read1, samfile_read2):
43 if not read1.qname == read2.qname:
44 print read1.qname, read2.qname
45 raise Exception("Read Names don't match")
46 if read1.is_unmapped and read1.is_unmapped:
47 #Both reads don't map, don't even both saving them.
48 continue
49 if (not read1.is_unmapped and read2.is_unmapped) or (read1.is_unmapped and read2.is_unmapped):
50 number_of_unmapped_mate_pairs += 1
51 continue
52 if read1.rname != read2.rname:
53 different_chroms += 1
54 continue
55 randomer = read1.qname.split(":")[0]
56
57 start = stranded_read_start(read1)
58 stop = stranded_read_start(read2)
59 strand = "-" if read1.is_reverse else "+"
60 unique_location = (read1.rname, start, stop, strand, randomer)
61 total_count[randomer] += 1
62 if unique_location in result_dict:
63 removed_count[randomer] += 1
64 continue
65
66 result_dict[(read1.rname, start, stop, strand, randomer)] = (read1, read2)
67
68 with pysam.Samfile(out_bam, 'wb', template=samfile1) as out_bam:
69 for key, (read1, read2) in result_dict.items():
70 out_bam.write(read1)
71 out_bam.write(read2)
72 return total_count, removed_count
73
74 if __name__ == "__main__":
75 description=""""Paired End randomer aware duplciate removal algorithm."""
76 usage="""Assumes paired end reads are adjectent to each other in output file (ie only provide unsorted bams)
77 Also assumes no multimappers in the bam file, if there are multimappers behavior is undefined"""
78 parser = OptionParser(usage=usage, description=description)
79 parser.add_option("-b", "--bam", dest="bam", help="bam file to barcode collapse")
80 parser.add_option("-o", "--out_file", dest="out_file")
81 parser.add_option("-m", "--metrics_file", dest="metrics_file")
82 (options, args) = parser.parse_args()
83
84 if not (options.bam.endswith(".bam")):
85 raise TypeError("%s, not bam file" % options.bam)
86
87 total_count, removed_count = barcode_collapse(options.bam, options.out_file)
88 output_metrics(options.metrics_file, total_count, removed_count)
89
90 sys.exit(0)