Mercurial > repos > bgruening > barcode_collapse
comparison barcode_collapse_pe.py @ 0:38a8018e57a5 draft default tip
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/barcode_collapse commit cb3ef4df8ea5a705039b32f6959d7513462cfcfc
| author | bgruening |
|---|---|
| date | Sat, 11 Jun 2016 10:46:34 -0400 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:38a8018e57a5 |
|---|---|
| 1 __author__ = 'gpratt' | |
| 2 """ | |
| 3 | |
| 4 barcode_collapse.py read in a .bam file where the | |
| 5 first 9 nt of the read name | |
| 6 are the barcode and merge reads mapped to the same position that have the same barcode | |
| 7 | |
| 8 """ | |
| 9 | |
| 10 from collections import Counter | |
| 11 import itertools | |
| 12 from optparse import OptionParser | |
| 13 import sys | |
| 14 import pysam | |
| 15 | |
| 16 | |
| 17 def stranded_read_start(read): | |
| 18 if read.is_reverse: | |
| 19 return read.positions[-1] | |
| 20 else: | |
| 21 return read.pos | |
| 22 | |
| 23 | |
| 24 def output_metrics(metrics_file, total_count, removed_count): | |
| 25 with open(metrics_file, 'w') as metrics: | |
| 26 metrics.write("\t".join(["randomer", "total_count", "removed_count"]) + "\n") | |
| 27 for barcode in total_count.keys(): | |
| 28 metrics.write("\t".join(map(str, [barcode, total_count[barcode], removed_count[barcode]])) + "\n") | |
| 29 | |
| 30 | |
| 31 def barcode_collapse(in_bam, out_bam): | |
| 32 number_of_unmapped_mate_pairs = 0 | |
| 33 different_chroms = 0 | |
| 34 removed_count = Counter() | |
| 35 total_count = Counter() | |
| 36 result_dict = {} | |
| 37 | |
| 38 with pysam.Samfile(in_bam, 'r') as samfile1: | |
| 39 with pysam.Samfile(in_bam, 'r') as samfile2: | |
| 40 samfile_read1 = itertools.islice(samfile1, 0, None, 2) | |
| 41 samfile_read2 = itertools.islice(samfile2, 1, None, 2) | |
| 42 for read1, read2 in itertools.izip(samfile_read1, samfile_read2): | |
| 43 if not read1.qname == read2.qname: | |
| 44 print read1.qname, read2.qname | |
| 45 raise Exception("Read Names don't match") | |
| 46 if read1.is_unmapped and read1.is_unmapped: | |
| 47 #Both reads don't map, don't even both saving them. | |
| 48 continue | |
| 49 if (not read1.is_unmapped and read2.is_unmapped) or (read1.is_unmapped and read2.is_unmapped): | |
| 50 number_of_unmapped_mate_pairs += 1 | |
| 51 continue | |
| 52 if read1.rname != read2.rname: | |
| 53 different_chroms += 1 | |
| 54 continue | |
| 55 randomer = read1.qname.split(":")[0] | |
| 56 | |
| 57 start = stranded_read_start(read1) | |
| 58 stop = stranded_read_start(read2) | |
| 59 strand = "-" if read1.is_reverse else "+" | |
| 60 unique_location = (read1.rname, start, stop, strand, randomer) | |
| 61 total_count[randomer] += 1 | |
| 62 if unique_location in result_dict: | |
| 63 removed_count[randomer] += 1 | |
| 64 continue | |
| 65 | |
| 66 result_dict[(read1.rname, start, stop, strand, randomer)] = (read1, read2) | |
| 67 | |
| 68 with pysam.Samfile(out_bam, 'wb', template=samfile1) as out_bam: | |
| 69 for key, (read1, read2) in result_dict.items(): | |
| 70 out_bam.write(read1) | |
| 71 out_bam.write(read2) | |
| 72 return total_count, removed_count | |
| 73 | |
| 74 if __name__ == "__main__": | |
| 75 description=""""Paired End randomer aware duplciate removal algorithm.""" | |
| 76 usage="""Assumes paired end reads are adjectent to each other in output file (ie only provide unsorted bams) | |
| 77 Also assumes no multimappers in the bam file, if there are multimappers behavior is undefined""" | |
| 78 parser = OptionParser(usage=usage, description=description) | |
| 79 parser.add_option("-b", "--bam", dest="bam", help="bam file to barcode collapse") | |
| 80 parser.add_option("-o", "--out_file", dest="out_file") | |
| 81 parser.add_option("-m", "--metrics_file", dest="metrics_file") | |
| 82 (options, args) = parser.parse_args() | |
| 83 | |
| 84 if not (options.bam.endswith(".bam")): | |
| 85 raise TypeError("%s, not bam file" % options.bam) | |
| 86 | |
| 87 total_count, removed_count = barcode_collapse(options.bam, options.out_file) | |
| 88 output_metrics(options.metrics_file, total_count, removed_count) | |
| 89 | |
| 90 sys.exit(0) |
