diff barcode_collapse_pe.py @ 0:38a8018e57a5 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/barcode_collapse commit cb3ef4df8ea5a705039b32f6959d7513462cfcfc
author bgruening
date Sat, 11 Jun 2016 10:46:34 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/barcode_collapse_pe.py	Sat Jun 11 10:46:34 2016 -0400
@@ -0,0 +1,90 @@
+__author__ = 'gpratt'
+"""
+
+barcode_collapse.py  read in a .bam file where the
+first 9 nt of the read name
+are the barcode and merge reads mapped to the same position that have the same barcode
+
+"""
+
+from collections import Counter
+import itertools
+from optparse import OptionParser
+import sys
+import pysam
+
+
+def stranded_read_start(read):
+    if read.is_reverse:
+        return read.positions[-1]
+    else:
+        return read.pos
+
+
+def output_metrics(metrics_file, total_count, removed_count):
+    with open(metrics_file, 'w') as metrics:
+        metrics.write("\t".join(["randomer", "total_count", "removed_count"]) + "\n")
+        for barcode in total_count.keys():
+            metrics.write("\t".join(map(str, [barcode, total_count[barcode], removed_count[barcode]])) + "\n")
+
+
+def barcode_collapse(in_bam, out_bam):
+    number_of_unmapped_mate_pairs = 0
+    different_chroms = 0
+    removed_count = Counter()
+    total_count = Counter()
+    result_dict = {}
+
+    with pysam.Samfile(in_bam, 'r') as samfile1:
+        with pysam.Samfile(in_bam, 'r') as samfile2:
+            samfile_read1 = itertools.islice(samfile1, 0, None, 2)
+            samfile_read2 = itertools.islice(samfile2, 1, None, 2)
+            for read1, read2 in itertools.izip(samfile_read1, samfile_read2):
+                if not read1.qname == read2.qname:
+                    print read1.qname, read2.qname
+                    raise Exception("Read Names don't match")
+                if read1.is_unmapped and read1.is_unmapped:
+                    #Both reads don't map, don't even both saving them.
+                    continue
+                if (not read1.is_unmapped and read2.is_unmapped) or (read1.is_unmapped and read2.is_unmapped):
+                    number_of_unmapped_mate_pairs += 1
+                    continue
+                if read1.rname != read2.rname:
+                    different_chroms += 1
+                    continue
+                randomer = read1.qname.split(":")[0]
+
+                start = stranded_read_start(read1)
+                stop = stranded_read_start(read2)
+                strand = "-" if read1.is_reverse else "+"
+                unique_location = (read1.rname, start, stop, strand, randomer)
+                total_count[randomer] += 1
+                if unique_location in result_dict:
+                    removed_count[randomer] += 1
+                    continue
+
+                result_dict[(read1.rname, start, stop, strand, randomer)] = (read1, read2)
+
+        with pysam.Samfile(out_bam, 'wb', template=samfile1) as out_bam:
+            for key, (read1, read2) in result_dict.items():
+                out_bam.write(read1)
+                out_bam.write(read2)
+    return total_count, removed_count
+
+if __name__ == "__main__":
+    description=""""Paired End randomer aware duplciate removal algorithm."""
+    usage="""Assumes paired end reads are adjectent to each other in output file (ie only provide unsorted bams)
+             Also assumes no multimappers in the bam file, if there are multimappers behavior is undefined"""
+    parser = OptionParser(usage=usage, description=description)
+    parser.add_option("-b", "--bam", dest="bam", help="bam file to barcode collapse")
+    parser.add_option("-o", "--out_file", dest="out_file")
+    parser.add_option("-m", "--metrics_file", dest="metrics_file")
+    (options, args) = parser.parse_args()
+
+    if not (options.bam.endswith(".bam")):
+        raise TypeError("%s, not bam file" % options.bam)
+
+    total_count, removed_count = barcode_collapse(options.bam, options.out_file)
+    output_metrics(options.metrics_file, total_count, removed_count)
+
+    sys.exit(0)