view barcode_collapse_pe.py @ 0:38a8018e57a5 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/barcode_collapse commit cb3ef4df8ea5a705039b32f6959d7513462cfcfc
author bgruening
date Sat, 11 Jun 2016 10:46:34 -0400
parents
children
line wrap: on
line source

__author__ = 'gpratt'
"""

barcode_collapse.py  read in a .bam file where the
first 9 nt of the read name
are the barcode and merge reads mapped to the same position that have the same barcode

"""

from collections import Counter
import itertools
from optparse import OptionParser
import sys
import pysam


def stranded_read_start(read):
    if read.is_reverse:
        return read.positions[-1]
    else:
        return read.pos


def output_metrics(metrics_file, total_count, removed_count):
    with open(metrics_file, 'w') as metrics:
        metrics.write("\t".join(["randomer", "total_count", "removed_count"]) + "\n")
        for barcode in total_count.keys():
            metrics.write("\t".join(map(str, [barcode, total_count[barcode], removed_count[barcode]])) + "\n")


def barcode_collapse(in_bam, out_bam):
    number_of_unmapped_mate_pairs = 0
    different_chroms = 0
    removed_count = Counter()
    total_count = Counter()
    result_dict = {}

    with pysam.Samfile(in_bam, 'r') as samfile1:
        with pysam.Samfile(in_bam, 'r') as samfile2:
            samfile_read1 = itertools.islice(samfile1, 0, None, 2)
            samfile_read2 = itertools.islice(samfile2, 1, None, 2)
            for read1, read2 in itertools.izip(samfile_read1, samfile_read2):
                if not read1.qname == read2.qname:
                    print read1.qname, read2.qname
                    raise Exception("Read Names don't match")
                if read1.is_unmapped and read1.is_unmapped:
                    #Both reads don't map, don't even both saving them.
                    continue
                if (not read1.is_unmapped and read2.is_unmapped) or (read1.is_unmapped and read2.is_unmapped):
                    number_of_unmapped_mate_pairs += 1
                    continue
                if read1.rname != read2.rname:
                    different_chroms += 1
                    continue
                randomer = read1.qname.split(":")[0]

                start = stranded_read_start(read1)
                stop = stranded_read_start(read2)
                strand = "-" if read1.is_reverse else "+"
                unique_location = (read1.rname, start, stop, strand, randomer)
                total_count[randomer] += 1
                if unique_location in result_dict:
                    removed_count[randomer] += 1
                    continue

                result_dict[(read1.rname, start, stop, strand, randomer)] = (read1, read2)

        with pysam.Samfile(out_bam, 'wb', template=samfile1) as out_bam:
            for key, (read1, read2) in result_dict.items():
                out_bam.write(read1)
                out_bam.write(read2)
    return total_count, removed_count

if __name__ == "__main__":
    description=""""Paired End randomer aware duplciate removal algorithm."""
    usage="""Assumes paired end reads are adjectent to each other in output file (ie only provide unsorted bams)
             Also assumes no multimappers in the bam file, if there are multimappers behavior is undefined"""
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("-b", "--bam", dest="bam", help="bam file to barcode collapse")
    parser.add_option("-o", "--out_file", dest="out_file")
    parser.add_option("-m", "--metrics_file", dest="metrics_file")
    (options, args) = parser.parse_args()

    if not (options.bam.endswith(".bam")):
        raise TypeError("%s, not bam file" % options.bam)

    total_count, removed_count = barcode_collapse(options.bam, options.out_file)
    output_metrics(options.metrics_file, total_count, removed_count)

    sys.exit(0)