view size_distributions.py @ 1:3ed18dcb6f82 draft

planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit a0c9f33672103cf29ad015477ebb898007ba9ae8
author mvdbeek
date Mon, 20 Aug 2018 13:00:14 -0400
parents ac5584567084
children 21b5a9170b90
line wrap: on
line source

from collections import (
    Counter,
    OrderedDict
)

import click
import pandas as pd
import pysam


def global_size_distribution(alignment_file, minimum_size=18, maximum_size=30):
    reference_counters = OrderedDict()
    for reference in alignment_file.references:
        sense_counter = Counter()
        antisense_counter = Counter()
        for i in range(minimum_size, maximum_size + 1):
            sense_counter[i] = 0
            antisense_counter[i] = 0
        reference_counters[reference] = {'sense': sense_counter,
                                         'antisense': antisense_counter}
    for i, read in enumerate(alignment_file):
        if not read.is_unmapped:
            reference = read.reference_name
            readlength = read.query_alignment_length
            if minimum_size - 1 < readlength < maximum_size + 1:
                if read.is_reverse:
                    reference_counters[reference]['antisense'][readlength] += 1
                else:
                    reference_counters[reference]['sense'][readlength] += 1
    df = pd.Panel(reference_counters).to_frame()
    df.index.names = ['readlength', 'orientation']
    return df


def to_long(df):
    df = df.reset_index()
    df = df.melt(id_vars=('readlength', 'orientation'))
    df.columns = ['readlength', 'orientation', 'reference', 'count']
    return df


def write_table(df, output_path):
    df.to_csv(output_path, sep="\t")


@click.command()
@click.argument('alignment_path', type=click.Path(exists=True))
@click.option('--minimum_size', default=18, type=int, help="Minimum readlength to consider")
@click.option('--maximum_size', default=30, type=int, help="Minimum readlength to consider")
@click.option('--wide/--long', default=False, help="Output wide or long format.")
@click.option('--output', default="/dev/stdout", help="Write to this file")
def size_dist(alignment_path, minimum_size=18, maximum_size=30, output="/dev/stdout", wide=False):
    """Calculate size distribution and orientation"""
    with pysam.AlignmentFile(alignment_path) as alignment_file:
        df = global_size_distribution(alignment_file, minimum_size, maximum_size)
        if not wide:
            df = to_long(df)
        write_table(df, output)


if __name__ == '__main__':
    size_dist()