Mercurial > repos > mvdbeek > size_distribution
view size_distributions.py @ 4:f1eeaf42144b draft default tip
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit c8e0a703fcdff580ba0a0c5806a37c088c03ab7b-dirty
author | mvdbeek |
---|---|
date | Mon, 20 Aug 2018 14:46:57 -0400 |
parents | 21b5a9170b90 |
children |
line wrap: on
line source
from collections import ( Counter, OrderedDict ) import click import pandas as pd import pysam def global_size_distribution(alignment_file, minimum_size=18, maximum_size=30): reference_counters = OrderedDict() for reference in alignment_file.references: sense_counter = Counter() antisense_counter = Counter() for i in range(minimum_size, maximum_size + 1): sense_counter[i] = 0 antisense_counter[i] = 0 reference_counters[reference] = {'sense': sense_counter, 'antisense': antisense_counter} for i, read in enumerate(alignment_file): if not read.is_unmapped: reference = read.reference_name readlength = read.query_alignment_length if minimum_size - 1 < readlength < maximum_size + 1: if read.is_reverse: reference_counters[reference]['antisense'][readlength] += 1 else: reference_counters[reference]['sense'][readlength] += 1 df = pd.Panel(reference_counters).to_frame() df.index.names = ['readlength', 'orientation'] return i, df def to_long(df, total_count): df = df.reset_index() df = df.melt(id_vars=('readlength', 'orientation')) df.columns = ['readlength', 'orientation', 'reference', 'count'] df['TPM'] = df['count'] / total_count * 10 ** 6 return df def write_table(df, output_path): df.to_csv(output_path, sep="\t", index=False) @click.command() @click.argument('alignment_path', type=click.Path(exists=True)) @click.option('--minimum_size', default=18, type=int, help="Minimum readlength to consider") @click.option('--maximum_size', default=30, type=int, help="Minimum readlength to consider") @click.option('--wide/--long', default=False, help="Output wide or long format.") @click.option('--output', default="/dev/stdout", help="Write to this file") def size_dist(alignment_path, minimum_size=18, maximum_size=30, output="/dev/stdout", wide=False): """Calculate size distribution and orientation""" with pysam.AlignmentFile(alignment_path) as alignment_file: total_count, df = global_size_distribution(alignment_file, minimum_size, maximum_size) if not wide: df = to_long(df, total_count) write_table(df, output) if __name__ == '__main__': size_dist()