diff size_distributions.py @ 0:ac5584567084 draft

planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
author mvdbeek
date Mon, 20 Aug 2018 12:39:15 -0400
parents
children 21b5a9170b90
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/size_distributions.py	Mon Aug 20 12:39:15 2018 -0400
@@ -0,0 +1,62 @@
+from collections import (
+    Counter,
+    OrderedDict
+)
+
+import click
+import pandas as pd
+import pysam
+
+
+def global_size_distribution(alignment_file, minimum_size=18, maximum_size=30):
+    reference_counters = OrderedDict()
+    for reference in alignment_file.references:
+        sense_counter = Counter()
+        antisense_counter = Counter()
+        for i in range(minimum_size, maximum_size + 1):
+            sense_counter[i] = 0
+            antisense_counter[i] = 0
+        reference_counters[reference] = {'sense': sense_counter,
+                                         'antisense': antisense_counter}
+    for i, read in enumerate(alignment_file):
+        if not read.is_unmapped:
+            reference = read.reference_name
+            readlength = read.query_alignment_length
+            if minimum_size - 1 < readlength < maximum_size + 1:
+                if read.is_reverse:
+                    reference_counters[reference]['antisense'][readlength] += 1
+                else:
+                    reference_counters[reference]['sense'][readlength] += 1
+    df = pd.Panel(reference_counters).to_frame()
+    df.index.names = ['readlength', 'orientation']
+    return df
+
+
+def to_long(df):
+    df = df.reset_index()
+    df = df.melt(id_vars=('readlength', 'orientation'))
+    df.columns = ['readlength', 'orientation', 'reference', 'count']
+    return df
+
+
+def write_table(df, output_path):
+    df.to_csv(output_path, sep="\t")
+
+
+@click.command()
+@click.argument('alignment_path', type=click.Path(exists=True))
+@click.option('--minimum_size', default=18, type=int, help="Minimum readlength to consider")
+@click.option('--maximum_size', default=30, type=int, help="Minimum readlength to consider")
+@click.option('--wide/--long', default=False, help="Output wide or long format.")
+@click.option('--output', default="/dev/stdout", help="Write to this file")
+def size_dist(alignment_path, minimum_size=18, maximum_size=30, output="/dev/stdout", wide=False):
+    """Calculate size distribution and orientation"""
+    with pysam.AlignmentFile(alignment_path) as alignment_file:
+        df = global_size_distribution(alignment_file, minimum_size, maximum_size)
+        if not wide:
+            df = to_long(df)
+        write_table(df, output)
+
+
+if __name__ == '__main__':
+    size_dist()