changeset 0:ac5584567084 draft

planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
author mvdbeek
date Mon, 20 Aug 2018 12:39:15 -0400
parents
children 3ed18dcb6f82
files alignment_size_distribution.xml size_distributions.py test-data/distribution.tab test-data/test_data.bam
diffstat 4 files changed, 117 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/alignment_size_distribution.xml	Mon Aug 20 12:39:15 2018 -0400
@@ -0,0 +1,28 @@
+<tool id="alignment_size_distribution" name="Create size distribution for alignment files" version="0.1.0">
+    <requirements>
+        <requirement type="package" version="6.7">click</requirement>
+        <requirement type="package" version="0.23">pandas</requirement>
+        <requirement type="package" version="0.15">pysam</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+        python '/size_distributions.py' '$alignment_file' --minimum_size $minimum_size --maximum_size $maximum_size $wide --output '$distribution'
+    ]]></command>
+    <inputs>
+        <param name="alignment_file" type="data" format="bam,unsorted.bam"/>
+        <param argument="--minimum_size" type="integer" value="18" min="1" label="Minimum read length to consider"/>
+        <param argument="--maximum_size" type="integer" value="18" min="1" label="Maximum read length to consider"/>
+        <param argument="--wide" type="boolean" checked="false" truevalue="--wide" label="Output wide instead of long tabular format"/>
+    </inputs>
+    <outputs>
+        <data name="distribution" format="tabular"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="alignment_file" value="test_data.bam" ftype="bam"/>
+            <output name="distribution" value="distribution.tab" ftype="tabular"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Generates a table with reference sequence, sense and antisense counts.
+    ]]></help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/size_distributions.py	Mon Aug 20 12:39:15 2018 -0400
@@ -0,0 +1,62 @@
+from collections import (
+    Counter,
+    OrderedDict
+)
+
+import click
+import pandas as pd
+import pysam
+
+
+def global_size_distribution(alignment_file, minimum_size=18, maximum_size=30):
+    reference_counters = OrderedDict()
+    for reference in alignment_file.references:
+        sense_counter = Counter()
+        antisense_counter = Counter()
+        for i in range(minimum_size, maximum_size + 1):
+            sense_counter[i] = 0
+            antisense_counter[i] = 0
+        reference_counters[reference] = {'sense': sense_counter,
+                                         'antisense': antisense_counter}
+    for i, read in enumerate(alignment_file):
+        if not read.is_unmapped:
+            reference = read.reference_name
+            readlength = read.query_alignment_length
+            if minimum_size - 1 < readlength < maximum_size + 1:
+                if read.is_reverse:
+                    reference_counters[reference]['antisense'][readlength] += 1
+                else:
+                    reference_counters[reference]['sense'][readlength] += 1
+    df = pd.Panel(reference_counters).to_frame()
+    df.index.names = ['readlength', 'orientation']
+    return df
+
+
+def to_long(df):
+    df = df.reset_index()
+    df = df.melt(id_vars=('readlength', 'orientation'))
+    df.columns = ['readlength', 'orientation', 'reference', 'count']
+    return df
+
+
+def write_table(df, output_path):
+    df.to_csv(output_path, sep="\t")
+
+
+@click.command()
+@click.argument('alignment_path', type=click.Path(exists=True))
+@click.option('--minimum_size', default=18, type=int, help="Minimum readlength to consider")
+@click.option('--maximum_size', default=30, type=int, help="Minimum readlength to consider")
+@click.option('--wide/--long', default=False, help="Output wide or long format.")
+@click.option('--output', default="/dev/stdout", help="Write to this file")
+def size_dist(alignment_path, minimum_size=18, maximum_size=30, output="/dev/stdout", wide=False):
+    """Calculate size distribution and orientation"""
+    with pysam.AlignmentFile(alignment_path) as alignment_file:
+        df = global_size_distribution(alignment_file, minimum_size, maximum_size)
+        if not wide:
+            df = to_long(df)
+        write_table(df, output)
+
+
+if __name__ == '__main__':
+    size_dist()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/distribution.tab	Mon Aug 20 12:39:15 2018 -0400
@@ -0,0 +1,27 @@
+readlength	orientation	FBgn0026065_Idefix
+18	antisense	6
+18	sense	3
+19	antisense	7
+19	sense	1
+20	antisense	6
+20	sense	4
+21	antisense	6
+21	sense	6
+22	antisense	5
+22	sense	2
+23	antisense	5
+23	sense	2
+24	antisense	2
+24	sense	3
+25	antisense	3
+25	sense	0
+26	antisense	3
+26	sense	4
+27	antisense	6
+27	sense	3
+28	antisense	2
+28	sense	2
+29	antisense	0
+29	sense	3
+30	antisense	0
+30	sense	1
Binary file test-data/test_data.bam has changed