Mercurial > repos > mvdbeek > size_distribution
changeset 0:ac5584567084 draft
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
author | mvdbeek |
---|---|
date | Mon, 20 Aug 2018 12:39:15 -0400 |
parents | |
children | 3ed18dcb6f82 |
files | alignment_size_distribution.xml size_distributions.py test-data/distribution.tab test-data/test_data.bam |
diffstat | 4 files changed, 117 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/alignment_size_distribution.xml Mon Aug 20 12:39:15 2018 -0400 @@ -0,0 +1,28 @@ +<tool id="alignment_size_distribution" name="Create size distribution for alignment files" version="0.1.0"> + <requirements> + <requirement type="package" version="6.7">click</requirement> + <requirement type="package" version="0.23">pandas</requirement> + <requirement type="package" version="0.15">pysam</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + python '/size_distributions.py' '$alignment_file' --minimum_size $minimum_size --maximum_size $maximum_size $wide --output '$distribution' + ]]></command> + <inputs> + <param name="alignment_file" type="data" format="bam,unsorted.bam"/> + <param argument="--minimum_size" type="integer" value="18" min="1" label="Minimum read length to consider"/> + <param argument="--maximum_size" type="integer" value="18" min="1" label="Maximum read length to consider"/> + <param argument="--wide" type="boolean" checked="false" truevalue="--wide" label="Output wide instead of long tabular format"/> + </inputs> + <outputs> + <data name="distribution" format="tabular"/> + </outputs> + <tests> + <test> + <param name="alignment_file" value="test_data.bam" ftype="bam"/> + <output name="distribution" value="distribution.tab" ftype="tabular"/> + </test> + </tests> + <help><![CDATA[ +Generates a table with reference sequence, sense and antisense counts. + ]]></help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/size_distributions.py Mon Aug 20 12:39:15 2018 -0400 @@ -0,0 +1,62 @@ +from collections import ( + Counter, + OrderedDict +) + +import click +import pandas as pd +import pysam + + +def global_size_distribution(alignment_file, minimum_size=18, maximum_size=30): + reference_counters = OrderedDict() + for reference in alignment_file.references: + sense_counter = Counter() + antisense_counter = Counter() + for i in range(minimum_size, maximum_size + 1): + sense_counter[i] = 0 + antisense_counter[i] = 0 + reference_counters[reference] = {'sense': sense_counter, + 'antisense': antisense_counter} + for i, read in enumerate(alignment_file): + if not read.is_unmapped: + reference = read.reference_name + readlength = read.query_alignment_length + if minimum_size - 1 < readlength < maximum_size + 1: + if read.is_reverse: + reference_counters[reference]['antisense'][readlength] += 1 + else: + reference_counters[reference]['sense'][readlength] += 1 + df = pd.Panel(reference_counters).to_frame() + df.index.names = ['readlength', 'orientation'] + return df + + +def to_long(df): + df = df.reset_index() + df = df.melt(id_vars=('readlength', 'orientation')) + df.columns = ['readlength', 'orientation', 'reference', 'count'] + return df + + +def write_table(df, output_path): + df.to_csv(output_path, sep="\t") + + +@click.command() +@click.argument('alignment_path', type=click.Path(exists=True)) +@click.option('--minimum_size', default=18, type=int, help="Minimum readlength to consider") +@click.option('--maximum_size', default=30, type=int, help="Minimum readlength to consider") +@click.option('--wide/--long', default=False, help="Output wide or long format.") +@click.option('--output', default="/dev/stdout", help="Write to this file") +def size_dist(alignment_path, minimum_size=18, maximum_size=30, output="/dev/stdout", wide=False): + """Calculate size distribution and orientation""" + with pysam.AlignmentFile(alignment_path) as alignment_file: + df = global_size_distribution(alignment_file, minimum_size, maximum_size) + if not wide: + df = to_long(df) + write_table(df, output) + + +if __name__ == '__main__': + size_dist()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/distribution.tab Mon Aug 20 12:39:15 2018 -0400 @@ -0,0 +1,27 @@ +readlength orientation FBgn0026065_Idefix +18 antisense 6 +18 sense 3 +19 antisense 7 +19 sense 1 +20 antisense 6 +20 sense 4 +21 antisense 6 +21 sense 6 +22 antisense 5 +22 sense 2 +23 antisense 5 +23 sense 2 +24 antisense 2 +24 sense 3 +25 antisense 3 +25 sense 0 +26 antisense 3 +26 sense 4 +27 antisense 6 +27 sense 3 +28 antisense 2 +28 sense 2 +29 antisense 0 +29 sense 3 +30 antisense 0 +30 sense 1