annotate size_distributions.py @ 0:ac5584567084 draft

planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
author mvdbeek
date Mon, 20 Aug 2018 12:39:15 -0400
parents
children 21b5a9170b90
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
1 from collections import (
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
2 Counter,
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
3 OrderedDict
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
4 )
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
5
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
6 import click
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
7 import pandas as pd
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
8 import pysam
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
9
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
10
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
11 def global_size_distribution(alignment_file, minimum_size=18, maximum_size=30):
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
12 reference_counters = OrderedDict()
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
13 for reference in alignment_file.references:
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
14 sense_counter = Counter()
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
15 antisense_counter = Counter()
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
16 for i in range(minimum_size, maximum_size + 1):
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
17 sense_counter[i] = 0
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
18 antisense_counter[i] = 0
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
19 reference_counters[reference] = {'sense': sense_counter,
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
20 'antisense': antisense_counter}
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
21 for i, read in enumerate(alignment_file):
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
22 if not read.is_unmapped:
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
23 reference = read.reference_name
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
24 readlength = read.query_alignment_length
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
25 if minimum_size - 1 < readlength < maximum_size + 1:
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
26 if read.is_reverse:
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
27 reference_counters[reference]['antisense'][readlength] += 1
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
28 else:
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
29 reference_counters[reference]['sense'][readlength] += 1
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
30 df = pd.Panel(reference_counters).to_frame()
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
31 df.index.names = ['readlength', 'orientation']
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
32 return df
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
33
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
34
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
35 def to_long(df):
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
36 df = df.reset_index()
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
37 df = df.melt(id_vars=('readlength', 'orientation'))
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
38 df.columns = ['readlength', 'orientation', 'reference', 'count']
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
39 return df
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
40
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
41
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
42 def write_table(df, output_path):
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
43 df.to_csv(output_path, sep="\t")
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
44
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
45
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
46 @click.command()
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
47 @click.argument('alignment_path', type=click.Path(exists=True))
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
48 @click.option('--minimum_size', default=18, type=int, help="Minimum readlength to consider")
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
49 @click.option('--maximum_size', default=30, type=int, help="Minimum readlength to consider")
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
50 @click.option('--wide/--long', default=False, help="Output wide or long format.")
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
51 @click.option('--output', default="/dev/stdout", help="Write to this file")
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
52 def size_dist(alignment_path, minimum_size=18, maximum_size=30, output="/dev/stdout", wide=False):
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
53 """Calculate size distribution and orientation"""
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
54 with pysam.AlignmentFile(alignment_path) as alignment_file:
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
55 df = global_size_distribution(alignment_file, minimum_size, maximum_size)
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
56 if not wide:
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
57 df = to_long(df)
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
58 write_table(df, output)
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
59
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
60
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
61 if __name__ == '__main__':
ac5584567084 planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff changeset
62 size_dist()