Mercurial > repos > mvdbeek > size_distribution
annotate size_distributions.py @ 0:ac5584567084 draft
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
author | mvdbeek |
---|---|
date | Mon, 20 Aug 2018 12:39:15 -0400 |
parents | |
children | 21b5a9170b90 |
rev | line source |
---|---|
0
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
1 from collections import ( |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
2 Counter, |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
3 OrderedDict |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
4 ) |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
5 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
6 import click |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
7 import pandas as pd |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
8 import pysam |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
9 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
10 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
11 def global_size_distribution(alignment_file, minimum_size=18, maximum_size=30): |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
12 reference_counters = OrderedDict() |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
13 for reference in alignment_file.references: |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
14 sense_counter = Counter() |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
15 antisense_counter = Counter() |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
16 for i in range(minimum_size, maximum_size + 1): |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
17 sense_counter[i] = 0 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
18 antisense_counter[i] = 0 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
19 reference_counters[reference] = {'sense': sense_counter, |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
20 'antisense': antisense_counter} |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
21 for i, read in enumerate(alignment_file): |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
22 if not read.is_unmapped: |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
23 reference = read.reference_name |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
24 readlength = read.query_alignment_length |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
25 if minimum_size - 1 < readlength < maximum_size + 1: |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
26 if read.is_reverse: |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
27 reference_counters[reference]['antisense'][readlength] += 1 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
28 else: |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
29 reference_counters[reference]['sense'][readlength] += 1 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
30 df = pd.Panel(reference_counters).to_frame() |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
31 df.index.names = ['readlength', 'orientation'] |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
32 return df |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
33 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
34 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
35 def to_long(df): |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
36 df = df.reset_index() |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
37 df = df.melt(id_vars=('readlength', 'orientation')) |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
38 df.columns = ['readlength', 'orientation', 'reference', 'count'] |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
39 return df |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
40 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
41 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
42 def write_table(df, output_path): |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
43 df.to_csv(output_path, sep="\t") |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
44 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
45 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
46 @click.command() |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
47 @click.argument('alignment_path', type=click.Path(exists=True)) |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
48 @click.option('--minimum_size', default=18, type=int, help="Minimum readlength to consider") |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
49 @click.option('--maximum_size', default=30, type=int, help="Minimum readlength to consider") |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
50 @click.option('--wide/--long', default=False, help="Output wide or long format.") |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
51 @click.option('--output', default="/dev/stdout", help="Write to this file") |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
52 def size_dist(alignment_path, minimum_size=18, maximum_size=30, output="/dev/stdout", wide=False): |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
53 """Calculate size distribution and orientation""" |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
54 with pysam.AlignmentFile(alignment_path) as alignment_file: |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
55 df = global_size_distribution(alignment_file, minimum_size, maximum_size) |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
56 if not wide: |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
57 df = to_long(df) |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
58 write_table(df, output) |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
59 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
60 |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
61 if __name__ == '__main__': |
ac5584567084
planemo upload for repository https://github.com/bardin-lab/smallRNA_tools commit 48d9f377e28d686c2e0d8eee6684e34c52b7b3cc
mvdbeek
parents:
diff
changeset
|
62 size_dist() |