Mercurial > repos > dfornika > mob_suite
annotate distance_matrix_phylip.py @ 38:17a60dd45b31 draft
"planemo upload for repository https://github.com/phac-nml/mob-suite commit 608abbed8881523f97c0378e350f32243a754237-dirty"
| author | dfornika |
|---|---|
| date | Wed, 30 Oct 2019 23:38:12 -0400 |
| parents | 115b462224cf |
| children |
| rev | line source |
|---|---|
|
19
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
1 #!/usr/bin/env python |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
2 |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
3 import argparse |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
4 import sys |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
5 import csv |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
6 import numpy as np |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
7 |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
8 from Bio.Phylo.TreeConstruction import DistanceMatrix, DistanceTreeConstructor |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
9 |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
10 |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
def process_input_matrix(input_matrix):
    """Convert a parsed distance table into a BioPython DistanceMatrix.

    Args:
        input_matrix: A list of rows, each a list of strings. The first row
            is treated as a header and skipped. In every following row the
            first item is the sample ID and the remaining items are
            distances parseable by ``float``. The table is presumably a
            square, symmetric matrix with samples in the same order along
            rows and columns -- only the lower triangle is actually used.

    Returns:
        The ``DistanceMatrix`` built from the sample IDs and the lower
        triangle (diagonal included) of the distances.

    Raises:
        ValueError: If a distance cannot be parsed as a float, or a row
            holds fewer distances than its lower-triangle row requires.

    The caller's ``input_matrix`` is not modified.
    """
    # Slice instead of pop() so the header and the ID column are skipped
    # without mutating the caller's data.
    data_rows = input_matrix[1:]
    sample_names = [row[0] for row in data_rows]

    # DistanceMatrix() takes only the lower triangle of the matrix, so a
    # full matrix like this:   [[0 1 2]
    #                           [1 0 1]
    #                           [2 1 0]]
    # has to be passed as:     [[0],
    #                           [1, 0],
    #                           [2, 1, 0]]
    # i.e. row i keeps just its first i + 1 distances.
    lower_triangle = []
    for row_index, row in enumerate(data_rows):
        # Parse the whole row so a malformed value anywhere is still reported.
        distances = [float(value) for value in row[1:]]
        row_length = row_index + 1
        if len(distances) < row_length:
            raise ValueError(
                "row for sample {!r} has {} distances; expected at least {}".format(
                    row[0], len(distances), row_length
                )
            )
        lower_triangle.append(distances[:row_length])

    return DistanceMatrix(names=sample_names, matrix=lower_triangle)
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
52 |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
def main():
    """Read a tab-separated distance matrix and print it in PHYLIP format.

    The path of the input table is taken from the ``--input`` command-line
    option. The table is parsed as tab-delimited text, converted with
    ``process_input_matrix`` and written to standard output via the
    resulting object's ``format_phylip`` method.

    If the table has fewer than four rows (one header row plus three
    samples), an empty Newick tree ``();`` is printed instead and the
    process exits with status 0.
    """
    parser = argparse.ArgumentParser()
    # Required: without a path there is nothing to open, and argparse then
    # reports a usage error instead of open(None) raising a TypeError.
    parser.add_argument("--input", dest="input", required=True, help="")
    args = parser.parse_args()

    # Read everything up front inside a context manager so the file handle
    # is closed before any further processing.
    with open(args.input, "r") as input_file:
        input_matrix = list(csv.reader(input_file, delimiter="\t"))

    # Don't build a tree with fewer than 3 samples (header row + 3 sample
    # rows = 4 rows); just emit an empty Newick tree.
    if len(input_matrix) < 4:
        print('();')
        sys.exit(0)

    distance_matrix = process_input_matrix(input_matrix)
    distance_matrix.format_phylip(sys.stdout)
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
66 |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
67 |
|
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff
changeset
|
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
