annotate distance_matrix_phylip.py @ 38:17a60dd45b31 draft

"planemo upload for repository https://github.com/phac-nml/mob-suite commit 608abbed8881523f97c0378e350f32243a754237-dirty"
author dfornika
date Wed, 30 Oct 2019 23:38:12 -0400
parents 115b462224cf
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
19
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
1 #!/usr/bin/env python
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
2
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
3 import argparse
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
4 import sys
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
5 import csv
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
6 import numpy as np
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
7
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
8 from Bio.Phylo.TreeConstruction import DistanceMatrix, DistanceTreeConstructor
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
9
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
10
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
def process_input_matrix(input_matrix):
    """Convert a tabular distance matrix into a BioPython DistanceMatrix.

    Args:
        input_matrix: list of rows (each a list of strings), e.g. the result
            of ``list(csv.reader(...))``. The first row is treated as a
            header and skipped. In every following row the first field is
            the sample name and the remaining fields are distances.

    Returns:
        A ``DistanceMatrix`` built from the sample names and the lower
        triangle (diagonal included) of the distance values.

    Raises:
        ValueError: if a distance field cannot be parsed as a float, or if a
            row holds too few distances to fill its lower-triangle row.

    The caller's ``input_matrix`` is not modified.
    """
    # Slice instead of pop() so the caller's list and its rows stay intact.
    data_rows = input_matrix[1:]
    sample_names = [row[0] for row in data_rows]

    # DistanceMatrix() expects a ragged lower-triangular list of lists:
    #
    #     [[0 1 2]                 [[0],
    #      [1 0 1]     becomes      [1, 0],
    #      [2 1 0]]                 [2, 1, 0]]
    #
    # i.e. row i keeps only its first i + 1 values.
    lower_triangle = []
    for row_idx, row in enumerate(data_rows):
        # Parse the whole row so a malformed value anywhere is reported.
        distances = [float(value) for value in row[1:]]
        needed = row_idx + 1
        if len(distances) < needed:
            raise ValueError(
                "row {} ({!r}) has {} distance value(s), expected at least {}".format(
                    row_idx + 1, row[0], len(distances), needed
                )
            )
        lower_triangle.append(distances[:needed])

    return DistanceMatrix(names=sample_names, matrix=lower_triangle)
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
52
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
def main():
    """Read a tab-separated distance matrix and write it to stdout as PHYLIP.

    The path is taken from the ``--input`` command-line option. When the
    file holds fewer than four rows, the literal ``();`` is printed and the
    process exits with status 0. Otherwise the rows are converted with
    ``process_input_matrix`` and the result is written with
    ``format_phylip``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", dest="input", help="")
    args = parser.parse_args()

    # Use a context manager so the file handle is closed once the rows
    # have been read, rather than leaking until interpreter exit.
    with open(args.input, "r") as input_file:
        input_matrix = list(csv.reader(input_file, delimiter="\t"))

    # Don't build a matrix for fewer than 3 samples: the first row is a
    # header, so fewer than 4 rows means fewer than 3 samples. Emit the
    # placeholder "();" and stop.
    if len(input_matrix) < 4:
        print('();')
        sys.exit(0)

    distance_matrix = process_input_matrix(input_matrix)
    distance_matrix.format_phylip(sys.stdout)
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
66
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
67
115b462224cf planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
dfornika
parents:
diff changeset
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()