Mercurial > repos > dfornika > mob_suite
annotate distance_matrix_phylip.py @ 22:41acae990b9b draft
planemo upload for repository https://github.com/phac-nml/mob-suite commit acb6fcb33c5a021b8e1f9946e28d443f6a494b1a-dirty
| author | dfornika | 
|---|---|
| date | Mon, 08 Jul 2019 20:02:27 -0400 | 
| parents | 115b462224cf | 
| children | 
| rev | line source | 
|---|---|
| 
19
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
1 #!/usr/bin/env python | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
2 | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
3 import argparse | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
4 import sys | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
5 import csv | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
6 import numpy as np | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
7 | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
8 from Bio.Phylo.TreeConstruction import DistanceMatrix, DistanceTreeConstructor | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
9 | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
10 | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
def process_input_matrix(input_matrix):
    """Convert a labelled distance table into a BioPython DistanceMatrix.

    Args:
        input_matrix: List of rows, each a list of strings. The first row is
            treated as a header and skipped. Every remaining row holds a
            sample ID followed by that sample's distances to the samples, in
            the same order as the rows. The argument is not modified.

    Returns:
        A ``DistanceMatrix`` whose names are the sample IDs and whose values
        are the lower triangle (diagonal included) of the input table.

    Raises:
        ValueError: If a lower-triangle value is not numeric, or if a row is
            too short to reach its own diagonal entry.
    """
    # Slice instead of pop()-ing so the caller's lists are left untouched.
    data_rows = input_matrix[1:]
    sample_names = [row[0] for row in data_rows]

    # DistanceMatrix() wants a ragged lower-triangular list of lists, i.e.
    # a square table such as
    #     [[0 1 2]
    #      [1 0 1]
    #      [2 1 0]]
    # has to be passed as
    #     [[0], [1, 0], [2, 1, 0]]
    # Row k (0-based) therefore keeps its first k + 1 distances, which sit in
    # columns 1 .. k + 1 because column 0 is the sample ID.
    lower_triangle = []
    for row_idx, row in enumerate(data_rows):
        row_end = row_idx + 2
        if len(row) < row_end:
            raise ValueError(
                "row %d (%r) has %d distance value(s); at least %d required"
                % (row_idx + 1, row[0], len(row) - 1, row_idx + 1)
            )
        lower_triangle.append([float(value) for value in row[1:row_end]])

    return DistanceMatrix(names=sample_names, matrix=lower_triangle)
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
52 | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
def main():
    """Read a tab-separated distance table and print it in PHYLIP format.

    The file named by ``--input`` is parsed as tab-delimited text, with the
    first row treated as a header. When the table has fewer than three
    sample rows, the placeholder ``();`` is printed and the process exits
    with status 0. Otherwise the table is converted with
    ``process_input_matrix`` and written to standard output via
    ``format_phylip``.
    """
    parser = argparse.ArgumentParser()
    # Without an input path there is nothing to do; let argparse report it
    # instead of failing later inside open().
    parser.add_argument("--input", dest="input", required=True, help="")
    args = parser.parse_args()

    # Read everything up front so the file handle is closed promptly.
    with open(args.input, "r") as input_file:
        input_matrix = list(csv.reader(input_file, delimiter="\t"))

    # Header row + fewer than 3 sample rows: don't attempt a conversion,
    # just emit the placeholder (looks like an empty Newick tree).
    if len(input_matrix) < 4:
        print('();')
        sys.exit(0)

    distance_matrix = process_input_matrix(input_matrix)
    distance_matrix.format_phylip(sys.stdout)
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
66 | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
67 | 
| 
 
115b462224cf
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
 
dfornika 
parents:  
diff
changeset
 | 
# Run the command-line entry point only when executed as a script,
# not when the module is imported.
if __name__ == "__main__":
    main()
