changeset 19:115b462224cf draft

planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
author dfornika
date Fri, 28 Jun 2019 22:17:09 -0400
parents dce4f8d7b19f
children 80feea4c7a68
files distance_matrix_phylip.py distance_matrix_phylip.xml test-data/mash_dist_matrix.phy test-data/mash_dist_matrix.txt
diffstat 4 files changed, 116 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/distance_matrix_phylip.py	Fri Jun 28 22:17:09 2019 -0400
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+
+import argparse
+import sys
+import csv
+import numpy as np
+
+from Bio.Phylo.TreeConstruction import DistanceMatrix, DistanceTreeConstructor
+
+
+def process_input_matrix(input_matrix):
+    """ Converts an array-of-arrays containing sample IDs and distances
+        into a BioPython DistanceMatrix object
+    """
+    input_matrix.pop(0)
+    sample_names = [row[0] for row in input_matrix]
+    for row in input_matrix:
+        row.pop(0)
+    distance_matrix = []
+    for input_matrix_row in input_matrix:
+        distance_matrix.append([float(i) for i in input_matrix_row])
+    """ np.tril() converts a matrix like this: [[0 1 2]
+                                                [1 0 1]
+                                                [2 1 0]]
+        ...into this: [[0 0 0]
+                       [1 0 0]
+                       [2 1 0]]
+        ...but what we need to pass to DistanceMatrix() is this: [[0]
+                                                                  [1 0]
+                                                                  [2 1 0]]
+        ...so that's what the (somewhat cryptic) code below does.
+    """
+    distance_matrix = np.tril(np.array(distance_matrix))
+    num_rows = distance_matrix.shape[0]
+    """ indexing the distance matrix with tril_indices gives a flattened
+        lower triangle [0 1 0 2 1 0] that we need to regroup row by row
+        into [[0], [1, 0], [2, 1, 0]]
+    """
+    lower_triangular_idx_mask = np.tril_indices(num_rows)
+    linear_distance_matrix = distance_matrix[lower_triangular_idx_mask]
+    distance_matrix = []
+    min = 0
+    max = 1
+    for i in range(num_rows):
+        distance_matrix.append(linear_distance_matrix[min:max].tolist())
+        min = max
+        max = max + (i + 2) 
+
+    distance_matrix = DistanceMatrix(names=sample_names, matrix=distance_matrix)
+
+    return distance_matrix
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", dest="input", help="")
+    args = parser.parse_args()
+
+    reader = csv.reader(open(args.input, "r"), delimiter="\t")
+    input_matrix = list(reader)
+    # Fewer than 4 rows means a header plus fewer than 3 samples: skip the matrix and print the placeholder '();' instead
+    if len(input_matrix) < 4:
+      print('();')
+      sys.exit(0)
+    distance_matrix = process_input_matrix(input_matrix)
+    distance_matrix.format_phylip(sys.stdout)
+
+
+if __name__ == '__main__':
+    main()
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/distance_matrix_phylip.xml	Fri Jun 28 22:17:09 2019 -0400
@@ -0,0 +1,29 @@
+<tool id="distance_matrix_phylip" name="Distance Matrix to Phylip" version="0.1.0">
+  <description></description>
+  <requirements>
+     <requirement type="package" version="1.73">biopython</requirement>
+  </requirements>   
+  <command detect_errors="exit_code">
+  <![CDATA[  
+    $__tool_directory__/distance_matrix_phylip.py  
+    --input '${input}'
+    > '${output}'
+  ]]>  
+  </command>
+  <inputs>
+    <param name="input" type="data" format="tabular" label="Input" help=""/>
+  </inputs>
+  <outputs>
+    <data name="output" format="phylip"/> 
+  </outputs>
+  <tests>
+    <test>
+        <param name="input" value="mash_dist_matrix.txt"/>
+        <output name="output" file="mash_dist_matrix.phy"/>
+    </test>
+  </tests>
+  <help>
+  </help>
+  <citations>
+  </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mash_dist_matrix.phy	Fri Jun 28 22:17:09 2019 -0400
@@ -0,0 +1,9 @@
+    8
+A           0.0000  0.0135  0.0119  0.0391  0.0439  0.0111  0.0029  0.0000
+B           0.0135  0.0000  0.0070  0.0348  0.0563  0.0081  0.0152  0.0136
+C           0.0119  0.0070  0.0000  0.0318  0.0591  0.0013  0.0149  0.0119
+D           0.0391  0.0348  0.0318  0.0000  0.0550  0.0307  0.0407  0.0392
+E           0.0439  0.0563  0.0591  0.0550  0.0000  0.0574  0.0384  0.0441
+F           0.0111  0.0081  0.0013  0.0307  0.0574  0.0000  0.0140  0.0111
+G           0.0029  0.0152  0.0149  0.0407  0.0384  0.0140  0.0000  0.0030
+H           0.0000  0.0136  0.0119  0.0392  0.0441  0.0111  0.0030  0.0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mash_dist_matrix.txt	Fri Jun 28 22:17:09 2019 -0400
@@ -0,0 +1,9 @@
+#query	A	B	C	D	E	F	G	H
+A	0	0.0135016	0.0119383	0.0391011	0.0439391	0.0110882	0.00294054	2.38274E-05
+B	0.0135016	0	0.00698112	0.0348125	0.0563082	0.00808024	0.0152404	0.0135508
+C	0.0119383	0.00698112	0	0.0317837	0.0590885	0.00134007	0.014871	0.0119383
+D	0.0391011	0.0348125	0.0317837	0	0.0549966	0.030677	0.040724	0.0392331
+E	0.0439391	0.0563082	0.0590885	0.0549966	0	0.0574401	0.0384495	0.0440934
+F	0.0110882	0.00808024	0.00134007	0.030677	0.0574401	0	0.0139988	0.0111321
+G	0.00294054	0.0152404	0.014871	0.040724	0.0384495	0.0139988	0	0.00296902
+H	2.38274E-05	0.0135508	0.0119383	0.0392331	0.0440934	0.0111321	0.00296902	0