Mercurial > repos > dfornika > mob_suite
changeset 19:115b462224cf draft
planemo upload for repository https://github.com/phac-nml/mob-suite commit 8898f2229ec13917b7d96e20725f3871d9d93e90-dirty
author | dfornika |
---|---|
date | Fri, 28 Jun 2019 22:17:09 -0400 |
parents | dce4f8d7b19f |
children | 80feea4c7a68 |
files | distance_matrix_phylip.py distance_matrix_phylip.xml test-data/mash_dist_matrix.phy test-data/mash_dist_matrix.txt |
diffstat | 4 files changed, 116 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
"""Convert a tab-separated distance matrix into PHYLIP format on stdout.

The input file is read as tab-delimited text whose first row is a header and
whose first column holds the sample IDs; the remaining cells are parsed as
floating-point distances.
"""

import argparse
import sys
import csv
import numpy as np

from Bio.Phylo.TreeConstruction import DistanceMatrix, DistanceTreeConstructor


def process_input_matrix(input_matrix):
    """Build a BioPython DistanceMatrix from a parsed distance table.

    Args:
        input_matrix: list of rows (lists of strings). The first row is a
            header and is ignored; in every other row the first element is
            the sample ID and the remaining elements are distances.
            The argument is not modified.

    Returns:
        A ``DistanceMatrix`` whose names are the sample IDs and whose matrix
        is the lower triangle (diagonal included) of the input distances.

    Raises:
        ValueError: if a distance cannot be parsed as a float, the rows have
            differing lengths, or there are fewer distance columns than rows.
    """
    # Work on slices so the caller's rows are left untouched.
    data_rows = input_matrix[1:]
    sample_names = [row[0] for row in data_rows]
    distances = np.array(
        [[float(value) for value in row[1:]] for row in data_rows],
        dtype=float,
    )
    num_rows = len(data_rows)
    if num_rows and (distances.ndim != 2 or distances.shape[1] < num_rows):
        raise ValueError(
            "distance matrix must have at least as many distance columns "
            "as sample rows"
        )

    # DistanceMatrix() wants a ragged lower-triangular list of lists:
    #     [[0 1 2]             [[0],
    #      [1 0 1]    ---->     [1, 0],
    #      [2 1 0]]             [2, 1, 0]]
    # i.e. row i keeps its first i + 1 values.
    lower_triangle = [
        distances[i, :i + 1].tolist() for i in range(num_rows)
    ]

    return DistanceMatrix(names=sample_names, matrix=lower_triangle)


def main():
    """Read the matrix named by --input and write it to stdout as PHYLIP."""
    parser = argparse.ArgumentParser(
        description="Convert a tab-separated distance matrix to PHYLIP format."
    )
    parser.add_argument(
        "--input",
        dest="input",
        required=True,
        help="Tab-separated distance matrix with a header row and a leading "
             "sample-ID column",
    )
    args = parser.parse_args()

    # Close the file deterministically; drop blank lines, which csv.reader
    # yields as empty lists and which carry no sample.
    with open(args.input, "r") as handle:
        reader = csv.reader(handle, delimiter="\t")
        input_matrix = [row for row in reader if row]

    # Header row + fewer than 3 samples: don't build a matrix, just emit the
    # placeholder '();' and exit successfully.
    if len(input_matrix) < 4:
        print('();')
        sys.exit(0)

    distance_matrix = process_input_matrix(input_matrix)
    distance_matrix.format_phylip(sys.stdout)


if __name__ == '__main__':
    main()
<tool id="distance_matrix_phylip" name="Distance Matrix to Phylip" version="0.1.0">
    <description>Convert a tabular distance matrix to PHYLIP format</description>
    <requirements>
        <requirement type="package" version="1.73">biopython</requirement>
    </requirements>
    <command detect_errors="exit_code">
        <![CDATA[
            ## Run through the interpreter so the script does not depend on its
            ## executable bit, and quote the path in case it contains spaces.
            python '$__tool_directory__/distance_matrix_phylip.py'
                --input '${input}'
                > '${output}'
        ]]>
    </command>
    <inputs>
        <param name="input" type="data" format="tabular" label="Input" help="Tab-separated distance matrix with a header row and a leading sample-ID column"/>
    </inputs>
    <outputs>
        <data name="output" format="phylip"/>
    </outputs>
    <tests>
        <test>
            <param name="input" value="mash_dist_matrix.txt"/>
            <output name="output" file="mash_dist_matrix.phy"/>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**

Reads a tab-separated distance matrix (first row: header, first column:
sample IDs) and writes the same distances in PHYLIP format.

If the input contains fewer than three samples, the output contains only
``();``.
        ]]>
    </help>
    <citations>
    </citations>
</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mash_dist_matrix.phy Fri Jun 28 22:17:09 2019 -0400 @@ -0,0 +1,9 @@ + 8 +A 0.0000 0.0135 0.0119 0.0391 0.0439 0.0111 0.0029 0.0000 +B 0.0135 0.0000 0.0070 0.0348 0.0563 0.0081 0.0152 0.0136 +C 0.0119 0.0070 0.0000 0.0318 0.0591 0.0013 0.0149 0.0119 +D 0.0391 0.0348 0.0318 0.0000 0.0550 0.0307 0.0407 0.0392 +E 0.0439 0.0563 0.0591 0.0550 0.0000 0.0574 0.0384 0.0441 +F 0.0111 0.0081 0.0013 0.0307 0.0574 0.0000 0.0140 0.0111 +G 0.0029 0.0152 0.0149 0.0407 0.0384 0.0140 0.0000 0.0030 +H 0.0000 0.0136 0.0119 0.0392 0.0441 0.0111 0.0030 0.0000
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/mash_dist_matrix.txt Fri Jun 28 22:17:09 2019 -0400 @@ -0,0 +1,9 @@ +#query A B C D E F G H +A 0 0.0135016 0.0119383 0.0391011 0.0439391 0.0110882 0.00294054 2.38274E-05 +B 0.0135016 0 0.00698112 0.0348125 0.0563082 0.00808024 0.0152404 0.0135508 +C 0.0119383 0.00698112 0 0.0317837 0.0590885 0.00134007 0.014871 0.0119383 +D 0.0391011 0.0348125 0.0317837 0 0.0549966 0.030677 0.040724 0.0392331 +E 0.0439391 0.0563082 0.0590885 0.0549966 0 0.0574401 0.0384495 0.0440934 +F 0.0110882 0.00808024 0.00134007 0.030677 0.0574401 0 0.0139988 0.0111321 +G 0.00294054 0.0152404 0.014871 0.040724 0.0384495 0.0139988 0 0.00296902 +H 2.38274E-05 0.0135508 0.0119383 0.0392331 0.0440934 0.0111321 0.00296902 0