Mercurial > repos > bgruening > sucos_clustering
comparison sucos_cluster.py @ 0:c0e3a335dbfc draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
| author | bgruening |
|---|---|
| date | Wed, 02 Oct 2019 13:00:09 -0400 |
| parents | |
| children | ba1a2eba3f3d |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c0e3a335dbfc |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Cluster a set of molecules based on their 3D overlays as determined by the SuCOS score. | |
| 4 | |
| 5 This will generate a set of SD files, one for each cluster of molecules (presumably corresponding to a | |
| 6 binding pocket in the protein target). | |
| 7 | |
| 8 | |
| 9 SuCOS is the work of Susan Leung. | |
| 10 GitHub: https://github.com/susanhleung/SuCOS | |
| 11 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 | |
| 12 """ | |
| 13 | |
| 14 import sucos, utils | |
| 15 import argparse, gzip | |
| 16 from rdkit import Chem | |
| 17 import numpy as np | |
| 18 import pandas as pd | |
| 19 from scipy.cluster.hierarchy import linkage, fcluster | |
| 20 | |
| 21 ### start main execution ######################################### | |
| 22 | |
| 23 | |
| 24 def calc_distance_matrix(mols): | |
| 25 """ | |
| 26 Calculate a full distance matrix for the given molecules. Identical molecules get a score of 0.0 with the maximum | |
| 27 distance possible being 1.0. | |
| 28 :param mols: A list of molecules. It must be possible to iterate through this list multiple times | |
| 29 :return: A NxN 2D array of distance scores, with N being the number of molecules in the input | |
| 30 """ | |
| 31 | |
| 32 # TODO - do we need to calculate both sides of the matrix? Tanimoto is supposed to be a symmetric distance measure, | |
| 33 # but the matrix that is generated does not seem to be symmetric. | |
| 34 | |
| 35 mol_fm_tuples = [] | |
| 36 for mol in mols: | |
| 37 features = sucos.getRawFeatures(mol) | |
| 38 mol_fm_tuples.append((mol, features)) | |
| 39 | |
| 40 matrix = [] | |
| 41 for tuple1 in mol_fm_tuples: | |
| 42 tmp = [] | |
| 43 for tuple2 in mol_fm_tuples: | |
| 44 if tuple1[0] == tuple2[0]: | |
| 45 tmp.append(0.0) | |
| 46 else: | |
| 47 #utils.log("Calculating SuCOS between", mol1, mol2) | |
| 48 sucos_score, fm_score, tani_score = sucos.get_SucosScore(tuple1[0], tuple2[0], | |
| 49 tani=True, ref_features=tuple1[1], query_features=tuple2[1]) | |
| 50 tmp.append(1.0 - sucos_score) | |
| 51 matrix.append(tmp) | |
| 52 | |
| 53 | |
| 54 return matrix | |
| 55 | |
| 56 | |
| 57 def cluster(matrix, threshold=0.8): | |
| 58 """ | |
| 59 Cluster the supplied distance matrix returning an array of clusters. | |
| 60 :param matrix: the distance matrix, as calculated with the calc_distance_matrix function. | |
| 61 :param threshold: The clustering cuttoff. The default of 0.8 is a reasonable value to use. | |
| 62 :return: An array of clusters, each cluster being an array of the indices from the matrix. | |
| 63 """ | |
| 64 | |
| 65 indexes = [x for x in range(0, len(matrix))] | |
| 66 cols = [x for x in range(0, len(matrix[0]))] | |
| 67 #utils.log("indexes", indexes) | |
| 68 #utils.log("cols", cols) | |
| 69 df = pd.DataFrame(matrix, columns=cols, index=indexes) | |
| 70 utils.log("DataFrame:", df.shape) | |
| 71 #utils.log(df) | |
| 72 indices = np.triu_indices(df.shape[0], k=1) | |
| 73 #utils.log("Indices:", indices) | |
| 74 t = np.array(df)[indices] | |
| 75 Z = linkage(t, 'average') | |
| 76 lig_clusters = [] | |
| 77 cluster_arr = fcluster(Z, t=threshold, criterion='distance') | |
| 78 for i in range(np.amax(cluster_arr)): | |
| 79 clus = df.columns[np.argwhere(cluster_arr==i+1)] | |
| 80 lig_clusters.append([x[0] for x in clus.tolist()]) | |
| 81 | |
| 82 utils.log("Clusters", lig_clusters) | |
| 83 return lig_clusters | |
| 84 | |
| 85 def write_clusters_to_sdfs(mols, clusters, basename, gzip=False): | |
| 86 """ | |
| 87 Write the molecules to SDF files, 1 file for each cluster. | |
| 88 :param mols The molecules to write: | |
| 89 :param clusters The clusters, as returned by the cluster function: | |
| 90 :param basename The basename for the file name. e.g. if basename is 'output' then files like | |
| 91 output1.sdf, output2.sdf will be written: | |
| 92 :param gzip Whether to gzip the output | |
| 93 :return: | |
| 94 """ | |
| 95 | |
| 96 i = 0 | |
| 97 for cluster in clusters: | |
| 98 i += 1 | |
| 99 filename = basename + str(i) + ".sdf" | |
| 100 if gzip: | |
| 101 filename += ".gz" | |
| 102 utils.log("Writing ", len(cluster), "molecules in cluster", i, "to file", filename) | |
| 103 output_file = utils.open_file_for_writing(filename) | |
| 104 writer = Chem.SDWriter(output_file) | |
| 105 for index in cluster: | |
| 106 mol = mols[index] | |
| 107 writer.write(mol) | |
| 108 writer.flush() | |
| 109 writer.close() | |
| 110 output_file.close() | |
| 111 | |
| 112 | |
| 113 | |
| 114 def main(): | |
| 115 parser = argparse.ArgumentParser(description='Clustering with SuCOS and RDKit') | |
| 116 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).') | |
| 117 parser.add_argument('-o', '--output', default="cluster", help="Base name for output files in SDF format. " + | |
| 118 "e.g. if value is 'output' then files like output1.sdf, output2.sdf will be created") | |
| 119 parser.add_argument('--gzip', action='store_true', help='Gzip the outputs generating files like output1.sdf.gz, output2.sdf.gz') | |
| 120 parser.add_argument('-t', '--threshold', type=float, default=0.8, help='Clustering threshold') | |
| 121 | |
| 122 args = parser.parse_args() | |
| 123 utils.log("SuCOS Cluster Args: ", args) | |
| 124 | |
| 125 input_file = utils.open_file_for_reading(args.input) | |
| 126 suppl = Chem.ForwardSDMolSupplier(input_file) | |
| 127 mols = list(suppl) | |
| 128 matrix = calc_distance_matrix(mols) | |
| 129 clusters = cluster(matrix, threshold=args.threshold) | |
| 130 write_clusters_to_sdfs(mols, clusters, args.output, gzip=args.gzip) | |
| 131 | |
| 132 | |
| 133 if __name__ == "__main__": | |
| 134 main() |
