Mercurial > repos > bgruening > sucos_clustering
comparison sucos.py @ 0:c0e3a335dbfc draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
| author | bgruening |
|---|---|
| date | Wed, 02 Oct 2019 13:00:09 -0400 |
| parents | |
| children | dbfcc048cbbc |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c0e3a335dbfc |
|---|---|
| 1 #!/usr/bin/env python | |
| 2 """ | |
| 3 Basic SuCOS scoring. Allows a set of molecules from a SD file to be overlayed to a reference molecule, | |
| 4 with the resulting scores being written as properties in the output SD file. | |
| 5 | |
| 6 SuCOS is the work of Susan Leung. | |
| 7 GitHub: https://github.com/susanhleung/SuCOS | |
| 8 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 | |
| 9 """ | |
| 10 | |
| 11 from __future__ import print_function | |
| 12 import argparse, os, sys, gzip | |
| 13 import numpy as np | |
| 14 from rdkit import Chem, rdBase, RDConfig | |
| 15 from rdkit.Chem import AllChem, rdShapeHelpers | |
| 16 from rdkit.Chem.FeatMaps import FeatMaps | |
| 17 import utils | |
| 18 | |
| 19 | |
| 20 ### start function definitions ######################################### | |
| 21 | |
| 22 # Setting up the features to use in FeatureMap | |
| 23 fdef = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')) | |
| 24 | |
| 25 fmParams = {} | |
| 26 for k in fdef.GetFeatureFamilies(): | |
| 27 fparams = FeatMaps.FeatMapParams() | |
| 28 fmParams[k] = fparams | |
| 29 | |
| 30 keep = ('Donor', 'Acceptor', 'NegIonizable', 'PosIonizable', 'ZnBinder', | |
| 31 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe') | |
| 32 | |
| 33 def filterFeature(f): | |
| 34 result = f.GetFamily() in keep | |
| 35 # TODO - nothing ever seems to be filtered. Is this expected? | |
| 36 if not result: | |
| 37 utils.log("Filtered out feature type", f.GetFamily()) | |
| 38 return result | |
| 39 | |
| 40 def getRawFeatures(mol): | |
| 41 | |
| 42 rawFeats = fdef.GetFeaturesForMol(mol) | |
| 43 # filter that list down to only include the ones we're interested in | |
| 44 filtered = list(filter(filterFeature, rawFeats)) | |
| 45 return filtered | |
| 46 | |
| 47 def get_FeatureMapScore(small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): | |
| 48 """ | |
| 49 Generate the feature map score. | |
| 50 | |
| 51 :param small_feats: | |
| 52 :param large_feats: | |
| 53 :param tani: | |
| 54 :return: | |
| 55 """ | |
| 56 | |
| 57 featLists = [] | |
| 58 for rawFeats in [small_feats, large_feats]: | |
| 59 # filter that list down to only include the ones we're interested in | |
| 60 featLists.append(rawFeats) | |
| 61 fms = [FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams) for x in featLists] | |
| 62 # set the score mode | |
| 63 fms[0].scoreMode = score_mode | |
| 64 | |
| 65 try: | |
| 66 if tani: | |
| 67 c = fms[0].ScoreFeats(featLists[1]) | |
| 68 A = fms[0].GetNumFeatures() | |
| 69 B = len(featLists[1]) | |
| 70 if B != fms[1].GetNumFeatures(): | |
| 71 utils.log("Why isn't B equal to number of features...?!") | |
| 72 tani_score = float(c) / (A+B-c) | |
| 73 return tani_score | |
| 74 else: | |
| 75 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) | |
| 76 return fm_score | |
| 77 except ZeroDivisionError: | |
| 78 utils.log("ZeroDivisionError") | |
| 79 return 0 | |
| 80 | |
| 81 if tani: | |
| 82 tani_score = float(c) / (A+B-c) | |
| 83 return tani_score | |
| 84 else: | |
| 85 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) | |
| 86 return fm_score | |
| 87 | |
| 88 | |
| 89 def get_SucosScore(ref_mol, query_mol, tani=False, ref_features=None, query_features=None, score_mode=FeatMaps.FeatMapScoreMode.All): | |
| 90 """ | |
| 91 This is the key function that calculates the SuCOS scores and is expected to be called from other modules. | |
| 92 To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having | |
| 93 to recalculate them. Use the getRawFeatures function to pre-calculate the features. | |
| 94 | |
| 95 :param ref_mol: The reference molecule to compare to | |
| 96 :param query_mol: The molecule to align to the reference | |
| 97 :param tani: Whether to calculate Tanimoto distances | |
| 98 :param ref_features: An optional feature map for the reference molecule, avoiding the need to re-calculate it. | |
| 99 :param query_features: An optional feature map for the query molecule, avoiding the need to re-calculate it. | |
| 100 :return: A tuple of 3 values. 1 the sucos score, 2 the feature map score, | |
| 101 3 the Tanimoto distance or 1 minus the protrude distance | |
| 102 """ | |
| 103 | |
| 104 if not ref_features: | |
| 105 ref_features = getRawFeatures(ref_mol) | |
| 106 if not query_features: | |
| 107 query_features = getRawFeatures(query_mol) | |
| 108 | |
| 109 fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode) | |
| 110 fm_score = np.clip(fm_score, 0, 1) | |
| 111 | |
| 112 if tani: | |
| 113 tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol)) | |
| 114 tani_sim = np.clip(tani_sim, 0, 1) | |
| 115 SuCOS_score = 0.5*fm_score + 0.5*tani_sim | |
| 116 return SuCOS_score, fm_score, tani_sim | |
| 117 else: | |
| 118 protrude_dist = rdShapeHelpers.ShapeProtrudeDist(ref_mol, query_mol, allowReordering=False) | |
| 119 protrude_dist = np.clip(protrude_dist, 0, 1) | |
| 120 protrude_val = 1.0 - protrude_dist | |
| 121 SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val | |
| 122 return SuCOS_score, fm_score, protrude_val | |
| 123 | |
| 124 def process(refmol_filename, inputs_filename, outputs_filename, refmol_index=None, | |
| 125 refmol_format=None, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): | |
| 126 | |
| 127 ref_mol = utils.read_single_molecule(refmol_filename, index=refmol_index, format=refmol_format) | |
| 128 #utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms") | |
| 129 ref_features = getRawFeatures(ref_mol) | |
| 130 | |
| 131 input_file = utils.open_file_for_reading(inputs_filename) | |
| 132 suppl = Chem.ForwardSDMolSupplier(input_file) | |
| 133 output_file = utils.open_file_for_writing(outputs_filename) | |
| 134 writer = Chem.SDWriter(output_file) | |
| 135 | |
| 136 count = 0 | |
| 137 total = 0 | |
| 138 errors = 0 | |
| 139 for mol in suppl: | |
| 140 count +=1 | |
| 141 if mol is None: | |
| 142 continue | |
| 143 #utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms") | |
| 144 try: | |
| 145 sucos_score, fm_score, val3 = get_SucosScore(ref_mol, mol, tani=tani, ref_features=ref_features, score_mode=score_mode) | |
| 146 mol.SetDoubleProp("SuCOS_Score", sucos_score) | |
| 147 mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score) | |
| 148 if tani: | |
| 149 mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3) | |
| 150 else: | |
| 151 mol.SetDoubleProp("SuCOS_Protrude_Score", val3) | |
| 152 utils.log("Scores:", sucos_score, fm_score, val3) | |
| 153 writer.write(mol) | |
| 154 total +=1 | |
| 155 except ValueError as e: | |
| 156 errors +=1 | |
| 157 utils.log("Molecule", count, "failed to score:", e.message) | |
| 158 | |
| 159 input_file.close() | |
| 160 writer.flush() | |
| 161 writer.close() | |
| 162 output_file.close() | |
| 163 | |
| 164 utils.log("Completed.", total, "processed, ", count, "succeeded, ", errors, "errors") | |
| 165 | |
| 166 def parse_score_mode(value): | |
| 167 if value == None or value == 'all': | |
| 168 return FeatMaps.FeatMapScoreMode.All | |
| 169 elif value == 'closest': | |
| 170 return FeatMaps.FeatMapScoreMode.Closest | |
| 171 elif value == 'best': | |
| 172 return FeatMaps.FeatMapScoreMode.Best | |
| 173 else: | |
| 174 raise ValueError(value + " is not a valid scoring mode option") | |
| 175 | |
| 176 | |
| 177 ### start main execution ######################################### | |
| 178 | |
| 179 def main(): | |
| 180 | |
| 181 parser = argparse.ArgumentParser(description='SuCOS with RDKit') | |
| 182 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).') | |
| 183 parser.add_argument('-r', '--refmol', help='Molecule to compare against in Molfile (.mol) or SDF (.sdf) format') | |
| 184 parser.add_argument('--refmol-format', help="Format for the reference molecule (mol or sdf). " + | |
| 185 "Only needed if files don't have the expected extensions") | |
| 186 parser.add_argument('--refmolidx', help='Reference molecule index in SD file if not the first', type=int, default=1) | |
| 187 parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).') | |
| 188 parser.add_argument('--tanimoto', action='store_true', help='Include Tanimoto distance in score') | |
| 189 parser.add_argument('--score_mode', choices=['all', 'closest', 'best'], | |
| 190 help="choose the scoring mode for the feature map, default is 'all'.") | |
| 191 | |
| 192 args = parser.parse_args() | |
| 193 utils.log("SuCOS Args: ", args) | |
| 194 | |
| 195 score_mode = parse_score_mode(args.score_mode) | |
| 196 | |
| 197 process(args.refmol, args.input, args.output, refmol_index=args.refmolidx, | |
| 198 refmol_format=args.refmol_format, tani=args.tanimoto, score_mode=score_mode) | |
| 199 | |
| 200 | |
| 201 if __name__ == "__main__": | |
| 202 main() |
