Mercurial > repos > bgruening > sucos_clustering
comparison sucos.py @ 6:ba1a2eba3f3d draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
| author | bgruening |
|---|---|
| date | Wed, 14 Apr 2021 09:29:01 +0000 |
| parents | dbfcc048cbbc |
| children |
comparison
equal
deleted
inserted
replaced
| 5:aaa3f95add3e | 6:ba1a2eba3f3d |
|---|---|
| 7 GitHub: https://github.com/susanhleung/SuCOS | 7 GitHub: https://github.com/susanhleung/SuCOS |
| 8 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 | 8 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 |
| 9 """ | 9 """ |
| 10 | 10 |
| 11 from __future__ import print_function | 11 from __future__ import print_function |
| 12 import argparse, os, sys, gzip | 12 |
| 13 import argparse | |
| 14 import os | |
| 15 | |
| 13 import numpy as np | 16 import numpy as np |
| 14 from rdkit import Chem, rdBase, RDConfig | 17 import utils |
| 18 from rdkit import Chem, RDConfig | |
| 15 from rdkit.Chem import AllChem, rdShapeHelpers | 19 from rdkit.Chem import AllChem, rdShapeHelpers |
| 16 from rdkit.Chem.FeatMaps import FeatMaps | 20 from rdkit.Chem.FeatMaps import FeatMaps |
| 17 import utils | 21 |
| 18 | 22 # start function definitions ######################################### |
| 19 | |
| 20 ### start function definitions ######################################### | |
| 21 | 23 |
| 22 # Setting up the features to use in FeatureMap | 24 # Setting up the features to use in FeatureMap |
| 23 fdef = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')) | 25 fdef = AllChem.BuildFeatureFactory( |
| 26 os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef") | |
| 27 ) | |
| 24 | 28 |
| 25 fmParams = {} | 29 fmParams = {} |
| 26 for k in fdef.GetFeatureFamilies(): | 30 for k in fdef.GetFeatureFamilies(): |
| 27 fparams = FeatMaps.FeatMapParams() | 31 fparams = FeatMaps.FeatMapParams() |
| 28 fmParams[k] = fparams | 32 fmParams[k] = fparams |
| 29 | 33 |
| 30 keep = ('Donor', 'Acceptor', 'NegIonizable', 'PosIonizable', 'ZnBinder', | 34 keep = ( |
| 31 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe') | 35 "Donor", |
| 36 "Acceptor", | |
| 37 "NegIonizable", | |
| 38 "PosIonizable", | |
| 39 "ZnBinder", | |
| 40 "Aromatic", | |
| 41 "Hydrophobe", | |
| 42 "LumpedHydrophobe", | |
| 43 ) | |
| 44 | |
| 32 | 45 |
| 33 def filterFeature(f): | 46 def filterFeature(f): |
| 34 result = f.GetFamily() in keep | 47 result = f.GetFamily() in keep |
| 35 # TODO - nothing ever seems to be filtered. Is this expected? | 48 # TODO - nothing ever seems to be filtered. Is this expected? |
| 36 if not result: | 49 if not result: |
| 37 utils.log("Filtered out feature type", f.GetFamily()) | 50 utils.log("Filtered out feature type", f.GetFamily()) |
| 38 return result | 51 return result |
| 39 | 52 |
| 53 | |
| 40 def getRawFeatures(mol): | 54 def getRawFeatures(mol): |
| 41 | 55 |
| 42 rawFeats = fdef.GetFeaturesForMol(mol) | 56 rawFeats = fdef.GetFeaturesForMol(mol) |
| 43 # filter that list down to only include the ones we're interested in | 57 # filter that list down to only include the ones we're interested in |
| 44 filtered = list(filter(filterFeature, rawFeats)) | 58 filtered = list(filter(filterFeature, rawFeats)) |
| 45 return filtered | 59 return filtered |
| 46 | 60 |
| 47 def get_FeatureMapScore(small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): | 61 |
| 62 def get_FeatureMapScore( | |
| 63 small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All | |
| 64 ): | |
| 48 """ | 65 """ |
| 49 Generate the feature map score. | 66 Generate the feature map score. |
| 50 | 67 |
| 51 :param small_feats: | 68 :param small_feats: |
| 52 :param large_feats: | 69 :param large_feats: |
| 56 | 73 |
| 57 featLists = [] | 74 featLists = [] |
| 58 for rawFeats in [small_feats, large_feats]: | 75 for rawFeats in [small_feats, large_feats]: |
| 59 # filter that list down to only include the ones we're interested in | 76 # filter that list down to only include the ones we're interested in |
| 60 featLists.append(rawFeats) | 77 featLists.append(rawFeats) |
| 61 fms = [FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams) for x in featLists] | 78 fms = [ |
| 79 FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams) | |
| 80 for x in featLists | |
| 81 ] | |
| 62 # set the score mode | 82 # set the score mode |
| 63 fms[0].scoreMode = score_mode | 83 fms[0].scoreMode = score_mode |
| 64 | 84 |
| 65 try: | 85 try: |
| 66 if tani: | 86 if tani: |
| 67 c = fms[0].ScoreFeats(featLists[1]) | 87 c = fms[0].ScoreFeats(featLists[1]) |
| 68 A = fms[0].GetNumFeatures() | 88 A = fms[0].GetNumFeatures() |
| 69 B = len(featLists[1]) | 89 B = len(featLists[1]) |
| 70 if B != fms[1].GetNumFeatures(): | 90 if B != fms[1].GetNumFeatures(): |
| 71 utils.log("Why isn't B equal to number of features...?!") | 91 utils.log("Why isn't B equal to number of features...?!") |
| 72 tani_score = float(c) / (A+B-c) | 92 tani_score = float(c) / (A + B - c) |
| 73 return tani_score | 93 return tani_score |
| 74 else: | 94 else: |
| 75 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) | 95 fm_score = fms[0].ScoreFeats(featLists[1]) / min( |
| 96 fms[0].GetNumFeatures(), len(featLists[1]) | |
| 97 ) | |
| 76 return fm_score | 98 return fm_score |
| 77 except ZeroDivisionError: | 99 except ZeroDivisionError: |
| 78 utils.log("ZeroDivisionError") | 100 utils.log("ZeroDivisionError") |
| 79 return 0 | 101 return 0 |
| 80 | 102 |
| 81 if tani: | 103 if tani: |
| 82 tani_score = float(c) / (A+B-c) | 104 tani_score = float(c) / (A + B - c) |
| 83 return tani_score | 105 return tani_score |
| 84 else: | 106 else: |
| 85 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) | 107 fm_score = fms[0].ScoreFeats(featLists[1]) / min( |
| 108 fms[0].GetNumFeatures(), len(featLists[1]) | |
| 109 ) | |
| 86 return fm_score | 110 return fm_score |
| 87 | 111 |
| 88 | 112 |
| 89 def get_SucosScore(ref_mol, query_mol, tani=False, ref_features=None, query_features=None, score_mode=FeatMaps.FeatMapScoreMode.All): | 113 def get_SucosScore( |
| 114 ref_mol, | |
| 115 query_mol, | |
| 116 tani=False, | |
| 117 ref_features=None, | |
| 118 query_features=None, | |
| 119 score_mode=FeatMaps.FeatMapScoreMode.All, | |
| 120 ): | |
| 90 """ | 121 """ |
| 91 This is the key function that calculates the SuCOS scores and is expected to be called from other modules. | 122 This is the key function that calculates the SuCOS scores and is expected to be called from other modules. |
| 92 To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having | 123 To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having |
| 93 to recalculate them. Use the getRawFeatures function to pre-calculate the features. | 124 to recalculate them. Use the getRawFeatures function to pre-calculate the features. |
| 94 | 125 |
| 105 ref_features = getRawFeatures(ref_mol) | 136 ref_features = getRawFeatures(ref_mol) |
| 106 if not query_features: | 137 if not query_features: |
| 107 query_features = getRawFeatures(query_mol) | 138 query_features = getRawFeatures(query_mol) |
| 108 | 139 |
| 109 fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode) | 140 fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode) |
| 110 fm_score = np.clip(fm_score, 0, 1) | 141 fm_score = np.float(np.clip(fm_score, 0, 1)) |
| 111 | 142 |
| 112 try : | 143 try: |
| 113 if tani: | 144 if tani: |
| 114 tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol)) | 145 tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol)) |
| 115 tani_sim = np.clip(tani_sim, 0, 1) | 146 tani_sim = np.clip(tani_sim, 0, 1) |
| 116 SuCOS_score = 0.5*fm_score + 0.5*tani_sim | 147 SuCOS_score = 0.5 * fm_score + 0.5 * tani_sim |
| 117 return SuCOS_score, fm_score, tani_sim | 148 return SuCOS_score, fm_score, tani_sim |
| 118 else: | 149 else: |
| 119 protrude_dist = rdShapeHelpers.ShapeProtrudeDist(ref_mol, query_mol, allowReordering=False) | 150 protrude_dist = rdShapeHelpers.ShapeProtrudeDist( |
| 151 ref_mol, query_mol, allowReordering=False | |
| 152 ) | |
| 120 protrude_dist = np.clip(protrude_dist, 0, 1) | 153 protrude_dist = np.clip(protrude_dist, 0, 1) |
| 121 protrude_val = 1.0 - protrude_dist | 154 protrude_val = 1.0 - protrude_dist |
| 122 SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val | 155 SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val |
| 123 return SuCOS_score, fm_score, protrude_val | 156 return SuCOS_score, fm_score, protrude_val |
| 124 except: | 157 except Exception: |
| 125 utils.log("Failed to calculate SuCOS scores. Returning 0,0,0") | 158 utils.log("Failed to calculate SuCOS scores. Returning 0,0,0") |
| 126 return 0, 0, 0 | 159 return 0, 0, 0 |
| 127 | 160 |
| 128 def process(refmol_filename, inputs_filename, outputs_filename, refmol_index=None, | 161 |
| 129 refmol_format=None, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): | 162 def process( |
| 130 | 163 refmol_filename, |
| 131 ref_mol = utils.read_single_molecule(refmol_filename, index=refmol_index, format=refmol_format) | 164 inputs_filename, |
| 132 #utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms") | 165 outputs_filename, |
| 166 refmol_index=None, | |
| 167 refmol_format=None, | |
| 168 tani=False, | |
| 169 score_mode=FeatMaps.FeatMapScoreMode.All, | |
| 170 ): | |
| 171 | |
| 172 ref_mol = utils.read_single_molecule( | |
| 173 refmol_filename, index=refmol_index, format=refmol_format | |
| 174 ) | |
| 175 # utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms") | |
| 133 ref_features = getRawFeatures(ref_mol) | 176 ref_features = getRawFeatures(ref_mol) |
| 134 | 177 |
| 135 input_file = utils.open_file_for_reading(inputs_filename) | 178 input_file = utils.open_file_for_reading(inputs_filename) |
| 136 suppl = Chem.ForwardSDMolSupplier(input_file) | 179 suppl = Chem.ForwardSDMolSupplier(input_file) |
| 137 output_file = utils.open_file_for_writing(outputs_filename) | 180 output_file = utils.open_file_for_writing(outputs_filename) |
| 139 | 182 |
| 140 count = 0 | 183 count = 0 |
| 141 total = 0 | 184 total = 0 |
| 142 errors = 0 | 185 errors = 0 |
| 143 for mol in suppl: | 186 for mol in suppl: |
| 144 count +=1 | 187 count += 1 |
| 145 if mol is None: | 188 if mol is None: |
| 146 continue | 189 continue |
| 147 #utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms") | 190 # utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms") |
| 148 try: | 191 try: |
| 149 sucos_score, fm_score, val3 = get_SucosScore(ref_mol, mol, tani=tani, ref_features=ref_features, score_mode=score_mode) | 192 sucos_score, fm_score, val3 = get_SucosScore( |
| 193 ref_mol, | |
| 194 mol, | |
| 195 tani=tani, | |
| 196 ref_features=ref_features, | |
| 197 score_mode=score_mode, | |
| 198 ) | |
| 150 mol.SetDoubleProp("SuCOS_Score", sucos_score) | 199 mol.SetDoubleProp("SuCOS_Score", sucos_score) |
| 151 mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score) | 200 mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score) |
| 152 if tani: | 201 if tani: |
| 153 mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3) | 202 mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3) |
| 154 else: | 203 else: |
| 155 mol.SetDoubleProp("SuCOS_Protrude_Score", val3) | 204 mol.SetDoubleProp("SuCOS_Protrude_Score", val3) |
| 156 utils.log("Scores:", sucos_score, fm_score, val3) | 205 utils.log("Scores:", sucos_score, fm_score, val3) |
| 157 writer.write(mol) | 206 writer.write(mol) |
| 158 total +=1 | 207 total += 1 |
| 159 except ValueError as e: | 208 except ValueError as e: |
| 160 errors +=1 | 209 errors += 1 |
| 161 utils.log("Molecule", count, "failed to score:", e.message) | 210 utils.log("Molecule", count, "failed to score:", e.message) |
| 162 | 211 |
| 163 input_file.close() | 212 input_file.close() |
| 164 writer.flush() | 213 writer.flush() |
| 165 writer.close() | 214 writer.close() |
| 166 output_file.close() | 215 output_file.close() |
| 167 | 216 |
| 168 utils.log("Completed.", total, "processed, ", count, "succeeded, ", errors, "errors") | 217 utils.log( |
| 218 "Completed.", total, "processed, ", count, "succeeded, ", errors, "errors" | |
| 219 ) | |
| 220 | |
| 169 | 221 |
| 170 def parse_score_mode(value): | 222 def parse_score_mode(value): |
| 171 if value == None or value == 'all': | 223 if value is None or value == "all": |
| 172 return FeatMaps.FeatMapScoreMode.All | 224 return FeatMaps.FeatMapScoreMode.All |
| 173 elif value == 'closest': | 225 elif value == "closest": |
| 174 return FeatMaps.FeatMapScoreMode.Closest | 226 return FeatMaps.FeatMapScoreMode.Closest |
| 175 elif value == 'best': | 227 elif value == "best": |
| 176 return FeatMaps.FeatMapScoreMode.Best | 228 return FeatMaps.FeatMapScoreMode.Best |
| 177 else: | 229 else: |
| 178 raise ValueError(value + " is not a valid scoring mode option") | 230 raise ValueError(value + " is not a valid scoring mode option") |
| 179 | 231 |
| 180 | 232 |
| 181 ### start main execution ######################################### | 233 # start main execution ######################################### |
| 234 | |
| 182 | 235 |
| 183 def main(): | 236 def main(): |
| 184 | 237 |
| 185 parser = argparse.ArgumentParser(description='SuCOS with RDKit') | 238 parser = argparse.ArgumentParser(description="SuCOS with RDKit") |
| 186 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).') | 239 parser.add_argument( |
| 187 parser.add_argument('-r', '--refmol', help='Molecule to compare against in Molfile (.mol) or SDF (.sdf) format') | 240 "-i", "--input", help="Input file in SDF format. Can be gzipped (*.gz)." |
| 188 parser.add_argument('--refmol-format', help="Format for the reference molecule (mol or sdf). " + | 241 ) |
| 189 "Only needed if files don't have the expected extensions") | 242 parser.add_argument( |
| 190 parser.add_argument('--refmolidx', help='Reference molecule index in SD file if not the first', type=int, default=1) | 243 "-r", |
| 191 parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).') | 244 "--refmol", |
| 192 parser.add_argument('--tanimoto', action='store_true', help='Include Tanimoto distance in score') | 245 help="Molecule to compare against in Molfile (.mol) or SDF (.sdf) format", |
| 193 parser.add_argument('--score_mode', choices=['all', 'closest', 'best'], | 246 ) |
| 194 help="choose the scoring mode for the feature map, default is 'all'.") | 247 parser.add_argument( |
| 248 "--refmol-format", | |
| 249 help="Format for the reference molecule (mol or sdf). " | |
| 250 + "Only needed if files don't have the expected extensions", | |
| 251 ) | |
| 252 parser.add_argument( | |
| 253 "--refmolidx", | |
| 254 help="Reference molecule index in SD file if not the first", | |
| 255 type=int, | |
| 256 default=1, | |
| 257 ) | |
| 258 parser.add_argument( | |
| 259 "-o", "--output", help="Output file in SDF format. Can be gzipped (*.gz)." | |
| 260 ) | |
| 261 parser.add_argument( | |
| 262 "--tanimoto", action="store_true", help="Include Tanimoto distance in score" | |
| 263 ) | |
| 264 parser.add_argument( | |
| 265 "--score_mode", | |
| 266 choices=["all", "closest", "best"], | |
| 267 help="choose the scoring mode for the feature map, default is 'all'.", | |
| 268 ) | |
| 195 | 269 |
| 196 args = parser.parse_args() | 270 args = parser.parse_args() |
| 197 utils.log("SuCOS Args: ", args) | 271 utils.log("SuCOS Args: ", args) |
| 198 | 272 |
| 199 score_mode = parse_score_mode(args.score_mode) | 273 score_mode = parse_score_mode(args.score_mode) |
| 200 | 274 |
| 201 process(args.refmol, args.input, args.output, refmol_index=args.refmolidx, | 275 process( |
| 202 refmol_format=args.refmol_format, tani=args.tanimoto, score_mode=score_mode) | 276 args.refmol, |
| 277 args.input, | |
| 278 args.output, | |
| 279 refmol_index=args.refmolidx, | |
| 280 refmol_format=args.refmol_format, | |
| 281 tani=args.tanimoto, | |
| 282 score_mode=score_mode, | |
| 283 ) | |
| 203 | 284 |
| 204 | 285 |
| 205 if __name__ == "__main__": | 286 if __name__ == "__main__": |
| 206 main() | 287 main() |
