Mercurial > repos > bgruening > enumerate_charges
comparison dimorphite_dl.py @ 0:759010d2e9cd draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 20df7e562341cd30e89a14d6bde9054956fadc06"
| author | bgruening |
|---|---|
| date | Tue, 10 Mar 2020 16:55:50 +0000 |
| parents | |
| children | 9a2b0af78abc |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:759010d2e9cd |
|---|---|
| 1 # Copyright 2018 Jacob D. Durrant | |
| 2 # | |
| 3 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 # you may not use this file except in compliance with the License. | |
| 5 # You may obtain a copy of the License at | |
| 6 # | |
| 7 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 # | |
| 9 # Unless required by applicable law or agreed to in writing, software | |
| 10 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 # See the License for the specific language governing permissions and | |
| 13 # limitations under the License. | |
| 14 | |
| 15 """ | |
| 16 This script identifies and enumerates the possible protonation sites of SMILES | |
| 17 strings. | |
| 18 """ | |
| 19 | |
| 20 from __future__ import print_function | |
| 21 import copy | |
| 22 import os | |
| 23 import argparse | |
| 24 import sys | |
| 25 | |
| 26 try: | |
| 27 # Python2 | |
| 28 from StringIO import StringIO | |
| 29 except ImportError: | |
| 30 # Python3 | |
| 31 from io import StringIO | |
| 32 | |
| 33 # Always let the user know a help file is available. | |
| 34 print("\nFor help, use: python dimorphite_dl.py --help") | |
| 35 | |
| 36 # And always report citation information. | |
| 37 print("\nIf you use Dimorphite-DL in your research, please cite:") | |
| 38 print("Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An") | |
| 39 print("open-source program for enumerating the ionization states of drug-like small") | |
| 40 print("molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.\n") | |
| 41 | |
| 42 try: | |
| 43 import rdkit | |
| 44 from rdkit import Chem | |
| 45 from rdkit.Chem import AllChem | |
| 46 except: | |
| 47 msg = "Dimorphite-DL requires RDKit. See https://www.rdkit.org/" | |
| 48 print(msg) | |
| 49 raise Exception(msg) | |
| 50 | |
| 51 def main(params=None): | |
| 52 """The main definition run when you call the script from the commandline. | |
| 53 | |
| 54 :param params: The parameters to use. Entirely optional. If absent, | |
| 55 defaults to None, in which case argments will be taken from | |
| 56 those given at the command line. | |
| 57 :param params: dict, optional | |
| 58 :return: Returns a list of the SMILES strings return_as_list parameter is | |
| 59 True. Otherwise, returns None. | |
| 60 """ | |
| 61 | |
| 62 parser = ArgParseFuncs.get_args() | |
| 63 args = vars(parser.parse_args()) | |
| 64 | |
| 65 # Add in any parameters in params. | |
| 66 if params is not None: | |
| 67 for k, v in params.items(): | |
| 68 args[k] = v | |
| 69 | |
| 70 # If being run from the command line, print out all parameters. | |
| 71 if __name__ == "__main__": | |
| 72 print("\nPARAMETERS:\n") | |
| 73 for k in sorted(args.keys()): | |
| 74 print(k.rjust(13) + ": " + str(args[k])) | |
| 75 print("") | |
| 76 | |
| 77 if args["test"]: | |
| 78 # Run tests. | |
| 79 TestFuncs.test() | |
| 80 else: | |
| 81 # Run protonation | |
| 82 if "output_file" in args and args["output_file"] is not None: | |
| 83 # An output file was specified, so write to that. | |
| 84 with open(args["output_file"], "w") as file: | |
| 85 for protonated_smi in Protonate(args): | |
| 86 file.write(protonated_smi + "\n") | |
| 87 elif "return_as_list" in args and args["return_as_list"] == True: | |
| 88 return list(Protonate(args)) | |
| 89 else: | |
| 90 # No output file specified. Just print it to the screen. | |
| 91 for protonated_smi in Protonate(args): | |
| 92 print(protonated_smi) | |
| 93 | |
| 94 class MyParser(argparse.ArgumentParser): | |
| 95 """Overwrite default parse so it displays help file on error. See | |
| 96 https://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu""" | |
| 97 | |
| 98 def error(self, message): | |
| 99 """Overwrites the default error message. | |
| 100 | |
| 101 :param message: The default error message. | |
| 102 """ | |
| 103 | |
| 104 self.print_help() | |
| 105 msg = "ERROR: %s\n\n" % message | |
| 106 print(msg) | |
| 107 raise Exception(msg) | |
| 108 | |
| 109 def print_help(self, file=None): | |
| 110 """Overwrite the default print_help function | |
| 111 | |
| 112 :param file: Output file, defaults to None | |
| 113 """ | |
| 114 | |
| 115 print("") | |
| 116 | |
| 117 if file is None: | |
| 118 file = sys.stdout | |
| 119 self._print_message(self.format_help(), file) | |
| 120 print(""" | |
| 121 examples: | |
| 122 python dimorphite_dl.py --smiles_file sample_molecules.smi | |
| 123 python dimorphite_dl.py --smiles "CCC(=O)O" --min_ph -3.0 --max_ph -2.0 | |
| 124 python dimorphite_dl.py --smiles "CCCN" --min_ph -3.0 --max_ph -2.0 --output_file output.smi | |
| 125 python dimorphite_dl.py --smiles_file sample_molecules.smi --pka_precision 2.0 --label_states | |
| 126 python dimorphite_dl.py --test""") | |
| 127 print("") | |
| 128 | |
| 129 class ArgParseFuncs: | |
| 130 """A namespace for storing functions that are useful for processing | |
| 131 command-line arguments. To keep things organized.""" | |
| 132 | |
| 133 @staticmethod | |
| 134 def get_args(): | |
| 135 """Gets the arguments from the command line. | |
| 136 | |
| 137 :return: A parser object. | |
| 138 """ | |
| 139 | |
| 140 parser = MyParser(description="Dimorphite 1.2: Creates models of " + | |
| 141 "appropriately protonated small moleucles. " + | |
| 142 "Apache 2.0 License. Copyright 2018 Jacob D. " + | |
| 143 "Durrant.") | |
| 144 parser.add_argument('--min_ph', metavar='MIN', type=float, default=6.4, | |
| 145 help='minimum pH to consider (default: 6.4)') | |
| 146 parser.add_argument('--max_ph', metavar='MAX', type=float, default=8.4, | |
| 147 help='maximum pH to consider (default: 8.4)') | |
| 148 parser.add_argument('--pka_precision', metavar='PRE', type=float, default=1.0, | |
| 149 help='pKa precision factor (number of standard devations, default: 1.0)') | |
| 150 parser.add_argument('--smiles', metavar='SMI', type=str, | |
| 151 help='SMILES string to protonate') | |
| 152 parser.add_argument('--smiles_file', metavar="FILE", type=str, | |
| 153 help='file that contains SMILES strings to protonate') | |
| 154 parser.add_argument('--output_file', metavar="FILE", type=str, | |
| 155 help='output file to write protonated SMILES (optional)') | |
| 156 parser.add_argument('--label_states', action="store_true", | |
| 157 help='label protonated SMILES with target state ' + \ | |
| 158 '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").') | |
| 159 parser.add_argument('--test', action="store_true", | |
| 160 help='run unit tests (for debugging)') | |
| 161 | |
| 162 return parser | |
| 163 | |
| 164 @staticmethod | |
| 165 def clean_args(args): | |
| 166 """Cleans and normalizes input parameters | |
| 167 | |
| 168 :param args: A dictionary containing the arguments. | |
| 169 :type args: dict | |
| 170 :raises Exception: No SMILES in params. | |
| 171 """ | |
| 172 | |
| 173 defaults = {'min_ph' : 6.4, | |
| 174 'max_ph' : 8.4, | |
| 175 'pka_precision' : 1.0, | |
| 176 'label_states' : False, | |
| 177 'test' : False} | |
| 178 | |
| 179 for key in defaults: | |
| 180 if key not in args: | |
| 181 args[key] = defaults[key] | |
| 182 | |
| 183 keys = list(args.keys()) | |
| 184 for key in keys: | |
| 185 if args[key] is None: | |
| 186 del args[key] | |
| 187 | |
| 188 if not "smiles" in args and not "smiles_file" in args: | |
| 189 msg = "Error: No SMILES in params. Use the -h parameter for help." | |
| 190 print(msg) | |
| 191 raise Exception(msg) | |
| 192 | |
| 193 # If the user provides a smiles string, turn it into a file-like StringIO | |
| 194 # object. | |
| 195 if "smiles" in args: | |
| 196 if isinstance(args["smiles"], str): | |
| 197 args["smiles_file"] = StringIO(args["smiles"]) | |
| 198 | |
| 199 args["smiles_and_data"] = LoadSMIFile(args["smiles_file"]) | |
| 200 | |
| 201 return args | |
| 202 | |
| 203 class UtilFuncs: | |
| 204 """A namespace to store functions for manipulating mol objects. To keep | |
| 205 things organized.""" | |
| 206 | |
| 207 @staticmethod | |
| 208 def neutralize_mol(mol): | |
| 209 """All molecules should be neuralized to the extent possible. The user | |
| 210 should not be allowed to specify the valence of the atoms in most cases. | |
| 211 | |
| 212 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol objet to be neutralized. | |
| 213 :return: The neutralized Mol object. | |
| 214 """ | |
| 215 | |
| 216 # Get the reaction data | |
| 217 rxn_data = [ | |
| 218 ['[Ov1-1:1]', '[Ov2+0:1]-[H]'], # To handle O- bonded to only one atom (add hydrogen). | |
| 219 ['[#7v4+1:1]-[H]', '[#7v3+0:1]'], # To handle N+ bonded to a hydrogen (remove hydrogen). | |
| 220 ['[Ov2-:1]', '[Ov2+0:1]'], # To handle O- bonded to two atoms. Should not be Negative. | |
| 221 ['[#7v3+1:1]', '[#7v3+0:1]'], # To handle N+ bonded to three atoms. Should not be positive. | |
| 222 ['[#7v2-1:1]', '[#7+0:1]-[H]'], # To handle N- Bonded to two atoms. Add hydrogen. | |
| 223 # ['[N:1]=[N+0:2]=[N:3]-[H]', '[N:1]=[N+1:2]=[N+0:3]-[H]'], # To | |
| 224 # handle bad azide. Must be protonated. (Now handled elsewhere, before | |
| 225 # SMILES converted to Mol object.) | |
| 226 ['[H]-[N:1]-[N:2]#[N:3]', '[N:1]=[N+1:2]=[N:3]-[H]'] # To handle bad azide. R-N-N#N should be R-N=[N+]=N | |
| 227 ] | |
| 228 | |
| 229 # Add substructures and reactions (initially none) | |
| 230 for i, rxn_datum in enumerate(rxn_data): | |
| 231 rxn_data[i].append(Chem.MolFromSmarts(rxn_datum[0])) | |
| 232 rxn_data[i].append(None) | |
| 233 | |
| 234 # Add hydrogens (respects valence, so incomplete). | |
| 235 # Chem.calcImplicitValence(mol) | |
| 236 mol.UpdatePropertyCache(strict=False) | |
| 237 mol = Chem.AddHs(mol) | |
| 238 | |
| 239 while True: # Keep going until all these issues have been resolved. | |
| 240 current_rxn = None # The reaction to perform. | |
| 241 current_rxn_str = None | |
| 242 | |
| 243 for i, rxn_datum in enumerate(rxn_data): | |
| 244 reactant_smarts, product_smarts, substruct_match_mol, rxn_placeholder = rxn_datum | |
| 245 if mol.HasSubstructMatch(substruct_match_mol): | |
| 246 if rxn_placeholder is None: | |
| 247 current_rxn_str = reactant_smarts + '>>' + product_smarts | |
| 248 current_rxn = AllChem.ReactionFromSmarts(current_rxn_str) | |
| 249 rxn_data[i][3] = current_rxn # Update the placeholder. | |
| 250 else: | |
| 251 current_rxn = rxn_data[i][3] | |
| 252 break | |
| 253 | |
| 254 # Perform the reaction if necessary | |
| 255 if current_rxn is None: # No reaction left, so break out of while loop. | |
| 256 break | |
| 257 else: | |
| 258 mol = current_rxn.RunReactants((mol,))[0][0] | |
| 259 mol.UpdatePropertyCache(strict=False) # Update valences | |
| 260 | |
| 261 # The mols have been altered from the reactions described above, we need | |
| 262 # to resanitize them. Make sure aromatic rings are shown as such This | |
| 263 # catches all RDKit Errors. without the catchError and sanitizeOps the | |
| 264 # Chem.SanitizeMol can crash the program. | |
| 265 sanitize_string = Chem.SanitizeMol( | |
| 266 mol, | |
| 267 sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, | |
| 268 catchErrors = True | |
| 269 ) | |
| 270 | |
| 271 return mol if sanitize_string.name == "SANITIZE_NONE" else None | |
| 272 | |
| 273 @staticmethod | |
| 274 def convert_smiles_str_to_mol(smiles_str): | |
| 275 """Given a SMILES string, check that it is actually a string and not a | |
| 276 None. Then try to convert it to an RDKit Mol Object. | |
| 277 | |
| 278 :param string smiles_str: The SMILES string. | |
| 279 :return: A rdkit.Chem.rdchem.Mol object, or None if it is the wrong type or | |
| 280 if it fails to convert to a Mol Obj | |
| 281 """ | |
| 282 | |
| 283 # Check that there are no type errors, ie Nones or non-string | |
| 284 # A non-string type will cause RDKit to hard crash | |
| 285 if smiles_str is None or type(smiles_str) is not str: | |
| 286 return None | |
| 287 | |
| 288 # Try to fix azides here. They are just tricky to deal with. | |
| 289 smiles_str = smiles_str.replace("N=N=N", "N=[N+]=N") | |
| 290 smiles_str = smiles_str.replace("NN#N", "N=[N+]=N") | |
| 291 | |
| 292 # Now convert to a mol object. Note the trick that is necessary to | |
| 293 # capture RDKit error/warning messages. See | |
| 294 # https://stackoverflow.com/questions/24277488/in-python-how-to-capture-the-stdout-from-a-c-shared-library-to-a-variable | |
| 295 stderr_fileno = sys.stderr.fileno() | |
| 296 stderr_save = os.dup(stderr_fileno) | |
| 297 stderr_pipe = os.pipe() | |
| 298 os.dup2(stderr_pipe[1], stderr_fileno) | |
| 299 os.close(stderr_pipe[1]) | |
| 300 | |
| 301 mol = Chem.MolFromSmiles(smiles_str) | |
| 302 | |
| 303 os.close(stderr_fileno) | |
| 304 os.close(stderr_pipe[0]) | |
| 305 os.dup2(stderr_save, stderr_fileno) | |
| 306 os.close(stderr_save) | |
| 307 | |
| 308 # Check that there are None type errors Chem.MolFromSmiles has sanitize on | |
| 309 # which means if there is even a small error in the SMILES (kekulize, | |
| 310 # nitrogen charge...) then mol=None. ie. | |
| 311 # Chem.MolFromSmiles("C[N]=[N]=[N]") = None this is an example of an | |
| 312 # nitrogen charge error. It is cased in a try statement to be overly | |
| 313 # cautious. | |
| 314 | |
| 315 return None if mol is None else mol | |
| 316 | |
| 317 @staticmethod | |
| 318 def eprint(*args, **kwargs): | |
| 319 """Error messages should be printed to STDERR. See | |
| 320 https://stackoverflow.com/questions/5574702/how-to-print-to-stderr-in-python""" | |
| 321 | |
| 322 print(*args, file=sys.stderr, **kwargs) | |
| 323 | |
| 324 class LoadSMIFile(object): | |
| 325 """A generator class for loading in the SMILES strings from a file, one at | |
| 326 a time.""" | |
| 327 | |
| 328 def __init__(self, filename): | |
| 329 """Initializes this class. | |
| 330 | |
| 331 :param filename: The filename or file object (i.e., StringIO). | |
| 332 :type filename: str or StringIO | |
| 333 """ | |
| 334 | |
| 335 if type(filename) is str: | |
| 336 # It's a filename | |
| 337 self.f = open(filename, "r") | |
| 338 else: | |
| 339 # It's a file object (i.e., StringIO) | |
| 340 self.f = filename | |
| 341 | |
| 342 def __iter__(self): | |
| 343 """Returns this generator object. | |
| 344 | |
| 345 :return: This generator object. | |
| 346 :rtype: LoadSMIFile | |
| 347 """ | |
| 348 | |
| 349 return self | |
| 350 | |
| 351 def __next__(self): | |
| 352 """Ensure Python3 compatibility. | |
| 353 | |
| 354 :return: A dict, where the "smiles" key contains the canonical SMILES | |
| 355 string and the "data" key contains the remaining information | |
| 356 (e.g., the molecule name). | |
| 357 :rtype: dict | |
| 358 """ | |
| 359 | |
| 360 return self.next() | |
| 361 | |
| 362 def next(self): | |
| 363 """Get the data associated with the next line. | |
| 364 | |
| 365 :raises StopIteration: If there are no more lines left iin the file. | |
| 366 :return: A dict, where the "smiles" key contains the canonical SMILES | |
| 367 string and the "data" key contains the remaining information | |
| 368 (e.g., the molecule name). | |
| 369 :rtype: dict | |
| 370 """ | |
| 371 | |
| 372 line = self.f.readline() | |
| 373 | |
| 374 if line == "": | |
| 375 # EOF | |
| 376 self.f.close() | |
| 377 raise StopIteration() | |
| 378 return | |
| 379 | |
| 380 # Divide line into smi and data | |
| 381 splits = line.split() | |
| 382 if len(splits) != 0: | |
| 383 # Generate mol object | |
| 384 smiles_str = splits[0] | |
| 385 | |
| 386 # Convert from SMILES string to RDKIT Mol. This series of tests is | |
| 387 # to make sure the SMILES string is properly formed and to get it | |
| 388 # into a canonical form. Filter if failed. | |
| 389 mol = UtilFuncs.convert_smiles_str_to_mol(smiles_str) | |
| 390 if mol is None: | |
| 391 UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line) | |
| 392 return self.next() | |
| 393 | |
| 394 # Handle nuetralizing the molecules. Filter if failed. | |
| 395 mol = UtilFuncs.neutralize_mol(mol) | |
| 396 if mol is None: | |
| 397 UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line) | |
| 398 return self.next() | |
| 399 | |
| 400 # Remove the hydrogens. | |
| 401 try: | |
| 402 mol = Chem.RemoveHs(mol) | |
| 403 except: | |
| 404 UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line) | |
| 405 return self.next() | |
| 406 | |
| 407 if mol is None: | |
| 408 UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line) | |
| 409 return self.next() | |
| 410 | |
| 411 # Regenerate the smiles string (to standardize). | |
| 412 new_mol_string = Chem.MolToSmiles(mol, isomericSmiles=True) | |
| 413 | |
| 414 return { | |
| 415 "smiles": new_mol_string, | |
| 416 "data": splits[1:] | |
| 417 } | |
| 418 else: | |
| 419 # Blank line? Go to next one. | |
| 420 return self.next() | |
| 421 | |
| 422 class Protonate(object): | |
| 423 """A generator class for protonating SMILES strings, one at a time.""" | |
| 424 | |
| 425 def __init__(self, args): | |
| 426 """Initialize the generator. | |
| 427 | |
| 428 :param args: A dictionary containing the arguments. | |
| 429 :type args: dict | |
| 430 """ | |
| 431 | |
| 432 # Make the args an object variable variable. | |
| 433 self.args = args | |
| 434 | |
| 435 # A list to store the protonated SMILES strings associated with a | |
| 436 # single input model. | |
| 437 self.cur_prot_SMI = [] | |
| 438 | |
| 439 # Clean and normalize the args | |
| 440 self.args = ArgParseFuncs.clean_args(args) | |
| 441 | |
| 442 # Load the substructures that can be protonated. | |
| 443 self.subs = ProtSubstructFuncs.load_protonation_substructs_calc_state_for_ph( | |
| 444 self.args["min_ph"], self.args["max_ph"], self.args["pka_precision"] | |
| 445 ) | |
| 446 | |
| 447 def __iter__(self): | |
| 448 """Returns this generator object. | |
| 449 | |
| 450 :return: This generator object. | |
| 451 :rtype: Protonate | |
| 452 """ | |
| 453 | |
| 454 return self | |
| 455 | |
| 456 def __next__(self): | |
| 457 """Ensure Python3 compatibility. | |
| 458 | |
| 459 :return: A dict, where the "smiles" key contains the canonical SMILES | |
| 460 string and the "data" key contains the remaining information | |
| 461 (e.g., the molecule name). | |
| 462 :rtype: dict | |
| 463 """ | |
| 464 | |
| 465 return self.next() | |
| 466 | |
| 467 def next(self): | |
| 468 """Get the next protonated SMILES string. | |
| 469 | |
| 470 :raises StopIteration: If there are no more lines left iin the file. | |
| 471 :return: TODO A dict, where the "smiles" key contains the canonical SMILES | |
| 472 string and the "data" key contains the remaining information | |
| 473 (e.g., the molecule name). | |
| 474 :rtype: dict | |
| 475 """ | |
| 476 | |
| 477 # If there are any SMILES strings in self.cur_prot_SMI, just return | |
| 478 # the first one and update the list to include only the remaining. | |
| 479 if len(self.cur_prot_SMI) > 0: | |
| 480 first, self.cur_prot_SMI = self.cur_prot_SMI[0], self.cur_prot_SMI[1:] | |
| 481 return first | |
| 482 | |
| 483 # self.cur_prot_SMI is empty, so try to add more to it. | |
| 484 | |
| 485 # Get the next SMILES string from the input file. | |
| 486 try: | |
| 487 smile_and_datum = self.args["smiles_and_data"].next() | |
| 488 except StopIteration: | |
| 489 # There are no more input smiles strings... | |
| 490 raise StopIteration() | |
| 491 | |
| 492 smi = smile_and_datum["smiles"] | |
| 493 data = smile_and_datum["data"] # Everything on SMILES line but the | |
| 494 # SMILES string itself (e.g., the | |
| 495 # molecule name). | |
| 496 | |
| 497 # Collect the data associated with this smiles (e.g., the molecule | |
| 498 # name). | |
| 499 tag = " ".join(data) | |
| 500 | |
| 501 # sites is a list of (atom index, "PROTONATED|DEPROTONATED|BOTH"). | |
| 502 # Note that the second entry indicates what state the site SHOULD be | |
| 503 # in (not the one it IS in per the SMILES string). It's calculated | |
| 504 # based on the probablistic distributions obtained during training. | |
| 505 sites = ProtSubstructFuncs.get_prot_sites_and_target_states(smi, self.subs) | |
| 506 | |
| 507 new_smis = [smi] | |
| 508 for site in sites: | |
| 509 # Make a new smiles with the correct protonation state. Note that | |
| 510 # new_smis is a growing list. This is how multiple protonation | |
| 511 # sites are handled. | |
| 512 | |
| 513 # new_smis_to_perhaps_add = ProtSubstructFuncs.protonate_site(new_smis, site) | |
| 514 new_smis = ProtSubstructFuncs.protonate_site(new_smis, site) | |
| 515 # print(site, new_smis) # Good for debugging. | |
| 516 | |
| 517 # Only add new smiles if not already in the list. | |
| 518 # for s in new_smis_to_perhaps_add: | |
| 519 # if not s in new_smis: | |
| 520 # new_smis.append(s) | |
| 521 | |
| 522 # In some cases, the script might generate redundant molecules. | |
| 523 # Phosphonates, when the pH is between the two pKa values and the | |
| 524 # stdev value is big enough, for example, will generate two identical | |
| 525 # BOTH states. Let's remove this redundancy. | |
| 526 new_smis = list(set(new_smis)) | |
| 527 | |
| 528 # Deprotonating protonated aromatic nitrogen gives [nH-]. Change this | |
| 529 # to [n-]. This is a hack. | |
| 530 new_smis = [s.replace("[nH-]", "[n-]") for s in new_smis] | |
| 531 | |
| 532 # Sometimes Dimorphite-DL generates molecules that aren't actually | |
| 533 # possible. Simply convert these to mol objects to eliminate the bad | |
| 534 # ones (that are None). | |
| 535 new_smis = [s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None] | |
| 536 | |
| 537 # If there are no smi left, return the input one at the very least. | |
| 538 # All generated forms have apparently been judged | |
| 539 # inappropriate/mal-formed. | |
| 540 if len(new_smis) == 0: | |
| 541 new_smis = [smi] | |
| 542 | |
| 543 # If the user wants to see the target states, add those | |
| 544 # to the ends of each line. | |
| 545 if self.args["label_states"]: | |
| 546 states = '\t'.join([x[1] for x in sites]) | |
| 547 new_lines = [x + "\t" + tag + "\t" + states for x in new_smis] | |
| 548 else: | |
| 549 new_lines = [x + "\t" + tag for x in new_smis] | |
| 550 | |
| 551 self.cur_prot_SMI = new_lines | |
| 552 | |
| 553 return self.next() | |
| 554 | |
| 555 class ProtSubstructFuncs: | |
| 556 """A namespace to store functions for loading the substructures that can | |
| 557 be protonated. To keep things organized.""" | |
| 558 | |
| 559 @staticmethod | |
| 560 def load_protonation_substructs_calc_state_for_ph(min_ph=6.4, max_ph=8.4, pka_std_range=1): | |
| 561 """A pre-calculated list of R-groups with protonation sites, with their | |
| 562 likely pKa bins. | |
| 563 | |
| 564 :param float min_ph: The lower bound on the pH range, defaults to 6.4. | |
| 565 :param float max_ph: The upper bound on the pH range, defaults to 8.4. | |
| 566 :param pka_std_range: Basically the precision (stdev from predicted pKa to | |
| 567 consider), defaults to 1. | |
| 568 :return: A dict of the protonation substructions for the specified pH | |
| 569 range. | |
| 570 """ | |
| 571 | |
| 572 subs = [] | |
| 573 pwd = os.path.dirname(os.path.realpath(__file__)) | |
| 574 | |
| 575 site_structures_file = "{}/{}".format(pwd, "site_substructures.smarts") | |
| 576 with open(site_structures_file, 'r') as substruct: | |
| 577 for line in substruct: | |
| 578 line = line.strip() | |
| 579 sub = {} | |
| 580 if line is not "": | |
| 581 splits = line.split() | |
| 582 sub["name"] = splits[0] | |
| 583 sub["smart"] = splits[1] | |
| 584 sub["mol"] = Chem.MolFromSmarts(sub["smart"]) | |
| 585 | |
| 586 # NEED TO DIVIDE THIS BY 3s | |
| 587 pka_ranges = [splits[i:i+3] for i in range(2, len(splits)-1, 3)] | |
| 588 | |
| 589 prot = [] | |
| 590 for pka_range in pka_ranges: | |
| 591 site = pka_range[0] | |
| 592 std = float(pka_range[2]) * pka_std_range | |
| 593 mean = float(pka_range[1]) | |
| 594 protonation_state = ProtSubstructFuncs.define_protonation_state( | |
| 595 mean, std, min_ph, max_ph | |
| 596 ) | |
| 597 | |
| 598 prot.append([site, protonation_state]) | |
| 599 | |
| 600 sub["prot_states_for_pH"] = prot | |
| 601 subs.append(sub) | |
| 602 return subs | |
| 603 | |
| 604 @staticmethod | |
| 605 def define_protonation_state(mean, std, min_ph, max_ph): | |
| 606 """Updates the substructure definitions to include the protonation state | |
| 607 based on the user-given pH range. The size of the pKa range is also based | |
| 608 on the number of standard deviations to be considered by the user param. | |
| 609 | |
| 610 :param float mean: The mean pKa. | |
| 611 :param float std: The precision (stdev). | |
| 612 :param float min_ph: The min pH of the range. | |
| 613 :param float max_ph: The max pH of the range. | |
| 614 :return: A string describing the protonation state. | |
| 615 """ | |
| 616 | |
| 617 min_pka = mean - std | |
| 618 max_pka = mean + std | |
| 619 | |
| 620 # This needs to be reassigned, and 'ERROR' should never make it past the | |
| 621 # next set of checks. | |
| 622 if min_pka <= max_ph and min_ph <= max_pka: | |
| 623 protonation_state = 'BOTH' | |
| 624 elif mean > max_ph: | |
| 625 protonation_state = 'PROTONATED' | |
| 626 else: | |
| 627 protonation_state = 'DEPROTONATED' | |
| 628 | |
| 629 return protonation_state | |
| 630 | |
| 631 @staticmethod | |
| 632 def get_prot_sites_and_target_states(smi, subs): | |
| 633 """For a single molecule, find all possible matches in the protonation | |
| 634 R-group list, subs. Items that are higher on the list will be matched | |
| 635 first, to the exclusion of later items. | |
| 636 | |
| 637 :param string smi: A SMILES string. | |
| 638 :param list subs: Substructure information. | |
| 639 :return: A list of protonation sites and their pKa bin. ('PROTONATED', | |
| 640 'BOTH', or 'DEPROTONATED') | |
| 641 """ | |
| 642 | |
| 643 # Convert the Smiles string (smi) to an RDKit Mol Obj | |
| 644 mol = UtilFuncs.convert_smiles_str_to_mol(smi) | |
| 645 | |
| 646 # Check Conversion worked | |
| 647 if mol is None: | |
| 648 UtilFuncs.eprint("ERROR: ", smi) | |
| 649 return [] | |
| 650 | |
| 651 # Try to Add hydrogens. if failed return [] | |
| 652 try: | |
| 653 mol = Chem.AddHs(mol) | |
| 654 except: | |
| 655 UtilFuncs.eprint("ERROR: ", smi) | |
| 656 return [] | |
| 657 | |
| 658 # Check adding Hs worked | |
| 659 if mol is None: | |
| 660 UtilFuncs.eprint("ERROR: ", smi) | |
| 661 return [] | |
| 662 | |
| 663 ProtectUnprotectFuncs.unprotect_molecule(mol) | |
| 664 protonation_sites = [] | |
| 665 | |
| 666 for item in subs: | |
| 667 smart = item["mol"] | |
| 668 if mol.HasSubstructMatch(smart): | |
| 669 matches = ProtectUnprotectFuncs.get_unprotected_matches(mol, smart) | |
| 670 prot = item["prot_states_for_pH"] | |
| 671 for match in matches: | |
| 672 # We want to move the site from being relative to the | |
| 673 # substructure, to the index on the main molecule. | |
| 674 for site in prot: | |
| 675 proton = int(site[0]) | |
| 676 category = site[1] | |
| 677 new_site = (match[proton], category, item["name"]) | |
| 678 | |
| 679 if not new_site in protonation_sites: | |
| 680 # Because sites must be unique. | |
| 681 protonation_sites.append(new_site) | |
| 682 | |
| 683 ProtectUnprotectFuncs.protect_molecule(mol, match) | |
| 684 | |
| 685 return protonation_sites | |
| 686 | |
| 687 @staticmethod | |
| 688 def protonate_site(smis, site): | |
| 689 """Given a list of SMILES strings, we protonate the site. | |
| 690 | |
| 691 :param list smis: The list of SMILES strings. | |
| 692 :param tuple site: Information about the protonation site. | |
| 693 (idx, target_prot_state, prot_site_name) | |
| 694 :return: A list of the appropriately protonated SMILES. | |
| 695 """ | |
| 696 | |
| 697 # Decouple the atom index and its target protonation state from the site | |
| 698 # tuple | |
| 699 idx, target_prot_state, prot_site_name = site | |
| 700 | |
| 701 # Initialize the output list | |
| 702 output_smis = [] | |
| 703 | |
| 704 state_to_charge = {"DEPROTONATED": [-1], | |
| 705 "PROTONATED": [0], | |
| 706 "BOTH": [-1, 0]} | |
| 707 | |
| 708 charges = state_to_charge[target_prot_state] | |
| 709 | |
| 710 # Now make the actual smiles match the target protonation state. | |
| 711 output_smis = ProtSubstructFuncs.set_protonation_charge(smis, idx, charges, prot_site_name) | |
| 712 | |
| 713 return output_smis | |
| 714 | |
| 715 @staticmethod | |
| 716 def set_protonation_charge(smis, idx, charges, prot_site_name): | |
| 717 """Sets the atomic charge on a particular site for a set of SMILES. | |
| 718 | |
| 719 :param list smis: A list of the SMILES strings. | |
| 720 :param int idx: The index of the atom to consider. | |
| 721 :param list charges: A list of the charges (ints) to assign at | |
| 722 this site. | |
| 723 :param string prot_site_name: The name of the protonation site. | |
| 724 :return: A list of the processed SMILES strings. | |
| 725 """ | |
| 726 | |
| 727 # Sets up the output list and the Nitrogen charge | |
| 728 output = [] | |
| 729 | |
| 730 for charge in charges: | |
| 731 # The charge for Nitrogens is 1 higher than others (i.e., protonated | |
| 732 # state is positively charged). | |
| 733 nitro_charge = charge + 1 | |
| 734 | |
| 735 # But there are a few nitrogen moieties where the acidic group is the | |
| 736 # neutral one. Amides are a good example. I gave some thought re. how | |
| 737 # to best flag these. I decided that those nitrogen-containing | |
| 738 # moieties where the acidic group is neutral (rather than positively | |
| 739 # charged) will have "*" in the name. | |
| 740 if "*" in prot_site_name: | |
| 741 nitro_charge = nitro_charge - 1 # Undo what was done previously. | |
| 742 | |
| 743 for smi in smis: | |
| 744 | |
| 745 # Convert smilesstring (smi) into a RDKit Mol | |
| 746 mol = UtilFuncs.convert_smiles_str_to_mol(smi) | |
| 747 | |
| 748 # Check that the conversion worked, skip if it fails | |
| 749 if mol is None: | |
| 750 continue | |
| 751 | |
| 752 atom = mol.GetAtomWithIdx(idx) | |
| 753 | |
| 754 # Assign the protonation charge, with special care for Nitrogens | |
| 755 element = atom.GetAtomicNum() | |
| 756 if element == 7: | |
| 757 atom.SetFormalCharge(nitro_charge) | |
| 758 else: | |
| 759 atom.SetFormalCharge(charge) | |
| 760 | |
| 761 # Convert back to SMILE and add to output | |
| 762 out_smile = Chem.MolToSmiles(mol, isomericSmiles=True,canonical=True) | |
| 763 output.append(out_smile) | |
| 764 | |
| 765 return output | |
| 766 | |
| 767 class ProtectUnprotectFuncs: | |
| 768 """A namespace for storing functions that are useful for protecting and | |
| 769 unprotecting molecules. To keep things organized. We need to identify and | |
| 770 mark groups that have been matched with a substructure.""" | |
| 771 | |
| 772 @staticmethod | |
| 773 def unprotect_molecule(mol): | |
| 774 """Sets the protected property on all atoms to 0. This also creates the | |
| 775 property for new molecules. | |
| 776 | |
| 777 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object. | |
| 778 :type mol: The rdkit Mol object with atoms unprotected. | |
| 779 """ | |
| 780 | |
| 781 for atom in mol.GetAtoms(): | |
| 782 atom.SetProp('_protected', '0') | |
| 783 | |
| 784 @staticmethod | |
| 785 def protect_molecule(mol, match): | |
| 786 """Given a 'match', a list of molecules idx's, we set the protected status | |
| 787 of each atom to 1. This will prevent any matches using that atom in the | |
| 788 future. | |
| 789 | |
| 790 :param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object to protect. | |
| 791 :param list match: A list of molecule idx's. | |
| 792 """ | |
| 793 | |
| 794 for idx in match: | |
| 795 atom = mol.GetAtomWithIdx(idx) | |
| 796 atom.SetProp('_protected', '1') | |
| 797 | |
| 798 @staticmethod | |
| 799 def get_unprotected_matches(mol, substruct): | |
| 800 """Finds substructure matches with atoms that have not been protected. | |
| 801 Returns list of matches, each match a list of atom idxs. | |
| 802 | |
| 803 :param rdkit.Chem.rdchem.Mol mol: The Mol object to consider. | |
| 804 :param string substruct: The SMARTS string of the substructure ot match. | |
| 805 :return: A list of the matches. Each match is itself a list of atom idxs. | |
| 806 """ | |
| 807 | |
| 808 matches = mol.GetSubstructMatches(substruct) | |
| 809 unprotected_matches = [] | |
| 810 for match in matches: | |
| 811 if ProtectUnprotectFuncs.is_match_unprotected(mol, match): | |
| 812 unprotected_matches.append(match) | |
| 813 return unprotected_matches | |
| 814 | |
| 815 @staticmethod | |
| 816 def is_match_unprotected(mol, match): | |
| 817 """Checks a molecule to see if the substructure match contains any | |
| 818 protected atoms. | |
| 819 | |
| 820 :param rdkit.Chem.rdchem.Mol mol: The Mol object to check. | |
| 821 :param list match: The match to check. | |
| 822 :return: A boolean, whether the match is present or not. | |
| 823 """ | |
| 824 | |
| 825 for idx in match: | |
| 826 atom = mol.GetAtomWithIdx(idx) | |
| 827 protected = atom.GetProp("_protected") | |
| 828 if protected == "1": | |
| 829 return False | |
| 830 return True | |
| 831 | |
| 832 class TestFuncs: | |
| 833 """A namespace for storing functions that perform tests on the code. To | |
| 834 keep things organized.""" | |
| 835 | |
| 836 @staticmethod | |
| 837 def test(): | |
| 838 """Tests all the 38 groups.""" | |
| 839 | |
| 840 smis = [ | |
| 841 # [input smiles, pka, protonated, deprotonated, category] | |
| 842 ["C#CCO", "C#CCO", "C#CC[O-]", "Alcohol"], | |
| 843 ["C(=O)N", "NC=O", "[NH-]C=O", "Amide"], | |
| 844 ["CC(=O)NOC(C)=O", "CC(=O)NOC(C)=O", "CC(=O)[N-]OC(C)=O", "Amide_electronegative"], | |
| 845 ["COC(=N)N", "COC(N)=[NH2+]", "COC(=N)N", "AmidineGuanidine2"], | |
| 846 ["Brc1ccc(C2NCCS2)cc1", "Brc1ccc(C2[NH2+]CCS2)cc1", "Brc1ccc(C2NCCS2)cc1", "Amines_primary_secondary_tertiary"], | |
| 847 ["CC(=O)[n+]1ccc(N)cc1", "CC(=O)[n+]1ccc([NH3+])cc1", "CC(=O)[n+]1ccc(N)cc1", "Anilines_primary"], | |
| 848 ["CCNc1ccccc1", "CC[NH2+]c1ccccc1", "CCNc1ccccc1", "Anilines_secondary"], | |
| 849 ["Cc1ccccc1N(C)C", "Cc1ccccc1[NH+](C)C", "Cc1ccccc1N(C)C", "Anilines_tertiary"], | |
| 850 ["BrC1=CC2=C(C=C1)NC=C2", "Brc1ccc2[nH]ccc2c1", "Brc1ccc2[n-]ccc2c1", "Indole_pyrrole"], | |
| 851 ["O=c1cc[nH]cc1", "O=c1cc[nH]cc1", "O=c1cc[n-]cc1", "Aromatic_nitrogen_protonated"], | |
| 852 ["C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", "Azide"], | |
| 853 ["BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr", "O=C([O-])C(Br)CBr", "Carboxyl"], | |
| 854 ["NC(NN=O)=N", "NC(=[NH2+])NN=O", "N=C(N)NN=O", "AmidineGuanidine1"], | |
| 855 ["C(F)(F)(F)C(=O)NC(=O)C", "CC(=O)NC(=O)C(F)(F)F", "CC(=O)[N-]C(=O)C(F)(F)F", "Imide"], | |
| 856 ["O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", "Imide2"], | |
| 857 ["CC(C)(C)C(N(C)O)=O", "CN(O)C(=O)C(C)(C)C", "CN([O-])C(=O)C(C)(C)C", "N-hydroxyamide"], | |
| 858 ["C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", "Nitro"], | |
| 859 ["O=C1C=C(O)CC1", "O=C1C=C(O)CC1", "O=C1C=C([O-])CC1", "O=C-C=C-OH"], | |
| 860 ["C1CC1OO", "OOC1CC1", "[O-]OC1CC1", "Peroxide2"], | |
| 861 ["C(=O)OO", "O=COO", "O=CO[O-]", "Peroxide1"], | |
| 862 ["Brc1cc(O)cc(Br)c1", "Oc1cc(Br)cc(Br)c1", "[O-]c1cc(Br)cc(Br)c1", "Phenol"], | |
| 863 ["CC(=O)c1ccc(S)cc1", "CC(=O)c1ccc(S)cc1", "CC(=O)c1ccc([S-])cc1", "Phenyl_Thiol"], | |
| 864 ["C=CCOc1ccc(C(=O)O)cc1", "C=CCOc1ccc(C(=O)O)cc1", "C=CCOc1ccc(C(=O)[O-])cc1", "Phenyl_carboxyl"], | |
| 865 ["COP(=O)(O)OC", "COP(=O)(O)OC", "COP(=O)([O-])OC", "Phosphate_diester"], | |
| 866 ["CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", "Phosphinic_acid"], | |
| 867 ["CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)[O-]", "Phosphonate_ester"], | |
| 868 ["CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)[N-]C1=O", "Ringed_imide1"], | |
| 869 ["O=C(N1)C=CC1=O", "O=C1C=CC(=O)N1", "O=C1C=CC(=O)[N-]1", "Ringed_imide2"], | |
| 870 ["O=S(OC)(O)=O", "COS(=O)(=O)O", "COS(=O)(=O)[O-]", "Sulfate"], | |
| 871 ["COc1ccc(S(=O)O)cc1", "COc1ccc(S(=O)O)cc1", "COc1ccc(S(=O)[O-])cc1", "Sulfinic_acid"], | |
| 872 ["CS(N)(=O)=O", "CS(N)(=O)=O", "CS([NH-])(=O)=O", "Sulfonamide"], | |
| 873 ["CC(=O)CSCCS(O)(=O)=O", "CC(=O)CSCCS(=O)(=O)O", "CC(=O)CSCCS(=O)(=O)[O-]", "Sulfonate"], | |
| 874 ["CC(=O)S", "CC(=O)S", "CC(=O)[S-]", "Thioic_acid"], | |
| 875 ["C(C)(C)(C)(S)", "CC(C)(C)S", "CC(C)(C)[S-]", "Thiol"], | |
| 876 ["Brc1cc[nH+]cc1", "Brc1cc[nH+]cc1", "Brc1ccncc1", "Aromatic_nitrogen_unprotonated"], | |
| 877 ["C=C(O)c1c(C)cc(C)cc1C", "C=C(O)c1c(C)cc(C)cc1C", "C=C([O-])c1c(C)cc(C)cc1C", "Vinyl_alcohol"], | |
| 878 ["CC(=O)ON", "CC(=O)O[NH3+]", "CC(=O)ON", "Primary_hydroxyl_amine"] | |
| 879 ] | |
| 880 | |
| 881 smis_phos = [ | |
| 882 ["O=P(O)(O)OCCCC", "CCCCOP(=O)(O)O", "CCCCOP(=O)([O-])O", "CCCCOP(=O)([O-])[O-]", "Phosphate"], | |
| 883 ["CC(P(O)(O)=O)C", "CC(C)P(=O)(O)O", "CC(C)P(=O)([O-])O", "CC(C)P(=O)([O-])[O-]", "Phosphonate"] | |
| 884 ] | |
| 885 | |
| 886 # Load the average pKa values. | |
| 887 average_pkas = {l.split()[0].replace("*", ""):float(l.split()[3]) for l in open("site_substructures.smarts") if l.split()[0] not in ["Phosphate", "Phosphonate"]} | |
| 888 average_pkas_phos = {l.split()[0].replace("*", ""):[float(l.split()[3]), float(l.split()[6])] for l in open("site_substructures.smarts") if l.split()[0] in ["Phosphate", "Phosphonate"]} | |
| 889 | |
| 890 print("Running Tests") | |
| 891 print("=============") | |
| 892 print("") | |
| 893 | |
| 894 print("Very Acidic (pH -10000000)") | |
| 895 print("--------------------------") | |
| 896 print("") | |
| 897 | |
| 898 args = { | |
| 899 "min_ph": -10000000, | |
| 900 "max_ph": -10000000, | |
| 901 "pka_precision": 0.5, | |
| 902 "smiles": "", | |
| 903 "label_states": True | |
| 904 } | |
| 905 | |
| 906 for smi, protonated, deprotonated, category in smis: | |
| 907 args["smiles"] = smi | |
| 908 TestFuncs.test_check(args, [protonated], ["PROTONATED"]) | |
| 909 | |
| 910 for smi, protonated, mix, deprotonated, category in smis_phos: | |
| 911 args["smiles"] = smi | |
| 912 TestFuncs.test_check(args, [protonated], ["PROTONATED"]) | |
| 913 | |
| 914 args["min_ph"] = 10000000 | |
| 915 args["max_ph"] = 10000000 | |
| 916 | |
| 917 print("") | |
| 918 print("Very Basic (pH 10000000)") | |
| 919 print("------------------------") | |
| 920 print("") | |
| 921 | |
| 922 for smi, protonated, deprotonated, category in smis: | |
| 923 args["smiles"] = smi | |
| 924 TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"]) | |
| 925 | |
| 926 for smi, protonated, mix, deprotonated, category in smis_phos: | |
| 927 args["smiles"] = smi | |
| 928 TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"]) | |
| 929 | |
| 930 print("") | |
| 931 print("pH is Category pKa") | |
| 932 print("------------------") | |
| 933 print("") | |
| 934 | |
| 935 for smi, protonated, deprotonated, category in smis: | |
| 936 avg_pka = average_pkas[category] | |
| 937 | |
| 938 args["smiles"] = smi | |
| 939 args["min_ph"] = avg_pka | |
| 940 args["max_ph"] = avg_pka | |
| 941 | |
| 942 TestFuncs.test_check(args, [protonated, deprotonated], ["BOTH"]) | |
| 943 | |
| 944 for smi, protonated, mix, deprotonated, category in smis_phos: | |
| 945 args["smiles"] = smi | |
| 946 | |
| 947 avg_pka = average_pkas_phos[category][0] | |
| 948 args["min_ph"] = avg_pka | |
| 949 args["max_ph"] = avg_pka | |
| 950 | |
| 951 TestFuncs.test_check(args, [mix, protonated], ["BOTH"]) | |
| 952 | |
| 953 avg_pka = average_pkas_phos[category][1] | |
| 954 args["min_ph"] = avg_pka | |
| 955 args["max_ph"] = avg_pka | |
| 956 | |
| 957 TestFuncs.test_check(args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"]) | |
| 958 | |
| 959 avg_pka = 0.5 * (average_pkas_phos[category][0] + average_pkas_phos[category][1]) | |
| 960 args["min_ph"] = avg_pka | |
| 961 args["max_ph"] = avg_pka | |
| 962 args["pka_precision"] = 5 # Should give all three | |
| 963 | |
| 964 TestFuncs.test_check(args, [mix, deprotonated, protonated], ["BOTH", "BOTH"]) | |
| 965 | |
| 966 @staticmethod | |
| 967 def test_check(args, expected_output, labels): | |
| 968 """Tests most ionizable groups. The ones that can only loose or gain a single proton. | |
| 969 | |
| 970 :param args: The arguments to pass to protonate() | |
| 971 :param expected_output: A list of the expected SMILES-strings output. | |
| 972 :param labels: The labels. A list containing combo of BOTH, PROTONATED, | |
| 973 DEPROTONATED. | |
| 974 :raises Exception: Wrong number of states produced. | |
| 975 :raises Exception: Unexpected output SMILES. | |
| 976 :raises Exception: Wrong labels. | |
| 977 """ | |
| 978 | |
| 979 output = list(Protonate(args)) | |
| 980 output = [o.split() for o in output] | |
| 981 | |
| 982 num_states = len(expected_output) | |
| 983 | |
| 984 if (len(output) != num_states): | |
| 985 msg = args["smiles"] + " should have " + str(num_states) + \ | |
| 986 " states at at pH " + str(args["min_ph"]) + ": " + str(output) | |
| 987 print(msg) | |
| 988 raise Exception(msg) | |
| 989 | |
| 990 if (len(set([l[0] for l in output]) - set(expected_output)) != 0): | |
| 991 msg = args["smiles"] + " is not " + " AND ".join(expected_output) + \ | |
| 992 " at pH " + str(args["min_ph"]) + " - " + str(args["max_ph"]) + \ | |
| 993 "; it is " + " AND ".join([l[0] for l in output]) | |
| 994 print(msg) | |
| 995 raise Exception(msg) | |
| 996 | |
| 997 if (len(set([l[1] for l in output]) - set(labels)) != 0): | |
| 998 msg = args["smiles"] + " not labeled as " + " AND ".join(labels) + \ | |
| 999 "; it is " + " AND ".join([l[1] for l in output]) | |
| 1000 print(msg) | |
| 1001 raise Exception(msg) | |
| 1002 | |
| 1003 ph_range = sorted(list(set([args["min_ph"], args["max_ph"]]))) | |
| 1004 ph_range_str = "(" + " - ".join("{0:.2f}".format(n) for n in ph_range) + ")" | |
| 1005 print("(CORRECT) " + ph_range_str.ljust(10) + " " + args["smiles"] + " => " + " AND ".join([l[0] for l in output])) | |
| 1006 | |
| 1007 def run(**kwargs): | |
| 1008 """A helpful, importable function for those who want to call Dimorphite-DL | |
| 1009 from another Python script rather than the command line. Note that this | |
| 1010 function accepts keyword arguments that match the command-line parameters | |
| 1011 exactly. If you want to pass and return a list of RDKit Mol objects, import | |
| 1012 run_with_mol_list() instead. | |
| 1013 | |
| 1014 :param **kwargs: For a complete description, run dimorphite_dl.py from the | |
| 1015 command line with the -h option. | |
| 1016 :type kwargs: dict | |
| 1017 """ | |
| 1018 | |
| 1019 # Run the main function with the specified arguments. | |
| 1020 main(kwargs) | |
| 1021 | |
| 1022 def run_with_mol_list(mol_lst, **kwargs): | |
| 1023 """A helpful, importable function for those who want to call Dimorphite-DL | |
| 1024 from another Python script rather than the command line. Note that this | |
| 1025 function is for passing Dimorphite-DL a list of RDKit Mol objects, together | |
| 1026 with command-line parameters. If you want to use only the same parameters | |
| 1027 that you would use from the command line, import run() instead. | |
| 1028 | |
| 1029 :param mol_lst: A list of rdkit.Chem.rdchem.Mol objects. | |
| 1030 :type mol_lst: list | |
| 1031 :raises Exception: If the **kwargs includes "smiles", "smiles_file", | |
| 1032 "output_file", or "test" parameters. | |
| 1033 :return: A list of properly protonated rdkit.Chem.rdchem.Mol objects. | |
| 1034 :rtype: list | |
| 1035 """ | |
| 1036 | |
| 1037 # Do a quick check to make sure the user input makes sense. | |
| 1038 for bad_arg in ["smiles", "smiles_file", "output_file", "test"]: | |
| 1039 if bad_arg in kwargs: | |
| 1040 msg = "You're using Dimorphite-DL's run_with_mol_list(mol_lst, " + \ | |
| 1041 "**kwargs) function, but you also passed the \"" + \ | |
| 1042 bad_arg + "\" argument. Did you mean to use the " + \ | |
| 1043 "run(**kwargs) function instead?" | |
| 1044 print(msg) | |
| 1045 raise Exception(msg) | |
| 1046 | |
| 1047 # Set the return_as_list flag so main() will return the protonated smiles | |
| 1048 # as a list. | |
| 1049 kwargs["return_as_list"] = True | |
| 1050 | |
| 1051 # Having reviewed the code, it will be very difficult to rewrite it so | |
| 1052 # that a list of Mol objects can be used directly. Intead, convert this | |
| 1053 # list of mols to smiles and pass that. Not efficient, but it will work. | |
| 1054 protonated_smiles_and_props = [] | |
| 1055 for m in mol_lst: | |
| 1056 props = m.GetPropsAsDict() | |
| 1057 kwargs["smiles"] = Chem.MolToSmiles(m, isomericSmiles=True) | |
| 1058 protonated_smiles_and_props.extend( | |
| 1059 [(s.split("\t")[0], props) for s in main(kwargs)] | |
| 1060 ) | |
| 1061 | |
| 1062 # Now convert the list of protonated smiles strings back to RDKit Mol | |
| 1063 # objects. Also, add back in the properties from the original mol objects. | |
| 1064 mols = [] | |
| 1065 for s, props in protonated_smiles_and_props: | |
| 1066 m = Chem.MolFromSmiles(s) | |
| 1067 if m: | |
| 1068 for prop, val in props.items(): | |
| 1069 if type(val) is int: | |
| 1070 m.SetIntProp(prop, val) | |
| 1071 elif type(val) is float: | |
| 1072 m.SetDoubleProp(prop, val) | |
| 1073 elif type(val) is bool: | |
| 1074 m.SetBoolProp(prop, val) | |
| 1075 else: | |
| 1076 m.SetProp(prop, str(val)) | |
| 1077 mols.append(m) | |
| 1078 else: | |
| 1079 UtilFuncs.eprint("WARNING: Could not process molecule with SMILES string " + s + " and properties " + str(props)) | |
| 1080 | |
| 1081 return mols | |
| 1082 | |
| 1083 if __name__ == "__main__": | |
| 1084 main() |
