Mercurial > repos > bgruening > enumerate_charges
diff dimorphite_dl.py @ 5:a9b3a2912c08 draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit c1d813d3f0fec60ea6efe8a11e59d98bfdc1636f"
| author | bgruening | 
|---|---|
| date | Sat, 04 Dec 2021 16:38:07 +0000 | 
| parents | 9a2b0af78abc | 
| children | 
line wrap: on
 line diff
--- a/dimorphite_dl.py Wed Feb 17 12:57:32 2021 +0000 +++ b/dimorphite_dl.py Sat Dec 04 16:38:07 2021 +0000 @@ -19,8 +19,9 @@ """ from __future__ import print_function + +import argparse import os -import argparse import sys try: @@ -43,11 +44,12 @@ import rdkit from rdkit import Chem from rdkit.Chem import AllChem -except: +except Exception: msg = "Dimorphite-DL requires RDKit. See https://www.rdkit.org/" print(msg) raise Exception(msg) + def main(params=None): """The main definition run when you call the script from the commandline. @@ -84,13 +86,14 @@ with open(args["output_file"], "w") as file: for protonated_smi in Protonate(args): file.write(protonated_smi + "\n") - elif "return_as_list" in args and args["return_as_list"] == True: + elif "return_as_list" in args and args["return_as_list"]: return list(Protonate(args)) else: # No output file specified. Just print it to the screen. for protonated_smi in Protonate(args): print(protonated_smi) + class MyParser(argparse.ArgumentParser): """Overwrite default parse so it displays help file on error. See https://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu""" @@ -117,15 +120,18 @@ if file is None: file = sys.stdout self._print_message(self.format_help(), file) - print(""" + print( + """ examples: python dimorphite_dl.py --smiles_file sample_molecules.smi python dimorphite_dl.py --smiles "CCC(=O)O" --min_ph -3.0 --max_ph -2.0 python dimorphite_dl.py --smiles "CCCN" --min_ph -3.0 --max_ph -2.0 --output_file output.smi python dimorphite_dl.py --smiles_file sample_molecules.smi --pka_precision 2.0 --label_states - python dimorphite_dl.py --test""") + python dimorphite_dl.py --test""" + ) print("") + class ArgParseFuncs: """A namespace for storing functions that are useful for processing command-line arguments. To keep things organized.""" @@ -137,27 +143,57 @@ :return: A parser object. """ - parser = MyParser(description="Dimorphite 1.2: Creates models of " + - "appropriately protonated small moleucles. " + - "Apache 2.0 License. Copyright 2018 Jacob D. " + - "Durrant.") - parser.add_argument('--min_ph', metavar='MIN', type=float, default=6.4, - help='minimum pH to consider (default: 6.4)') - parser.add_argument('--max_ph', metavar='MAX', type=float, default=8.4, - help='maximum pH to consider (default: 8.4)') - parser.add_argument('--pka_precision', metavar='PRE', type=float, default=1.0, - help='pKa precision factor (number of standard devations, default: 1.0)') - parser.add_argument('--smiles', metavar='SMI', type=str, - help='SMILES string to protonate') - parser.add_argument('--smiles_file', metavar="FILE", type=str, - help='file that contains SMILES strings to protonate') - parser.add_argument('--output_file', metavar="FILE", type=str, - help='output file to write protonated SMILES (optional)') - parser.add_argument('--label_states', action="store_true", - help='label protonated SMILES with target state ' + \ - '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").') - parser.add_argument('--test', action="store_true", - help='run unit tests (for debugging)') + parser = MyParser( + description="Dimorphite 1.2: Creates models of " + + "appropriately protonated small moleucles. " + + "Apache 2.0 License. Copyright 2018 Jacob D. " + + "Durrant." + ) + parser.add_argument( + "--min_ph", + metavar="MIN", + type=float, + default=6.4, + help="minimum pH to consider (default: 6.4)", + ) + parser.add_argument( + "--max_ph", + metavar="MAX", + type=float, + default=8.4, + help="maximum pH to consider (default: 8.4)", + ) + parser.add_argument( + "--pka_precision", + metavar="PRE", + type=float, + default=1.0, + help="pKa precision factor (number of standard devations, default: 1.0)", + ) + parser.add_argument( + "--smiles", metavar="SMI", type=str, help="SMILES string to protonate" + ) + parser.add_argument( + "--smiles_file", + metavar="FILE", + type=str, + help="file that contains SMILES strings to protonate", + ) + parser.add_argument( + "--output_file", + metavar="FILE", + type=str, + help="output file to write protonated SMILES (optional)", + ) + parser.add_argument( + "--label_states", + action="store_true", + help="label protonated SMILES with target state " + + '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").', + ) + parser.add_argument( + "--test", action="store_true", help="run unit tests (for debugging)" + ) return parser @@ -170,11 +206,13 @@ :raises Exception: No SMILES in params. """ - defaults = {'min_ph' : 6.4, - 'max_ph' : 8.4, - 'pka_precision' : 1.0, - 'label_states' : False, - 'test' : False} + defaults = { + "min_ph": 6.4, + "max_ph": 8.4, + "pka_precision": 1.0, + "label_states": False, + "test": False, + } for key in defaults: if key not in args: @@ -194,12 +232,13 @@ # object. if "smiles" in args: if isinstance(args["smiles"], str): - args["smiles_file"] = StringIO(args["smiles"]) + args["smiles_file"] = StringIO(args["smiles"]) args["smiles_and_data"] = LoadSMIFile(args["smiles_file"]) return args + class UtilFuncs: """A namespace to store functions for manipulating mol objects. To keep things organized.""" @@ -215,15 +254,33 @@ # Get the reaction data rxn_data = [ - ['[Ov1-1:1]', '[Ov2+0:1]-[H]'], # To handle O- bonded to only one atom (add hydrogen). - ['[#7v4+1:1]-[H]', '[#7v3+0:1]'], # To handle N+ bonded to a hydrogen (remove hydrogen). - ['[Ov2-:1]', '[Ov2+0:1]'], # To handle O- bonded to two atoms. Should not be Negative. - ['[#7v3+1:1]', '[#7v3+0:1]'], # To handle N+ bonded to three atoms. Should not be positive. - ['[#7v2-1:1]', '[#7+0:1]-[H]'], # To handle N- Bonded to two atoms. Add hydrogen. + [ + "[Ov1-1:1]", + "[Ov2+0:1]-[H]", + ], # To handle O- bonded to only one atom (add hydrogen). + [ + "[#7v4+1:1]-[H]", + "[#7v3+0:1]", + ], # To handle N+ bonded to a hydrogen (remove hydrogen). + [ + "[Ov2-:1]", + "[Ov2+0:1]", + ], # To handle O- bonded to two atoms. Should not be Negative. + [ + "[#7v3+1:1]", + "[#7v3+0:1]", + ], # To handle N+ bonded to three atoms. Should not be positive. + [ + "[#7v2-1:1]", + "[#7+0:1]-[H]", + ], # To handle N- Bonded to two atoms. Add hydrogen. # ['[N:1]=[N+0:2]=[N:3]-[H]', '[N:1]=[N+1:2]=[N+0:3]-[H]'], # To # handle bad azide. Must be protonated. (Now handled elsewhere, before # SMILES converted to Mol object.) - ['[H]-[N:1]-[N:2]#[N:3]', '[N:1]=[N+1:2]=[N:3]-[H]'] # To handle bad azide. R-N-N#N should be R-N=[N+]=N + [ + "[H]-[N:1]-[N:2]#[N:3]", + "[N:1]=[N+1:2]=[N:3]-[H]", + ], # To handle bad azide. R-N-N#N should be R-N=[N+]=N ] # Add substructures and reactions (initially none) @@ -241,10 +298,15 @@ current_rxn_str = None for i, rxn_datum in enumerate(rxn_data): - reactant_smarts, product_smarts, substruct_match_mol, rxn_placeholder = rxn_datum + ( + reactant_smarts, + product_smarts, + substruct_match_mol, + rxn_placeholder, + ) = rxn_datum if mol.HasSubstructMatch(substruct_match_mol): if rxn_placeholder is None: - current_rxn_str = reactant_smarts + '>>' + product_smarts + current_rxn_str = reactant_smarts + ">>" + product_smarts current_rxn = AllChem.ReactionFromSmarts(current_rxn_str) rxn_data[i][3] = current_rxn # Update the placeholder. else: @@ -262,10 +324,10 @@ # to resanitize them. Make sure aromatic rings are shown as such This # catches all RDKit Errors. without the catchError and sanitizeOps the # Chem.SanitizeMol can crash the program. - sanitize_string = Chem.SanitizeMol( + sanitize_string = Chem.SanitizeMol( mol, sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL, - catchErrors = True + catchErrors=True, ) return mol if sanitize_string.name == "SANITIZE_NONE" else None @@ -321,6 +383,7 @@ print(*args, file=sys.stderr, **kwargs) + class LoadSMIFile(object): """A generator class for loading in the SMILES strings from a file, one at a time.""" @@ -388,37 +451,43 @@ # into a canonical form. Filter if failed. mol = UtilFuncs.convert_smiles_str_to_mol(smiles_str) if mol is None: - UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line) + UtilFuncs.eprint( + "WARNING: Skipping poorly formed SMILES string: " + line + ) return self.next() # Handle nuetralizing the molecules. Filter if failed. mol = UtilFuncs.neutralize_mol(mol) if mol is None: - UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line) + UtilFuncs.eprint( + "WARNING: Skipping poorly formed SMILES string: " + line + ) return self.next() # Remove the hydrogens. try: mol = Chem.RemoveHs(mol) - except: - UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line) + except Exception: + UtilFuncs.eprint( + "WARNING: Skipping poorly formed SMILES string: " + line + ) return self.next() if mol is None: - UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line) + UtilFuncs.eprint( + "WARNING: Skipping poorly formed SMILES string: " + line + ) return self.next() # Regenerate the smiles string (to standardize). new_mol_string = Chem.MolToSmiles(mol, isomericSmiles=True) - return { - "smiles": new_mol_string, - "data": splits[1:] - } + return {"smiles": new_mol_string, "data": splits[1:]} else: # Blank line? Go to next one. return self.next() + class Protonate(object): """A generator class for protonating SMILES strings, one at a time.""" @@ -491,8 +560,8 @@ smi = smile_and_datum["smiles"] data = smile_and_datum["data"] # Everything on SMILES line but the - # SMILES string itself (e.g., the - # molecule name). + # SMILES string itself (e.g., the + # molecule name). # Collect the data associated with this smiles (e.g., the molecule # name). @@ -516,8 +585,8 @@ # Only add new smiles if not already in the list. # for s in new_smis_to_perhaps_add: - # if not s in new_smis: - # new_smis.append(s) + # if not s in new_smis: + # new_smis.append(s) # In some cases, the script might generate redundant molecules. # Phosphonates, when the pH is between the two pKa values and the @@ -532,7 +601,9 @@ # Sometimes Dimorphite-DL generates molecules that aren't actually # possible. Simply convert these to mol objects to eliminate the bad # ones (that are None). - new_smis = [s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None] + new_smis = [ + s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None + ] # If there are no smi left, return the input one at the very least. # All generated forms have apparently been judged @@ -543,7 +614,7 @@ # If the user wants to see the target states, add those # to the ends of each line. if self.args["label_states"]: - states = '\t'.join([x[1] for x in sites]) + states = "\t".join([x[1] for x in sites]) new_lines = [x + "\t" + tag + "\t" + states for x in new_smis] else: new_lines = [x + "\t" + tag for x in new_smis] @@ -552,12 +623,15 @@ return self.next() + class ProtSubstructFuncs: """A namespace to store functions for loading the substructures that can be protonated. To keep things organized.""" @staticmethod - def load_protonation_substructs_calc_state_for_ph(min_ph=6.4, max_ph=8.4, pka_std_range=1): + def load_protonation_substructs_calc_state_for_ph( + min_ph=6.4, max_ph=8.4, pka_std_range=1 + ): """A pre-calculated list of R-groups with protonation sites, with their likely pKa bins. @@ -573,7 +647,7 @@ pwd = os.path.dirname(os.path.realpath(__file__)) site_structures_file = "{}/{}".format(pwd, "site_substructures.smarts") - with open(site_structures_file, 'r') as substruct: + with open(site_structures_file, "r") as substruct: for line in substruct: line = line.strip() sub = {} @@ -584,7 +658,9 @@ sub["mol"] = Chem.MolFromSmarts(sub["smart"]) # NEED TO DIVIDE THIS BY 3s - pka_ranges = [splits[i:i+3] for i in range(2, len(splits)-1, 3)] + pka_ranges = [ + splits[i : i + 3] for i in range(2, len(splits) - 1, 3) + ] prot = [] for pka_range in pka_ranges: @@ -620,11 +696,11 @@ # This needs to be reassigned, and 'ERROR' should never make it past the # next set of checks. if min_pka <= max_ph and min_ph <= max_pka: - protonation_state = 'BOTH' + protonation_state = "BOTH" elif mean > max_ph: - protonation_state = 'PROTONATED' + protonation_state = "PROTONATED" else: - protonation_state = 'DEPROTONATED' + protonation_state = "DEPROTONATED" return protonation_state @@ -650,8 +726,8 @@ # Try to Add hydrogens. if failed return [] try: - mol = Chem.AddHs(mol) - except: + mol = Chem.AddHs(mol) + except Exception: UtilFuncs.eprint("ERROR: ", smi) return [] @@ -701,14 +777,14 @@ # Initialize the output list output_smis = [] - state_to_charge = {"DEPROTONATED": [-1], - "PROTONATED": [0], - "BOTH": [-1, 0]} + state_to_charge = {"DEPROTONATED": [-1], "PROTONATED": [0], "BOTH": [-1, 0]} charges = state_to_charge[target_prot_state] # Now make the actual smiles match the target protonation state. - output_smis = ProtSubstructFuncs.set_protonation_charge(smis, idx, charges, prot_site_name) + output_smis = ProtSubstructFuncs.set_protonation_charge( + smis, idx, charges, prot_site_name + ) return output_smis @@ -759,11 +835,12 @@ atom.SetFormalCharge(charge) # Convert back to SMILE and add to output - out_smile = Chem.MolToSmiles(mol, isomericSmiles=True,canonical=True) + out_smile = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True) output.append(out_smile) return output + class ProtectUnprotectFuncs: """A namespace for storing functions that are useful for protecting and unprotecting molecules. To keep things organized. We need to identify and @@ -779,7 +856,7 @@ """ for atom in mol.GetAtoms(): - atom.SetProp('_protected', '0') + atom.SetProp("_protected", "0") @staticmethod def protect_molecule(mol, match): @@ -793,7 +870,7 @@ for idx in match: atom = mol.GetAtomWithIdx(idx) - atom.SetProp('_protected', '1') + atom.SetProp("_protected", "1") @staticmethod def get_unprotected_matches(mol, substruct): @@ -829,6 +906,7 @@ return False return True + class TestFuncs: """A namespace for storing functions that perform tests on the code. To keep things organized.""" @@ -839,53 +917,158 @@ smis = [ # [input smiles, pka, protonated, deprotonated, category] - ["C#CCO", "C#CCO", "C#CC[O-]", "Alcohol"], - ["C(=O)N", "NC=O", "[NH-]C=O", "Amide"], - ["CC(=O)NOC(C)=O", "CC(=O)NOC(C)=O", "CC(=O)[N-]OC(C)=O", "Amide_electronegative"], - ["COC(=N)N", "COC(N)=[NH2+]", "COC(=N)N", "AmidineGuanidine2"], - ["Brc1ccc(C2NCCS2)cc1", "Brc1ccc(C2[NH2+]CCS2)cc1", "Brc1ccc(C2NCCS2)cc1", "Amines_primary_secondary_tertiary"], - ["CC(=O)[n+]1ccc(N)cc1", "CC(=O)[n+]1ccc([NH3+])cc1", "CC(=O)[n+]1ccc(N)cc1", "Anilines_primary"], - ["CCNc1ccccc1", "CC[NH2+]c1ccccc1", "CCNc1ccccc1", "Anilines_secondary"], - ["Cc1ccccc1N(C)C", "Cc1ccccc1[NH+](C)C", "Cc1ccccc1N(C)C", "Anilines_tertiary"], - ["BrC1=CC2=C(C=C1)NC=C2", "Brc1ccc2[nH]ccc2c1", "Brc1ccc2[n-]ccc2c1", "Indole_pyrrole"], - ["O=c1cc[nH]cc1", "O=c1cc[nH]cc1", "O=c1cc[n-]cc1", "Aromatic_nitrogen_protonated"], - ["C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", "Azide"], - ["BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr", "O=C([O-])C(Br)CBr", "Carboxyl"], - ["NC(NN=O)=N", "NC(=[NH2+])NN=O", "N=C(N)NN=O", "AmidineGuanidine1"], - ["C(F)(F)(F)C(=O)NC(=O)C", "CC(=O)NC(=O)C(F)(F)F", "CC(=O)[N-]C(=O)C(F)(F)F", "Imide"], - ["O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", "Imide2"], - ["CC(C)(C)C(N(C)O)=O", "CN(O)C(=O)C(C)(C)C", "CN([O-])C(=O)C(C)(C)C", "N-hydroxyamide"], - ["C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", "Nitro"], - ["O=C1C=C(O)CC1", "O=C1C=C(O)CC1", "O=C1C=C([O-])CC1", "O=C-C=C-OH"], - ["C1CC1OO", "OOC1CC1", "[O-]OC1CC1", "Peroxide2"], - ["C(=O)OO", "O=COO", "O=CO[O-]", "Peroxide1"], - ["Brc1cc(O)cc(Br)c1", "Oc1cc(Br)cc(Br)c1", "[O-]c1cc(Br)cc(Br)c1", "Phenol"], - ["CC(=O)c1ccc(S)cc1", "CC(=O)c1ccc(S)cc1", "CC(=O)c1ccc([S-])cc1", "Phenyl_Thiol"], - ["C=CCOc1ccc(C(=O)O)cc1", "C=CCOc1ccc(C(=O)O)cc1", "C=CCOc1ccc(C(=O)[O-])cc1", "Phenyl_carboxyl"], - ["COP(=O)(O)OC", "COP(=O)(O)OC", "COP(=O)([O-])OC", "Phosphate_diester"], - ["CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", "Phosphinic_acid"], - ["CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)O", "CC(C)OP(C)(=O)[O-]", "Phosphonate_ester"], - ["CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)NC1=O", "CC1(C)OC(=O)[N-]C1=O", "Ringed_imide1"], - ["O=C(N1)C=CC1=O", "O=C1C=CC(=O)N1", "O=C1C=CC(=O)[N-]1", "Ringed_imide2"], - ["O=S(OC)(O)=O", "COS(=O)(=O)O", "COS(=O)(=O)[O-]", "Sulfate"], - ["COc1ccc(S(=O)O)cc1", "COc1ccc(S(=O)O)cc1", "COc1ccc(S(=O)[O-])cc1", "Sulfinic_acid"], - ["CS(N)(=O)=O", "CS(N)(=O)=O", "CS([NH-])(=O)=O", "Sulfonamide"], - ["CC(=O)CSCCS(O)(=O)=O", "CC(=O)CSCCS(=O)(=O)O", "CC(=O)CSCCS(=O)(=O)[O-]", "Sulfonate"], - ["CC(=O)S", "CC(=O)S", "CC(=O)[S-]", "Thioic_acid"], - ["C(C)(C)(C)(S)", "CC(C)(C)S", "CC(C)(C)[S-]", "Thiol"], - ["Brc1cc[nH+]cc1", "Brc1cc[nH+]cc1", "Brc1ccncc1", "Aromatic_nitrogen_unprotonated"], - ["C=C(O)c1c(C)cc(C)cc1C", "C=C(O)c1c(C)cc(C)cc1C", "C=C([O-])c1c(C)cc(C)cc1C", "Vinyl_alcohol"], - ["CC(=O)ON", "CC(=O)O[NH3+]", "CC(=O)ON", "Primary_hydroxyl_amine"] + ["C#CCO", "C#CCO", "C#CC[O-]", "Alcohol"], + ["C(=O)N", "NC=O", "[NH-]C=O", "Amide"], + [ + "CC(=O)NOC(C)=O", + "CC(=O)NOC(C)=O", + "CC(=O)[N-]OC(C)=O", + "Amide_electronegative", + ], + ["COC(=N)N", "COC(N)=[NH2+]", "COC(=N)N", "AmidineGuanidine2"], + [ + "Brc1ccc(C2NCCS2)cc1", + "Brc1ccc(C2[NH2+]CCS2)cc1", + "Brc1ccc(C2NCCS2)cc1", + "Amines_primary_secondary_tertiary", + ], + [ + "CC(=O)[n+]1ccc(N)cc1", + "CC(=O)[n+]1ccc([NH3+])cc1", + "CC(=O)[n+]1ccc(N)cc1", + "Anilines_primary", + ], + ["CCNc1ccccc1", "CC[NH2+]c1ccccc1", "CCNc1ccccc1", "Anilines_secondary"], + [ + "Cc1ccccc1N(C)C", + "Cc1ccccc1[NH+](C)C", + "Cc1ccccc1N(C)C", + "Anilines_tertiary", + ], + [ + "BrC1=CC2=C(C=C1)NC=C2", + "Brc1ccc2[nH]ccc2c1", + "Brc1ccc2[n-]ccc2c1", + "Indole_pyrrole", + ], + [ + "O=c1cc[nH]cc1", + "O=c1cc[nH]cc1", + "O=c1cc[n-]cc1", + "Aromatic_nitrogen_protonated", + ], + ["C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", "Azide"], + ["BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr", "O=C([O-])C(Br)CBr", "Carboxyl"], + ["NC(NN=O)=N", "NC(=[NH2+])NN=O", "N=C(N)NN=O", "AmidineGuanidine1"], + [ + "C(F)(F)(F)C(=O)NC(=O)C", + "CC(=O)NC(=O)C(F)(F)F", + "CC(=O)[N-]C(=O)C(F)(F)F", + "Imide", + ], + ["O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", "Imide2"], + [ + "CC(C)(C)C(N(C)O)=O", + "CN(O)C(=O)C(C)(C)C", + "CN([O-])C(=O)C(C)(C)C", + "N-hydroxyamide", + ], + ["C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", "Nitro"], + ["O=C1C=C(O)CC1", "O=C1C=C(O)CC1", "O=C1C=C([O-])CC1", "O=C-C=C-OH"], + ["C1CC1OO", "OOC1CC1", "[O-]OC1CC1", "Peroxide2"], + ["C(=O)OO", "O=COO", "O=CO[O-]", "Peroxide1"], + [ + "Brc1cc(O)cc(Br)c1", + "Oc1cc(Br)cc(Br)c1", + "[O-]c1cc(Br)cc(Br)c1", + "Phenol", + ], + [ + "CC(=O)c1ccc(S)cc1", + "CC(=O)c1ccc(S)cc1", + "CC(=O)c1ccc([S-])cc1", + "Phenyl_Thiol", + ], + [ + "C=CCOc1ccc(C(=O)O)cc1", + "C=CCOc1ccc(C(=O)O)cc1", + "C=CCOc1ccc(C(=O)[O-])cc1", + "Phenyl_carboxyl", + ], + ["COP(=O)(O)OC", "COP(=O)(O)OC", "COP(=O)([O-])OC", "Phosphate_diester"], + ["CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", "Phosphinic_acid"], + [ + "CC(C)OP(C)(=O)O", + "CC(C)OP(C)(=O)O", + "CC(C)OP(C)(=O)[O-]", + "Phosphonate_ester", + ], + [ + "CC1(C)OC(=O)NC1=O", + "CC1(C)OC(=O)NC1=O", + "CC1(C)OC(=O)[N-]C1=O", + "Ringed_imide1", + ], + ["O=C(N1)C=CC1=O", "O=C1C=CC(=O)N1", "O=C1C=CC(=O)[N-]1", "Ringed_imide2"], + ["O=S(OC)(O)=O", "COS(=O)(=O)O", "COS(=O)(=O)[O-]", "Sulfate"], + [ + "COc1ccc(S(=O)O)cc1", + "COc1ccc(S(=O)O)cc1", + "COc1ccc(S(=O)[O-])cc1", + "Sulfinic_acid", + ], + ["CS(N)(=O)=O", "CS(N)(=O)=O", "CS([NH-])(=O)=O", "Sulfonamide"], + [ + "CC(=O)CSCCS(O)(=O)=O", + "CC(=O)CSCCS(=O)(=O)O", + "CC(=O)CSCCS(=O)(=O)[O-]", + "Sulfonate", + ], + ["CC(=O)S", "CC(=O)S", "CC(=O)[S-]", "Thioic_acid"], + ["C(C)(C)(C)(S)", "CC(C)(C)S", "CC(C)(C)[S-]", "Thiol"], + [ + "Brc1cc[nH+]cc1", + "Brc1cc[nH+]cc1", + "Brc1ccncc1", + "Aromatic_nitrogen_unprotonated", + ], + [ + "C=C(O)c1c(C)cc(C)cc1C", + "C=C(O)c1c(C)cc(C)cc1C", + "C=C([O-])c1c(C)cc(C)cc1C", + "Vinyl_alcohol", + ], + ["CC(=O)ON", "CC(=O)O[NH3+]", "CC(=O)ON", "Primary_hydroxyl_amine"], ] smis_phos = [ - ["O=P(O)(O)OCCCC", "CCCCOP(=O)(O)O", "CCCCOP(=O)([O-])O", "CCCCOP(=O)([O-])[O-]", "Phosphate"], - ["CC(P(O)(O)=O)C", "CC(C)P(=O)(O)O", "CC(C)P(=O)([O-])O", "CC(C)P(=O)([O-])[O-]", "Phosphonate"] + [ + "O=P(O)(O)OCCCC", + "CCCCOP(=O)(O)O", + "CCCCOP(=O)([O-])O", + "CCCCOP(=O)([O-])[O-]", + "Phosphate", + ], + [ + "CC(P(O)(O)=O)C", + "CC(C)P(=O)(O)O", + "CC(C)P(=O)([O-])O", + "CC(C)P(=O)([O-])[O-]", + "Phosphonate", + ], ] # Load the average pKa values. - average_pkas = {l.split()[0].replace("*", ""):float(l.split()[3]) for l in open("site_substructures.smarts") if l.split()[0] not in ["Phosphate", "Phosphonate"]} - average_pkas_phos = {l.split()[0].replace("*", ""):[float(l.split()[3]), float(l.split()[6])] for l in open("site_substructures.smarts") if l.split()[0] in ["Phosphate", "Phosphonate"]} + average_pkas = { + l.split()[0].replace("*", ""): float(l.split()[3]) + for l in open("site_substructures.smarts") + if l.split()[0] not in ["Phosphate", "Phosphonate"] + } + average_pkas_phos = { + l.split()[0].replace("*", ""): [float(l.split()[3]), float(l.split()[6])] + for l in open("site_substructures.smarts") + if l.split()[0] in ["Phosphate", "Phosphonate"] + } print("Running Tests") print("=============") @@ -900,7 +1083,7 @@ "max_ph": -10000000, "pka_precision": 0.5, "smiles": "", - "label_states": True + "label_states": True, } for smi, protonated, deprotonated, category in smis: @@ -954,14 +1137,20 @@ args["min_ph"] = avg_pka args["max_ph"] = avg_pka - TestFuncs.test_check(args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"]) + TestFuncs.test_check( + args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"] + ) - avg_pka = 0.5 * (average_pkas_phos[category][0] + average_pkas_phos[category][1]) + avg_pka = 0.5 * ( + average_pkas_phos[category][0] + average_pkas_phos[category][1] + ) args["min_ph"] = avg_pka args["max_ph"] = avg_pka args["pka_precision"] = 5 # Should give all three - TestFuncs.test_check(args, [mix, deprotonated, protonated], ["BOTH", "BOTH"]) + TestFuncs.test_check( + args, [mix, deprotonated, protonated], ["BOTH", "BOTH"] + ) @staticmethod def test_check(args, expected_output, labels): @@ -981,28 +1170,56 @@ num_states = len(expected_output) - if (len(output) != num_states): - msg = args["smiles"] + " should have " + str(num_states) + \ - " states at at pH " + str(args["min_ph"]) + ": " + str(output) + if len(output) != num_states: + msg = ( + args["smiles"] + + " should have " + + str(num_states) + + " states at at pH " + + str(args["min_ph"]) + + ": " + + str(output) + ) print(msg) raise Exception(msg) - if (len(set([l[0] for l in output]) - set(expected_output)) != 0): - msg = args["smiles"] + " is not " + " AND ".join(expected_output) + \ - " at pH " + str(args["min_ph"]) + " - " + str(args["max_ph"]) + \ - "; it is " + " AND ".join([l[0] for l in output]) + if len(set([l[0] for l in output]) - set(expected_output)) != 0: + msg = ( + args["smiles"] + + " is not " + + " AND ".join(expected_output) + + " at pH " + + str(args["min_ph"]) + + " - " + + str(args["max_ph"]) + + "; it is " + + " AND ".join([l[0] for l in output]) + ) print(msg) raise Exception(msg) - if (len(set([l[1] for l in output]) - set(labels)) != 0): - msg = args["smiles"] + " not labeled as " + " AND ".join(labels) + \ - "; it is " + " AND ".join([l[1] for l in output]) + if len(set([l[1] for l in output]) - set(labels)) != 0: + msg = ( + args["smiles"] + + " not labeled as " + + " AND ".join(labels) + + "; it is " + + " AND ".join([l[1] for l in output]) + ) print(msg) raise Exception(msg) ph_range = sorted(list(set([args["min_ph"], args["max_ph"]]))) ph_range_str = "(" + " - ".join("{0:.2f}".format(n) for n in ph_range) + ")" - print("(CORRECT) " + ph_range_str.ljust(10) + " " + args["smiles"] + " => " + " AND ".join([l[0] for l in output])) + print( + "(CORRECT) " + + ph_range_str.ljust(10) + + " " + + args["smiles"] + + " => " + + " AND ".join([l[0] for l in output]) + ) + def run(**kwargs): """A helpful, importable function for those who want to call Dimorphite-DL @@ -1019,6 +1236,7 @@ # Run the main function with the specified arguments. main(kwargs) + def run_with_mol_list(mol_lst, **kwargs): """A helpful, importable function for those who want to call Dimorphite-DL from another Python script rather than the command line. Note that this @@ -1037,10 +1255,13 @@ # Do a quick check to make sure the user input makes sense. for bad_arg in ["smiles", "smiles_file", "output_file", "test"]: if bad_arg in kwargs: - msg = "You're using Dimorphite-DL's run_with_mol_list(mol_lst, " + \ - "**kwargs) function, but you also passed the \"" + \ - bad_arg + "\" argument. Did you mean to use the " + \ - "run(**kwargs) function instead?" + msg = ( + "You're using Dimorphite-DL's run_with_mol_list(mol_lst, " + + '**kwargs) function, but you also passed the "' + + bad_arg + + '" argument. Did you mean to use the ' + + "run(**kwargs) function instead?" + ) print(msg) raise Exception(msg) @@ -1076,9 +1297,15 @@ m.SetProp(prop, str(val)) mols.append(m) else: - UtilFuncs.eprint("WARNING: Could not process molecule with SMILES string " + s + " and properties " + str(props)) + UtilFuncs.eprint( + "WARNING: Could not process molecule with SMILES string " + + s + + " and properties " + + str(props) + ) return mols + if __name__ == "__main__": main()
