Mercurial > repos > recetox > biotransformer
diff wrapper_biotransformer.py @ 4:77f693bb14ac draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/biotransformer commit 9a2276670c6ebf147ccd2cdd1cc54b306af3d20c"
author | recetox |
---|---|
date | Mon, 11 Apr 2022 10:09:39 +0000 |
parents | 6080aee7c4f6 |
children | c0fe7ad30ade |
line wrap: on
line diff
--- a/wrapper_biotransformer.py Wed Jan 13 11:17:53 2021 +0000
+++ b/wrapper_biotransformer.py Mon Apr 11 10:09:39 2022 +0000
@@ -8,8 +8,8 @@
 openbabel.obErrorLog.StopLogging()
 
 
-# function for translating inchi to smiles
 def InchiToSmiles(df):
+    '''Translate inchi to smiles'''
     sm = []
     for item in df['InChI']:
         tmp = pybel.readstring("inchi", item)
@@ -18,80 +18,58 @@
 
 
 executable = ["biotransformer"]
-# executable_r = ["Rscript", "inchi_to_smiles.r"]
 
 argv = sys.argv[1:]
-if "-icsv" in argv:
-    icsv = argv.pop(argv.index("-icsv") + 1)
-    argv.remove("-icsv")
+icsv = argv.pop(argv.index("-icsv") + 1)
+argv.remove("-icsv")
+ocsv = argv.pop(argv.index("-ocsv") + 1)
+argv.remove("-ocsv")
+ocsv_dup = argv.pop(argv.index("-ocsvDup") + 1)
+argv.remove("-ocsvDup")
+ocsv_dup2 = argv.pop(argv.index("-ocsvDup2") + 1)
+argv.remove("-ocsvDup2")
 
-    if "-ocsv" not in argv:
-        sys.stderr.write("excpected -ocsv parameter\n")
-        sys.exit(1)
-    ocsv = argv.pop(argv.index("-ocsv") + 1)
-    argv.remove("-ocsv")
-    ocsv_dup = argv.pop(argv.index("-ocsvDup") + 1)
-    argv.remove("-ocsvDup")
-    ocsv_dup2 = argv.pop(argv.index("-ocsvDup2") + 1)
-    argv.remove("-ocsvDup2")
+in_df = pandas.read_csv(icsv, header=None)
+out_df1 = pandas.DataFrame() # all results
+out_df2 = pandas.DataFrame() # filtered results based on 6 columns
+out_df3 = pandas.DataFrame() # filtered results based on 3 columns
 
-    in_df = pandas.read_csv(icsv, header=None)
-    out_df1 = pandas.DataFrame() # all results
-    out_df2 = pandas.DataFrame() # filtered results based on 6 columns
-    out_df3 = pandas.DataFrame() # filtered results based on 3 columns
-
-    tmp2 = pandas.DataFrame()
-    tmp3 = pandas.DataFrame()
+smList1 = [] # list with smiles string
+smList2 = []
+smList3 = []
+for _, (smiles,) in in_df.iterrows():
+    with tempfile.NamedTemporaryFile() as out:
+        print("Working on compound: " + smiles)
+        if not re.search(r'\.', smiles):
+            subprocess.run(executable + argv + ["-ismi", smiles] + ["-ocsv", out.name])
+            try:
+                bio_out = pandas.read_csv(out.name)
+                tmp2 = bio_out.drop_duplicates(subset=["InChI", "InChIKey", "Synonyms", "Molecular formula", "Major Isotope Mass", "ALogP"])
+                tmp3 = bio_out.drop_duplicates(subset=["Molecular formula", "Major Isotope Mass", "ALogP"])
 
-    smList1 = [] # list with smiles string
-    smList2 = []
-    smList3 = []
-    for _, (smiles,) in in_df.iterrows():
-        with tempfile.NamedTemporaryFile() as out:
-            print("Working on compound: " + smiles)
-            if not re.search(r'\.', smiles):
-                subprocess.run(executable + argv + ["-ismi", smiles] + ["-ocsv", out.name])
-                try:
-                    tmp2 = pandas.read_csv(out.name)
-                    tmp3 = pandas.read_csv(out.name)
-                    tmp2.drop_duplicates(inplace=True, subset=["InChI", "InChIKey", "Synonyms", "Molecular formula", "Major Isotope Mass", "ALogP"])
-                    tmp3.drop_duplicates(inplace=True, subset=["Molecular formula", "Major Isotope Mass", "ALogP"])
-                    smList2.append([smiles] * tmp2.shape[0])
-                    smList3.append([smiles] * tmp3.shape[0])
-                    out_df1 = pandas.concat([out_df1, pandas.read_csv(out.name)])
-                    out_df2 = pandas.concat([out_df2, tmp2])
-                    out_df3 = pandas.concat([out_df3, tmp3])
-                    smList1.append([smiles] * pandas.read_csv(out.name).shape[0])
-                except pandas.errors.EmptyDataError:
-                    continue
-            else:
-                print("ERROR: Input compound cannot be a mixture.")
-    smList1 = sum(smList1, []) # merge sublists into one list
-    smList2 = sum(smList2, [])
-    smList3 = sum(smList3, [])
+                smList1.append([smiles] * bio_out.shape[0])
+                smList2.append([smiles] * tmp2.shape[0])
+                smList3.append([smiles] * tmp3.shape[0])
 
-    out_df1.insert(0, "SMILES query", smList1)
-    out_df1.drop_duplicates(inplace=True)
-    out_df1.insert(1, "SMILES target", InchiToSmiles(out_df1))
-    out_df1.to_csv(ocsv)
+                out_df1 = pandas.concat([out_df1, bio_out])
+                out_df2 = pandas.concat([out_df2, tmp2])
+                out_df3 = pandas.concat([out_df3, tmp3])
+            except pandas.errors.EmptyDataError:
+                continue
+        else:
+            print("ERROR: Input compound cannot be a mixture.")
+smList1 = sum(smList1, []) # merge sublists into one list
+smList2 = sum(smList2, [])
+smList3 = sum(smList3, [])
 
-    out_df2.insert(0, "SMILES query", smList2)
-    out_df3.insert(0, "SMILES query", smList3)
-    out_df2.drop_duplicates(inplace=True)
-    out_df3.drop_duplicates(inplace=True)
-    out_df2.insert(1, "SMILES target", InchiToSmiles(out_df2))
-    out_df3.insert(1, "SMILES target", InchiToSmiles(out_df3))
-    # out_df.drop_duplicates(inplace=True, subset=["InChI", "InChIKey", "Synonyms", "Molecular formula", "Major Isotope Mass", "ALogP"])
-    out_df2.to_csv(ocsv_dup)
-    out_df3.to_csv(ocsv_dup2)
-else:
-    # code = subprocess.run(executable + argv).returncode
-    # sys.exit(code)
-    subprocess.run(executable + argv)
-    smile = argv.pop(argv.index("-ismi") + 1)
-    tmp = pandas.DataFrame()
-    out = argv.pop(argv.index("-ocsv") + 1)
-    tmp = pandas.read_csv(out) # reads created output file
-    tmp.insert(0, "SMILES query", smile) # add SMILES string for query
-    tmp.insert(1, "SMILES target", InchiToSmiles(tmp)) # add SMILES string for target
-    tmp.to_csv(out)
+out_df1.insert(0, "SMILES query", smList1)
+out_df1.insert(1, "SMILES target", InchiToSmiles(out_df1))
+out_df1.to_csv(ocsv, sep ='\t')
+
+out_df2.insert(0, "SMILES query", smList2)
+out_df2.insert(1, "SMILES target", InchiToSmiles(out_df2))
+out_df2.to_csv(ocsv_dup, sep ='\t')
+
+out_df3.insert(0, "SMILES query", smList3)
+out_df3.insert(1, "SMILES target", InchiToSmiles(out_df3))
+out_df3.to_csv(ocsv_dup2, sep ='\t')