Mercurial > repos > recetox > matchms
comparison matchms_similarity_wrapper.py @ 10:c3dd958cc4a5 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 9bc547872c98a9c13c561d15e8990fe82bdc0e72"
| author | recetox |
|---|---|
| date | Fri, 28 Jan 2022 16:22:06 +0000 |
| parents | |
| children | ba9410f612bc |
comparison
equal
deleted
inserted
replaced
| 9:f06923bdd2f2 | 10:c3dd958cc4a5 |
|---|---|
| 1 import argparse | |
| 2 import sys | |
| 3 | |
| 4 from matchms import calculate_scores | |
| 5 from matchms.importing import load_from_mgf, load_from_msp | |
| 6 from matchms.similarity import ( | |
| 7 CosineGreedy, | |
| 8 CosineHungarian, | |
| 9 ModifiedCosine, | |
| 10 ) | |
| 11 from pandas import DataFrame | |
| 12 | |
| 13 | |
def convert_precursor_mz(spectrum):
    """
    Ensure the spectrum carries a float-valued ``precursor_mz`` metadata entry.

    The ModifiedCosine similarity metric needs the precursor m/z, so the value is
    cast to ``float`` and written back to the spectrum's metadata.

    :param spectrum: object exposing a readable and assignable ``metadata`` mapping.
    :return: the same spectrum object, with ``precursor_mz`` stored as a float.
    :raises ValueError: if the metadata holds no ``precursor_mz`` key.
    """
    # Guard clause: bail out early when the required key is absent.
    if "precursor_mz" not in spectrum.metadata:
        raise ValueError("Precursor_mz missing. Apply 'add_precursor_mz' filter first.")

    # Fetch, patch and re-assign the mapping so the change goes through the
    # ``metadata`` attribute assignment rather than relying on in-place mutation.
    updated = spectrum.metadata
    updated["precursor_mz"] = float(updated["precursor_mz"])
    spectrum.metadata = updated
    return spectrum
| 27 | |
| 28 | |
def _load_spectra(filename, file_format, description):
    """Load all spectra from ``filename`` with the reader matching ``file_format``.

    :param filename: path of the spectra file to read.
    :param file_format: either ``'msp'`` or ``'mgf'``.
    :param description: human-readable role of the file; used only in the error message.
    :return: list of the spectra yielded by the reader.
    :raises ValueError: if ``file_format`` is neither ``'msp'`` nor ``'mgf'``.
    """
    if file_format == 'msp':
        return list(load_from_msp(filename))
    if file_format == 'mgf':
        return list(load_from_mgf(filename))
    raise ValueError(f'File format {file_format} not supported for {description}.')


def main(argv):
    """
    Parse the command line, compute similarity scores and write them to disk.

    :param argv: list of command-line arguments (without the program name). Passing
        ``None`` makes argparse fall back to ``sys.argv[1:]``.
    :return: ``0`` after the outputs were written, ``-1`` if the requested similarity
        metric is not one of CosineGreedy, CosineHungarian or ModifiedCosine.
    :raises ValueError: if a spectra file format is unsupported, or (for
        ModifiedCosine) a spectrum lacks ``precursor_mz``.
    """
    parser = argparse.ArgumentParser(description="Compute MSP similarity scores")
    parser.add_argument("-s", dest="symmetric", action='store_true', help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str, help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str, help="Reference spectra library file format.")
    parser.add_argument("queries_filename", type=str, help="Path to query spectra.")
    parser.add_argument("queries_format", type=str, help="Query spectra file format.")
    parser.add_argument("similarity_metric", type=str, help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float, help="Tolerance to use for peak matching.")
    parser.add_argument("mz_power", type=float, help="The power to raise mz to in the cosine function.")
    parser.add_argument("intensity_power", type=float, help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str, help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches", type=str, help="Path where to store the output .tsv matches.")
    # Parse the arguments handed to this function instead of implicitly reading sys.argv,
    # so that the ``argv`` parameter is actually honoured.
    args = parser.parse_args(argv)

    queries_spectra = _load_spectra(args.queries_filename, args.queries_format, 'query spectra')

    if args.symmetric:
        # In symmetric mode the queries are compared against themselves (see the
        # ``references=`` argument below), so no reference library is loaded.
        reference_spectra = []
    else:
        reference_spectra = _load_spectra(
            args.references_filename, args.references_format, 'reference spectra library'
        )

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power, args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power, args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power, args.intensity_power)
        # ModifiedCosine needs a float precursor m/z on every spectrum.
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))
    else:
        # Keep the -1 return code for callers, but say why nothing was computed.
        print(f"Similarity metric {args.similarity_metric} not supported.", file=sys.stderr)
        return -1

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric
    )

    write_outputs(args, scores)
    return 0
| 82 | |
| 83 | |
def write_outputs(args, scores):
    """
    Store the score and match-count matrices as tab-separated tables.

    Rows are labelled with the ``name`` metadata of the reference spectra and
    columns with the ``name`` metadata of the query spectra.

    :param args: namespace providing ``output_filename_scores`` and
        ``output_filename_matches``.
    :param scores: object exposing ``queries``, ``references`` and ``scores``; every
        entry of ``scores.scores`` is indexed by ``"score"`` and ``"matches"``.
    """
    print("Storing outputs...")
    column_labels = [item.metadata['name'] for item in scores.queries]
    row_labels = [item.metadata['name'] for item in scores.references]

    def _dump(field, destination):
        # Build a reference-by-query table for one field and write it as TSV.
        rows = [record[field] for record in scores.scores]
        table = DataFrame(data=rows, index=row_labels, columns=column_labels)
        table.to_csv(destination, sep='\t')

    # Similarity scores first, then the number of matched peaks.
    _dump("score", args.output_filename_scores)
    _dump("matches", args.output_filename_matches)
| 96 | |
| 97 | |
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status so that a failure
    # (non-zero return) is visible to whatever launched the script.
    sys.exit(main(argv=sys.argv[1:]))
