comparison matchms_similarity_wrapper.py @ 10:c3dd958cc4a5 draft

"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 9bc547872c98a9c13c561d15e8990fe82bdc0e72"
author recetox
date Fri, 28 Jan 2022 16:22:06 +0000
parents
children ba9410f612bc
comparison
equal deleted inserted replaced
9:f06923bdd2f2 10:c3dd958cc4a5
1 import argparse
2 import sys
3
4 from matchms import calculate_scores
5 from matchms.importing import load_from_mgf, load_from_msp
6 from matchms.similarity import (
7 CosineGreedy,
8 CosineHungarian,
9 ModifiedCosine,
10 )
11 from pandas import DataFrame
12
13
def convert_precursor_mz(spectrum):
    """
    Ensure the spectrum carries a float-valued ``precursor_mz`` metadata entry.

    The ModifiedCosine similarity metric needs the precursor m/z, so the value is
    cast to ``float`` and written back to the spectrum, which is then returned.

    Raises:
        ValueError: if the spectrum metadata has no ``precursor_mz`` key.
    """
    # Guard clause: bail out early when the required key is absent.
    if "precursor_mz" not in spectrum.metadata:
        raise ValueError("Precursor_mz missing. Apply 'add_precursor_mz' filter first.")

    # Fetch the metadata, update it and assign it back through the attribute,
    # rather than mutating ``spectrum.metadata`` in place.
    updated = spectrum.metadata
    updated["precursor_mz"] = float(updated["precursor_mz"])
    spectrum.metadata = updated
    return spectrum
27
28
def _load_spectra(filename, file_format, description):
    """
    Load every spectrum from ``filename`` with the loader matching ``file_format``.

    Args:
        filename: Path to the spectra file.
        file_format: Either 'msp' or 'mgf'.
        description: Human-readable role of the file, used in the error message.

    Returns:
        A list of the spectra yielded by the loader.

    Raises:
        ValueError: if ``file_format`` is neither 'msp' nor 'mgf'.
    """
    if file_format == 'msp':
        return list(load_from_msp(filename))
    if file_format == 'mgf':
        return list(load_from_mgf(filename))
    raise ValueError(f'File format {file_format} not supported for {description}.')


def main(argv):
    """
    Compute similarity scores between query and reference spectra and store them.

    Args:
        argv: Command-line arguments without the program name. Passing ``None``
            makes argparse fall back to ``sys.argv[1:]``.

    Returns:
        0 on success, -1 if the requested similarity metric is not recognised.

    Raises:
        ValueError: if a spectra file format is unsupported, or (for
            ModifiedCosine) a spectrum lacks ``precursor_mz``.
    """
    parser = argparse.ArgumentParser(description="Compute MSP similarity scores")
    parser.add_argument("-s", dest="symmetric", action='store_true', help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str, help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str, help="Reference spectra library file format.")
    parser.add_argument("queries_filename", type=str, help="Path to query spectra.")
    parser.add_argument("queries_format", type=str, help="Query spectra file format.")
    parser.add_argument("similarity_metric", type=str, help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float, help="Tolerance to use for peak matching.")
    parser.add_argument("mz_power", type=float, help="The power to raise mz to in the cosine function.")
    parser.add_argument("intensity_power", type=float, help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str, help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches", type=str, help="Path where to store the output .tsv matches.")
    # Parse the arguments handed to this function instead of silently reading
    # sys.argv, so main() honours its ``argv`` parameter.
    args = parser.parse_args(argv)

    queries_spectra = _load_spectra(args.queries_filename, args.queries_format, 'query spectra')

    if args.symmetric:
        # Symmetric mode compares the queries against themselves; no library is read.
        reference_spectra = []
    else:
        reference_spectra = _load_spectra(
            args.references_filename, args.references_format, 'reference spectra library'
        )

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        # Keep the -1 return contract, but say why instead of failing silently.
        print(f"Similarity metric {args.similarity_metric} not supported.", file=sys.stderr)
        return -1

    similarity_metric = metric_class(args.tolerance, args.mz_power, args.intensity_power)
    if metric_class is ModifiedCosine:
        # ModifiedCosine needs a float precursor m/z on every spectrum.
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric
    )

    write_outputs(args, scores)
    return 0
82
83
def write_outputs(args, scores):
    """
    Store the similarity scores and the match counts as tab-separated tables.

    Both tables are labelled with the ``name`` metadata entry of each spectrum:
    rows follow ``scores.references`` and columns follow ``scores.queries``.
    The scores go to ``args.output_filename_scores`` and the match counts to
    ``args.output_filename_matches``.
    """
    print("Storing outputs...")
    column_labels = [query.metadata['name'] for query in scores.queries]
    row_labels = [reference.metadata['name'] for reference in scores.references]

    def dump(field, destination):
        # Pick one field out of every row of the score matrix and write it out.
        table = DataFrame(
            data=[row[field] for row in scores.scores],
            index=row_labels,
            columns=column_labels,
        )
        table.to_csv(destination, sep='\t')

    # Scores first, then number of matches.
    dump("score", args.output_filename_scores)
    dump("matches", args.output_filename_matches)
96
97
if __name__ == "__main__":
    # Propagate main()'s return code as the process exit status so that the
    # -1 failure path is visible to the caller instead of exiting with 0.
    sys.exit(main(argv=sys.argv[1:]))