Mercurial > repos > recetox > matchms
comparison matchms_similarity_wrapper.py @ 10:c3dd958cc4a5 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 9bc547872c98a9c13c561d15e8990fe82bdc0e72"
author | recetox |
---|---|
date | Fri, 28 Jan 2022 16:22:06 +0000 |
parents | |
children | ba9410f612bc |
comparison
equal
deleted
inserted
replaced
9:f06923bdd2f2 | 10:c3dd958cc4a5 |
---|---|
1 import argparse | |
2 import sys | |
3 | |
4 from matchms import calculate_scores | |
5 from matchms.importing import load_from_mgf, load_from_msp | |
6 from matchms.similarity import ( | |
7 CosineGreedy, | |
8 CosineHungarian, | |
9 ModifiedCosine, | |
10 ) | |
11 from pandas import DataFrame | |
12 | |
13 | |
def convert_precursor_mz(spectrum):
    """
    Ensure the spectrum carries a numeric precursor m/z, as required by the ModifiedCosine similarity metric.

    The "precursor_mz" metadata entry is cast to float and the metadata is assigned back onto the spectrum,
    which is then returned. A ValueError is raised when the entry is absent.
    """
    # Guard clause: without a precursor m/z there is nothing to convert.
    if "precursor_mz" not in spectrum.metadata:
        raise ValueError("Precursor_mz missing. Apply 'add_precursor_mz' filter first.")

    # Read the metadata, update it, and assign it back through the attribute
    # rather than relying on in-place mutation of whatever the getter returned.
    updated = spectrum.metadata
    updated["precursor_mz"] = float(updated["precursor_mz"])
    spectrum.metadata = updated
    return spectrum
27 | |
28 | |
def _load_spectra(filename, file_format, description):
    """
    Load all spectra from `filename` with the reader matching `file_format` ('msp' or 'mgf').

    `description` names the input in the ValueError raised for any other format.
    """
    if file_format == 'msp':
        return list(load_from_msp(filename))
    if file_format == 'mgf':
        return list(load_from_mgf(filename))
    raise ValueError(f'File format {file_format} not supported for {description}.')


def main(argv):
    """
    Compute similarity scores between query and reference spectra and write them to .tsv files.

    Parameters:
        argv: list of command line arguments (without the program name) to parse.

    Returns:
        0 on success, -1 if the requested similarity metric is not recognised.

    Raises:
        ValueError: if a spectra file format is neither 'msp' nor 'mgf', or (for ModifiedCosine)
            if a spectrum has no precursor m/z.
    """
    parser = argparse.ArgumentParser(description="Compute MSP similarity scores")
    parser.add_argument("-s", dest="symmetric", action='store_true', help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str, help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str, help="Reference spectra library file format.")
    parser.add_argument("queries_filename", type=str, help="Path to query spectra.")
    parser.add_argument("queries_format", type=str, help="Query spectra file format.")
    parser.add_argument("similarity_metric", type=str, help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float, help="Tolerance to use for peak matching.")
    parser.add_argument("mz_power", type=float, help="The power to raise mz to in the cosine function.")
    parser.add_argument("intensity_power", type=float, help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str, help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches", type=str, help="Path where to store the output .tsv matches.")
    # Parse the arguments handed to this function instead of implicitly re-reading sys.argv.
    args = parser.parse_args(argv)

    queries_spectra = _load_spectra(args.queries_filename, args.queries_format, 'query spectra')

    if args.symmetric:
        # In symmetric mode the queries are compared against themselves; no library is read.
        reference_spectra = []
    else:
        reference_spectra = _load_spectra(
            args.references_filename, args.references_format, 'reference spectra library'
        )

    metric_classes = {
        'CosineGreedy': CosineGreedy,
        'CosineHungarian': CosineHungarian,
        'ModifiedCosine': ModifiedCosine,
    }
    metric_class = metric_classes.get(args.similarity_metric)
    if metric_class is None:
        # Keep the -1 return code, but tell the user why nothing was computed.
        print(f"Unsupported similarity metric: {args.similarity_metric}", file=sys.stderr)
        return -1

    similarity_metric = metric_class(args.tolerance, args.mz_power, args.intensity_power)
    if args.similarity_metric == 'ModifiedCosine':
        # ModifiedCosine needs a float precursor m/z on every spectrum.
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))

    print("Calculating scores...")
    scores = calculate_scores(
        references=queries_spectra if args.symmetric else reference_spectra,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric
    )

    write_outputs(args, scores)
    return 0
82 | |
83 | |
def write_outputs(args, scores):
    """
    Store the computed scores and match counts as two tab-separated tables.

    Rows are labelled with the 'name' metadata of `scores.references` and columns with the 'name' metadata
    of `scores.queries`. The "score" field of `scores.scores` is written to `args.output_filename_scores`
    and the "matches" field to `args.output_filename_matches`.
    """
    print("Storing outputs...")
    column_labels = [spectrum.metadata['name'] for spectrum in scores.queries]
    row_labels = [spectrum.metadata['name'] for spectrum in scores.references]

    # Both tables share the same layout; only the extracted field and the destination differ.
    targets = (
        ("score", args.output_filename_scores),
        ("matches", args.output_filename_matches),
    )
    for field, destination in targets:
        rows = [entry[field] for entry in scores.scores]
        table = DataFrame(data=rows, index=row_labels, columns=column_labels)
        table.to_csv(destination, sep='\t')
96 | |
97 | |
if __name__ == "__main__":
    # Propagate main's return code as the process exit status so that a
    # failure (e.g. the -1 returned for an unknown metric) is visible to the caller.
    sys.exit(main(argv=sys.argv[1:]))