matchms_formatter: formatter.py comparison

comparison formatter.py @ 10:5c0e5344edf3 draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 5661cf2406e0616d7b2f4bee1b57ec43716088de

author	recetox
date	Tue, 18 Oct 2022 10:59:57 +0000
parents	4ca9807c56e6
children	2f0545b02020

comparison

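Legend: equal (space prefix) · deleted (`-` prefix) · inserted (`+` prefix) · replaced (a `-` line followed by its `+` replacement)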

-:4ca9807c56e6
+:5c0e5344edf3
 import click
-from pandas import DataFrame, read_csv, to_numeric
+from matchms.importing import scores_from_json
+from pandas import DataFrame
 def create_long_table(data: DataFrame, value_id: str) -> DataFrame:
 """Convert the table from compact into long format.
 See DataFrame.melt(...).
 filtered = data[data['score'] > t_score]
 filtered = filtered[filtered['matches'] > t_matches]
 return filtered
-def load_data(scores_filename: str, matches_filename: str) -> DataFrame:
+def scores_to_dataframes(scores):
+"""Unpack scores from matchms.scores into two dataframes of scores and matches.
+Args:
+scores (matchms.scores): matchms.scores object.
+Returns:
+DataFrame: Scores
+DataFrame: Matches
+"""
+query_names = [spectra.metadata['compound_name'] for spectra in scores.queries]
+reference_names = [spectra.metadata['compound_name'] for spectra in scores.references]
+dataframe_scores = DataFrame(data=[entry["score"] for entry in scores.scores], index=reference_names, columns=query_names)
+dataframe_matches = DataFrame(data=[entry["matches"] for entry in scores.scores], index=reference_names, columns=query_names)
+return dataframe_scores, dataframe_matches
+def load_data(scores_filename: str) -> DataFrame:
 """Load data from filenames and join on compound id.
 Args:
-scores_filename (str): Path to scores table.
+scores_filename (str): Path to json file with serialized scores.
-matches_filename (str): Path to matches table.
 Returns:
-DataFrame: Joined dataframe on compounds containing scores an matches in long format.
+DataFrame: Joined dataframe on compounds containing scores and matches in long format.
 """
-matches = read_csv(matches_filename, sep="\t", index_col=0, header=0).apply(to_numeric)
+scores = scores_from_json(scores_filename)
-scores = read_csv(scores_filename, sep="\t", index_col=0, header=0).apply(to_numeric)
+scores, matches = scores_to_dataframes(scores)
 scores_long = create_long_table(scores, 'score')
 matches_long = create_long_table(matches, 'matches')
 combined = join_df(matches_long, scores_long, on=['compound'], how='inner')
 return combined
 @click.group()
 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True)
-@click.option('--mf', 'matches_filename', type=click.Path(exists=True), required=True)
 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True)
 @click.pass_context
-def cli(ctx, scores_filename, matches_filename, output_filename):
+def cli(ctx, scores_filename, output_filename):
 ctx.ensure_object(dict)
-ctx.obj['data'] = load_data(scores_filename, matches_filename)
+ctx.obj['data'] = load_data(scores_filename)
 pass
 @cli.command()
 @click.option('--st', 'scores_threshold', type=float, required=True)
 def get_top_k_data(ctx, k):
 result = get_top_k_matches(ctx.obj['data'], k)
 return result
-@cli.resultcallback()
+@cli.result_callback()
-def write_output(result: DataFrame, scores_filename, matches_filename, output_filename):
+def write_output(result: DataFrame, scores_filename, output_filename):
 result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'})
 result.to_csv(output_filename, sep="\t", index=False)
 if __name__ == '__main__':

Mercurial > repos > recetox > matchms_formatter

comparison formatter.py @ 10:5c0e5344edf3 draft