Mercurial > repos > recetox > matchms_formatter
comparison formatter.py @ 10:5c0e5344edf3 draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 5661cf2406e0616d7b2f4bee1b57ec43716088de
| author | recetox |
|---|---|
| date | Tue, 18 Oct 2022 10:59:57 +0000 |
| parents | 4ca9807c56e6 |
| children | 2f0545b02020 |
comparison
equal
deleted
inserted
replaced
| 9:4ca9807c56e6 | 10:5c0e5344edf3 |
|---|---|
| 1 import click | 1 import click |
| 2 from pandas import DataFrame, read_csv, to_numeric | 2 from matchms.importing import scores_from_json |
| | 3 from pandas import DataFrame |
| 3 | 4 |
| 4 | 5 |
| 5 def create_long_table(data: DataFrame, value_id: str) -> DataFrame: | 6 def create_long_table(data: DataFrame, value_id: str) -> DataFrame: |
| 6 """Convert the table from compact into long format. | 7 """Convert the table from compact into long format. |
| 7 See DataFrame.melt(...). | 8 See DataFrame.melt(...). |
| 61 filtered = data[data['score'] > t_score] | 62 filtered = data[data['score'] > t_score] |
| 62 filtered = filtered[filtered['matches'] > t_matches] | 63 filtered = filtered[filtered['matches'] > t_matches] |
| 63 return filtered | 64 return filtered |
| 64 | 65 |
| 65 | 66 |
| 66 def load_data(scores_filename: str, matches_filename: str) -> DataFrame: | 67 def scores_to_dataframes(scores): |
| | 68 """Unpack scores from matchms.scores into two dataframes of scores and matches. |
| | 69 |
| | 70 Args: |
| | 71 scores (matchms.scores): matchms.scores object. |
| | 72 |
| | 73 Returns: |
| | 74 DataFrame: Scores |
| | 75 DataFrame: Matches |
| | 76 """ |
| | 77 query_names = [spectra.metadata['compound_name'] for spectra in scores.queries] |
| | 78 reference_names = [spectra.metadata['compound_name'] for spectra in scores.references] |
| | 79 |
| | 80 dataframe_scores = DataFrame(data=[entry["score"] for entry in scores.scores], index=reference_names, columns=query_names) |
| | 81 dataframe_matches = DataFrame(data=[entry["matches"] for entry in scores.scores], index=reference_names, columns=query_names) |
| | 82 |
| | 83 return dataframe_scores, dataframe_matches |
| | 84 |
| | 85 |
| | 86 def load_data(scores_filename: str) -> DataFrame: |
| 67 """Load data from filenames and join on compound id. | 87 """Load data from filenames and join on compound id. |
| 68 | 88 |
| 69 Args: | 89 Args: |
| 70 scores_filename (str): Path to scores table. | 90 scores_filename (str): Path to json file with serialized scores. |
| 71 matches_filename (str): Path to matches table. | |
| 72 | 91 |
| 73 Returns: | 92 Returns: |
| 74 DataFrame: Joined dataframe on compounds containing scores an matches in long format. | 93 DataFrame: Joined dataframe on compounds containing scores and matches in long format. |
| 75 """ | 94 """ |
| 76 matches = read_csv(matches_filename, sep="\t", index_col=0, header=0).apply(to_numeric) | 95 scores = scores_from_json(scores_filename) |
| 77 scores = read_csv(scores_filename, sep="\t", index_col=0, header=0).apply(to_numeric) | 96 scores, matches = scores_to_dataframes(scores) |
| 78 | 97 |
| 79 scores_long = create_long_table(scores, 'score') | 98 scores_long = create_long_table(scores, 'score') |
| 80 matches_long = create_long_table(matches, 'matches') | 99 matches_long = create_long_table(matches, 'matches') |
| 81 | 100 |
| 82 combined = join_df(matches_long, scores_long, on=['compound'], how='inner') | 101 combined = join_df(matches_long, scores_long, on=['compound'], how='inner') |
| 83 return combined | 102 return combined |
| 84 | 103 |
| 85 | 104 |
| 86 @click.group() | 105 @click.group() |
| 87 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True) | 106 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True) |
| 88 @click.option('--mf', 'matches_filename', type=click.Path(exists=True), required=True) | |
| 89 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True) | 107 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True) |
| 90 @click.pass_context | 108 @click.pass_context |
| 91 def cli(ctx, scores_filename, matches_filename, output_filename): | 109 def cli(ctx, scores_filename, output_filename): |
| 92 ctx.ensure_object(dict) | 110 ctx.ensure_object(dict) |
| 93 ctx.obj['data'] = load_data(scores_filename, matches_filename) | 111 ctx.obj['data'] = load_data(scores_filename) |
| 94 pass | 112 pass |
| 95 | 113 |
| 96 | 114 |
| 97 @cli.command() | 115 @cli.command() |
| 98 @click.option('--st', 'scores_threshold', type=float, required=True) | 116 @click.option('--st', 'scores_threshold', type=float, required=True) |
| 109 def get_top_k_data(ctx, k): | 127 def get_top_k_data(ctx, k): |
| 110 result = get_top_k_matches(ctx.obj['data'], k) | 128 result = get_top_k_matches(ctx.obj['data'], k) |
| 111 return result | 129 return result |
| 112 | 130 |
| 113 | 131 |
| 114 @cli.resultcallback() | 132 @cli.result_callback() |
| 115 def write_output(result: DataFrame, scores_filename, matches_filename, output_filename): | 133 def write_output(result: DataFrame, scores_filename, output_filename): |
| 116 result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'}) | 134 result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'}) |
| 117 result.to_csv(output_filename, sep="\t", index=False) | 135 result.to_csv(output_filename, sep="\t", index=False) |
| 118 | 136 |
| 119 | 137 |
| 120 if __name__ == '__main__': | 138 if __name__ == '__main__': |
