Mercurial > repos > recetox > matchms_formatter
changeset 0:0a08bed94964 draft
"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 85f60c94ccb3cb7706694cbb7ff6d59dcb41c0c9"
author | recetox |
---|---|
date | Sat, 30 Oct 2021 13:48:25 +0000 |
parents | |
children | 364976b9aba6 |
files | formatter.py matchms_formatter.xml |
diffstat | 2 files changed, 228 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
# formatter.py
"""Command line tool to reformat matchms score and match matrices into long tables."""
from typing import List, Optional

import click
from pandas import DataFrame, read_csv


def create_long_table(data: DataFrame, value_id: str) -> DataFrame:
    """Convert the table from compact into long format.

    See DataFrame.melt(...).

    Args:
        data (DataFrame): The data table to convert.
        value_id (str): The name to assign to the value column added by the conversion to long format.

    Returns:
        DataFrame: Table in long format. The index holds the column labels of ``data``,
            the 'compound' column holds the row labels of ``data``.
    """
    # Transposing first makes the original column labels the index that melt() keeps.
    return data.transpose().melt(ignore_index=False, var_name='compound', value_name=value_id)


def join_df(x: DataFrame, y: DataFrame, on: Optional[List[str]] = None, how: str = "inner") -> DataFrame:
    """Shortcut function to join two dataframes on columns and index.

    Args:
        x (DataFrame): Table X
        y (DataFrame): Table Y
        on (list, optional): Columns on which to join (in addition to the index). Defaults to None,
            which joins on the index only.
        how (str, optional): Join method, see DataFrame.join(...). Defaults to "inner".

    Returns:
        DataFrame: Joined dataframe.
    """
    # A None default avoids sharing one mutable list object between calls.
    on = [] if on is None else list(on)
    df_x = x.set_index([x.index] + on)
    df_y = y.set_index([y.index] + on)
    return df_x.join(df_y, how=how)


def get_top_k_matches(data: DataFrame, k: int) -> DataFrame:
    """Function to get top k matches from dataframe with scores.

    Args:
        data (DataFrame): A table with score column.
        k (int): Number of top scores to retrieve.

    Returns:
        DataFrame: Table containing only the top k best matches for each group of the first index level.
    """
    # group_keys=False keeps the original index instead of prepending the group label.
    return data.groupby(level=0, group_keys=False).apply(DataFrame.nlargest, n=k, columns=['score'])


def filter_thresholds(data: DataFrame, t_score: float, t_matches: float) -> DataFrame:
    """Filter a dataframe with scores and matches to only contain values above specified thresholds.

    Both comparisons are strict: rows equal to a threshold are dropped.

    Args:
        data (DataFrame): Table to filter.
        t_score (float): Score threshold.
        t_matches (float): Matches threshold.

    Returns:
        DataFrame: Filtered dataframe.
    """
    above_thresholds = (data['score'] > t_score) & (data['matches'] > t_matches)
    return data[above_thresholds]


def load_data(scores_filename: str, matches_filename: str) -> DataFrame:
    """Load data from filenames and join on compound id.

    Args:
        scores_filename (str): Path to scores table.
        matches_filename (str): Path to matches table.

    Returns:
        DataFrame: Joined dataframe on compounds containing scores and matches in long format.
    """
    matches = read_csv(matches_filename, sep='\t', index_col=0)
    scores = read_csv(scores_filename, sep='\t', index_col=0)

    scores_long = create_long_table(scores, 'score')
    matches_long = create_long_table(matches, 'matches')

    return join_df(matches_long, scores_long, on=['compound'], how='inner')


@click.group()
@click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True)
@click.option('--mf', 'matches_filename', type=click.Path(exists=True), required=True)
@click.option('--o', 'output_filename', type=click.Path(writable=True), required=True)
@click.pass_context
def cli(ctx, scores_filename, matches_filename, output_filename):
    """Reformat matchms scores and matches tables into a long table."""
    # The joined table is shared with the subcommands through the click context;
    # output_filename is consumed by the result callback, not here.
    ctx.ensure_object(dict)
    ctx.obj['data'] = load_data(scores_filename, matches_filename)


@cli.command()
@click.option('--st', 'scores_threshold', type=float, required=True)
@click.option('--mt', 'matches_threshold', type=float, required=True)
@click.pass_context
def get_thresholded_data(ctx, scores_threshold, matches_threshold):
    """Keep only rows with score and matches above the given thresholds."""
    return filter_thresholds(ctx.obj['data'], scores_threshold, matches_threshold)


@cli.command()
@click.option('--k', 'k', type=int, required=True)
@click.pass_context
def get_top_k_data(ctx, k):
    """Keep only the k highest scoring rows per query."""
    return get_top_k_matches(ctx.obj['data'], k)


# 'resultcallback' was renamed to 'result_callback' in Click 8.0 and the old
# name is removed in Click 8.1.
@cli.result_callback()
def write_output(result: DataFrame, scores_filename, matches_filename, output_filename):
    """Write the table returned by the invoked subcommand as a tab-separated file.

    Click calls this with the subcommand's return value and the group's parameters.

    Args:
        result (DataFrame): Table returned by the subcommand.
        scores_filename: Group option, unused here.
        matches_filename: Group option, unused here.
        output_filename: Path of the file to write.
    """
    # The unnamed first index level becomes 'level_0' after reset_index().
    result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'})
    result.to_csv(output_filename, sep="\t", index=False)


if __name__ == '__main__':
    cli(obj={})
<!-- matchms_formatter.xml: Galaxy tool wrapper around formatter.py -->
<tool id="matchms_formatter" name="matchms output formatter" version="0.1.0+galaxy0" python_template_version="3.5">
    <description>reformat output tables of matchms</description>

    <creator>
        <organization url="https://www.recetox.muni.cz/" name="RECETOX MUNI" />
    </creator>

    <requirements>
        <requirement type="package" version="1.1.4">pandas</requirement>
        <requirement type="package" version="8.0.1">click</requirement>
    </requirements>

    <!-- The command line is rendered into a config file and executed with sh. -->
    <command detect_errors="aggressive"><![CDATA[
        sh ${matchms_formatter_cli}
    ]]></command>

    <configfiles>
        <configfile name="matchms_formatter_cli">
            python3 ${__tool_directory__}/formatter.py \
                --sf "$scores" \
                --mf "$matches" \
                --o "$output" \
                $method.selection \
                #if $method.selection == "get-thresholded-data"
                    --st $method.scores_threshold \
                    --mt $method.matches_threshold
                #else
                    --k $method.k
                #end if
        </configfile>
    </configfiles>

    <inputs>
        <param label="Scores Table" name="scores" type="data" format="tsv" help="Scores output table." />
        <param label="Matches Table" name="matches" type="data" format="tsv" help="Matches output table." />

        <!-- The option values are the subcommand names of formatter.py. -->
        <conditional name="method">
            <param name="selection" type="select" label="Formatting method">
                <option value="get-thresholded-data" selected="true">Thresholding</option>
                <option value="get-top-k-data">Top K Matches</option>
            </param>
            <when value="get-thresholded-data">
                <param label="Scores Threshold" name="scores_threshold" type="float" value="0.6" min="0.0" max="1.0"
                       help="Threshold for matching score." />
                <param label="Matches Threshold" name="matches_threshold" type="integer" value="3" min="0"
                       help="Threshold for number of matched ions." />
            </when>
            <when value="get-top-k-data">
                <param label="Top K" name="k" type="integer" value="5" help="K for top k match selection." />
            </when>
        </conditional>

    </inputs>
    <outputs>
        <data label="${tool.name} (${method.selection}) on ${on_string}" name="output" format="tsv" />
    </outputs>

    <tests>
        <test>
            <param name="scores" value="scores_test2_out.tsv" ftype="tsv"/>
            <param name="matches" value="matches_test2_out.tsv" ftype="tsv"/>
            <param name="selection" value="get-thresholded-data"/>
            <param name="scores_threshold" value="0.4"/>
            <param name="matches_threshold" value="2"/>
            <output name="output" file="test2_threshold_formatting.tsv" ftype="tsv" checksum="md5$8929cdac47252da638f066261ffc16b7"/>
        </test>
        <test>
            <param name="scores" value="scores_test2_out.tsv" ftype="tsv"/>
            <param name="matches" value="matches_test2_out.tsv" ftype="tsv"/>
            <param name="selection" value="get-top-k-data"/>
            <param name="k" value="3"/>
            <output name="output" file="test2_topk_formatting.tsv" ftype="tsv" checksum="md5$a9186721aa2df2f9dbdef5021aa6bc26"/>
        </test>
    </tests>

    <help><![CDATA[
        Usage
        This tool creates user friendly tables from the data matrices produced by matchms.
        The tool can be operated in two modes based on (i) thresholds or (ii) top k matches.

        Input Table Format
        The tool expects two data matrices with the format as depicted below.
        The tool assumes the reference compound labels as row labels and the query labels as column labels (as naturally outputted by matchms).

        +----------+------+------+-----+
        |          | C001 | C002 | ... |
        +==========+======+======+=====+
        | Perylene | 0.1  | 0.0  | ... |
        +----------+------+------+-----+
        | Glycine  | 0.5  | 0.34 | ... |
        +----------+------+------+-----+
        | ...      | ...  | ...  | ... |
        +----------+------+------+-----+

        Output Table Format

        +----------+-----------+---------+--------+
        | query    | reference | matches | score  |
        +==========+===========+=========+========+
        | C001     | Glycine   | 6       | 0.5    |
        +----------+-----------+---------+--------+
        | C002     | Glycine   | 3       | 0.34   |
        +----------+-----------+---------+--------+
        | ...      | ...       | ...     | ...    |
        +----------+-----------+---------+--------+

    ]]></help>
</tool>