Mercurial > repos > recetox > matchms_formatter

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/formatter.py	Sat Oct 30 13:48:25 2021 +0000
@@ -0,0 +1,121 @@
+import click
+from pandas import DataFrame, read_csv
+
+
+def create_long_table(data: DataFrame, value_id: str) -> DataFrame:
+    """Reshape a compact (wide) matrix into long format, see DataFrame.melt(...).
+    The table is transposed first, so the input's column labels become the index.
+
+    Args:
+        data (DataFrame): Wide table to reshape.
+        value_id (str): Name of the column that receives the cell values.
+
+    Returns:
+        DataFrame: Long table with a 'compound' column (the input's row labels) and a value column.
+    """
+    return data.T.melt(var_name='compound', value_name=value_id, ignore_index=False)
+
+
+def join_df(x: DataFrame, y: DataFrame, on=None, how="inner") -> DataFrame:
+    """Shortcut function to join two dataframes on their index plus shared columns.
+
+    Args:
+        x (DataFrame): Table X
+        y (DataFrame): Table Y
+        on (list, optional): Columns to join on in addition to the index. Defaults to None (index only).
+        how (str, optional): Join method, see DataFrame.join(...). Defaults to "inner".
+
+    Returns:
+        DataFrame: Joined dataframe.
+    """
+    keys = [] if on is None else list(on)  # None avoids sharing one mutable [] default across calls
+    df_x = x.set_index([x.index] + keys)
+    df_y = y.set_index([y.index] + keys)
+    return df_x.join(df_y, how=how)
+
+
+def get_top_k_matches(data: DataFrame, k: int) -> DataFrame:
+    """Keep the k highest-scoring rows for every value of the first index level.
+
+    Args:
+        data (DataFrame): Table with a 'score' column.
+        k (int): Number of rows to keep per group.
+
+    Returns:
+        DataFrame: The k rows with the largest 'score' from each group.
+    """
+    return data.groupby(level=0, group_keys=False).apply(lambda group: group.nlargest(n=k, columns=['score']))
+
+
+def filter_thresholds(data: DataFrame, t_score: float, t_matches: float) -> DataFrame:
+    """Keep only the rows whose 'score' and 'matches' both strictly exceed their thresholds.
+
+    Args:
+        data (DataFrame): Table with 'score' and 'matches' columns.
+        t_score (float): Exclusive lower bound for 'score'.
+        t_matches (float): Exclusive lower bound for 'matches'.
+
+    Returns:
+        DataFrame: Rows passing both thresholds.
+    """
+    score_ok = data['score'] > t_score
+    matches_ok = data['matches'] > t_matches
+    return data[score_ok & matches_ok]
+
+
+def load_data(scores_filename: str, matches_filename: str) -> DataFrame:
+    """Read the scores and matches matrices and combine them into one long table.
+
+    Args:
+        scores_filename (str): Path to scores table.
+        matches_filename (str): Path to matches table.
+
+    Returns:
+        DataFrame: Long-format table holding 'matches' and 'score', joined on index and compound.
+    """
+    # Both inputs are tab-separated matrices; the first column supplies the row labels.
+    matches_wide = read_csv(matches_filename, sep='\t', index_col=0)
+    scores_wide = read_csv(scores_filename, sep='\t', index_col=0)
+    scores_long = create_long_table(scores_wide, 'score')
+    matches_long = create_long_table(matches_wide, 'matches')
+
+    # Inner join: only (index, compound) pairs present in both tables are kept.
+    return join_df(matches_long, scores_long, on=['compound'], how='inner')
+
+
+@click.group()
+@click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True)
+@click.option('--mf', 'matches_filename', type=click.Path(exists=True), required=True)
+@click.option('--o', 'output_filename', type=click.Path(writable=True), required=True)
+@click.pass_context
+def cli(ctx, scores_filename, matches_filename, output_filename):
+    # Load the joined table once and park it on the context for the subcommands to read.
+    shared = ctx.ensure_object(dict)
+    shared['data'] = load_data(scores_filename, matches_filename)
+
+
+@cli.command()
+@click.option('--st', 'scores_threshold', type=float, required=True)
+@click.option('--mt', 'matches_threshold', type=float, required=True)
+@click.pass_context
+def get_thresholded_data(ctx, scores_threshold, matches_threshold):
+    # The returned table is handed to the group's result callback, which writes it out.
+    return filter_thresholds(ctx.obj['data'], scores_threshold, matches_threshold)
+
+
+@cli.command()
+@click.option('--k', 'k', type=int, required=True)
+@click.pass_context
+def get_top_k_data(ctx, k):
+    # The returned table is handed to the group's result callback, which writes it out.
+    return get_top_k_matches(ctx.obj['data'], k)
+
+
+@cli.result_callback()  # click 8.0 renamed resultcallback(); the old name warns and is removed in 8.1
+def write_output(result: DataFrame, scores_filename, matches_filename, output_filename):
+    """Write the subcommand's table as TSV; click also passes the group options, hence the unused filenames."""
+    result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'}).to_csv(output_filename, sep="\t", index=False)
+
+
+if __name__ == '__main__':
+    cli(obj={})  # start with an empty dict as ctx.obj; the group callback stores the loaded data in it
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/matchms_formatter.xml	Sat Oct 30 13:48:25 2021 +0000
@@ -0,0 +1,107 @@
+<tool id="matchms_formatter" name="matchms output formatter" version="0.1.0+galaxy0" python_template_version="3.5">
+    <description>reformat output tables of matchms</description>
+
+    <creator>
+        <organization url="https://www.recetox.muni.cz/" name="RECETOX MUNI" />
+    </creator>
+
+    <requirements>
+        <requirement type="package" version="1.1.4">pandas</requirement>
+        <requirement type="package" version="8.0.1">click</requirement>
+    </requirements>
+
+    <command detect_errors="aggressive"><![CDATA[
+        sh ${matchms_formatter_cli}
+    ]]></command>
+
+    <configfiles>
+        <configfile name="matchms_formatter_cli">
+            python3 ${__tool_directory__}/formatter.py \
+            --sf "$scores" \
+            --mf "$matches" \
+            --o "$output" \
+            $method.selection \
+            #if $method.selection == "get-thresholded-data"
+                --st $method.scores_threshold \
+                --mt $method.matches_threshold
+            #else
+                --k $method.k
+            #end if
+        </configfile>
+    </configfiles>
+
+    <inputs>
+        <param label="Scores Table" name="scores" type="data" format="tsv" help="Scores output table." />
+        <param label="Matches Table" name="matches" type="data" format="tsv" help="Matches output table." />
+
+        <conditional name="method">
+            <param name="selection" type="select" label="Formatting method">
+                <option value="get-thresholded-data" selected="true">Thresholding</option>
+                <option value="get-top-k-data">Top K Matches</option>
+            </param>
+            <when value="get-thresholded-data">
+                <param label="Scores Threshold" name="scores_threshold" type="float" value="0.6" min="0.0" max="1.0"
+                help="Threshold for matching score." />
+                <param label="Matches Threshold" name="matches_threshold" type="integer" value="3" min="0"
+                help="Threshold for number of matched ions." />
+            </when>
+            <when value="get-top-k-data">
+                <param label="Top K" name="k" type="integer" value="5" help="K for top k match selection." />
+            </when>
+        </conditional>
+
+    </inputs>
+    <outputs>
+        <data label="${tool.name} (${method.selection}) on ${on_string}" name="output" format="tsv" />
+    </outputs>
+
+    <tests>
+        <test>
+            <param name="scores" value="scores_test2_out.tsv" ftype="tsv"/>
+            <param name="matches" value="matches_test2_out.tsv" ftype="tsv"/>
+            <param name="selection" value="get-thresholded-data"/>
+            <param name="scores_threshold" value="0.4"/>
+            <param name="matches_threshold" value="2"/>
+            <output name="output" file="test2_threshold_formatting.tsv" ftype="tsv" checksum="md5$8929cdac47252da638f066261ffc16b7"/>
+        </test>
+        <test>
+            <param name="scores" value="scores_test2_out.tsv" ftype="tsv"/>
+            <param name="matches" value="matches_test2_out.tsv" ftype="tsv"/>
+            <param name="selection" value="get-top-k-data"/>
+            <param name="k" value="3"/>
+            <output name="output" file="test2_topk_formatting.tsv" ftype="tsv" checksum="md5$a9186721aa2df2f9dbdef5021aa6bc26"/>
+        </test>
+    </tests>
+
+    <help><![CDATA[
+    Usage
+        This tool creates user-friendly tables from the data matrices produced by matchms.
+        The tool can be operated in two modes, based on (i) thresholds or (ii) top k matches.
+
+    Input Table Format
+        The tool expects two data matrices with the format as depicted below.
+        The tool assumes the reference compound labels as row labels and the query labels as column labels (as naturally outputted by matchms).
+
+        +----------+------+------+-----+
+        |          | C001 | C002 | ... |
+        +==========+======+======+=====+
+        | Perylene | 0.1  | 0.0  | ... |
+        +----------+------+------+-----+
+        | Glycine  | 0.5  | 0.34 | ... |
+        +----------+------+------+-----+
+        |   ...    | ...  | ...  | ... |
+        +----------+------+------+-----+
+
+    Output Table Format
+        +----------+-----------+---------+--------+
+        | query    | reference | matches | score  |
+        +==========+===========+=========+========+
+        | C001     | Glycine   |      6  | 0.5    |
+        +----------+-----------+---------+--------+
+        | C002     | Glycine   |     3   | 0.34   |
+        +----------+-----------+---------+--------+
+        |   ...    | ...       | ...     | ...    |
+        +----------+-----------+---------+--------+
+
+    ]]></help>
+</tool>
\ No newline at end of file