Mercurial > repos > bgruening > flexynesis_utils
diff flexynesis_utils.xml @ 0:433a5f3f68a1 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit b2463fb68d0ae54864d87718ee72f5e063aa4587
author | bgruening |
---|---|
date | Tue, 24 Jun 2025 05:55:20 +0000 |
parents | |
children | e5ecfffcfe45 |
line wrap: on
line diff
<tool id="flexynesis_utils" name="Flexynesis utils" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Utility functions for Flexynesis</description>
    <!--
        Galaxy wrapper exposing six Flexynesis helper utilities behind a single
        "util" selector:
          * louvain_clustering, get_optimal_clusters, k_means_clustering and
            compute_ami_ari run through the Python script rendered from the
            "flexynesis_utils_config" configfile below;
          * split_data and binarize are delegated to flexynesis_utils.py.
        Every utility writes its results below output/ in the job directory.
    -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <required_files>
        <include path="flexynesis_plot.py" />
        <include path="flexynesis_utils.py" />
    </required_files>
    <command detect_errors="exit_code"><![CDATA[
        @CHECK_NON_COMMERCIAL_USE@
        mkdir -p inputs/ output/ &&
        ## The three clustering utilities take a feature matrix X.
        #if $utils_conditional.util != "compute_ami_ari" and $utils_conditional.util != "split_data" and $utils_conditional.util != "binarize":
            ln -s '$utils_conditional.X' 'inputs/${utils_conditional.X.element_identifier}.${utils_conditional.X.ext}' &&
        #end if
        ## Clustering and AMI/ARI utilities: link the labels file, echo the
        ## rendered script to stdout for traceability, then run it.
        #if $utils_conditional.util != "split_data" and $utils_conditional.util != "binarize":
            ln -s '$utils_conditional.labels' 'inputs/${utils_conditional.labels.element_identifier}.${utils_conditional.labels.ext}' &&
            cat '$flexynesis_utils_config' &&
            python '$flexynesis_utils_config'
        #end if
        #if $utils_conditional.util == "split_data":
            ln -s '$utils_conditional.clin' inputs/clin.csv &&
            ## Link every omics dataset and remember the link paths so they can
            ## be handed over as one comma-separated argument.
            #set $omics_names = []
            #for $omics_file in $utils_conditional.omics:
                ln -s '$omics_file' 'inputs/${omics_file.element_identifier}.${omics_file.ext}' &&
                #silent $omics_names.append('inputs/' + str($omics_file.element_identifier) + '.' + str($omics_file.ext))
            #end for

            python '$__tool_directory__/flexynesis_utils.py'
            --util split
            --clin inputs/clin.csv
            --omics '$(",".join($omics_names))'
            --split $utils_conditional.split
            --out output
        #end if
        #if $utils_conditional.util == "binarize":
            ln -s '$utils_conditional.mutation' 'inputs/${utils_conditional.mutation.element_identifier}.${utils_conditional.mutation.ext}' &&
            python '$__tool_directory__/flexynesis_utils.py'
            --util binarize
            --mutation 'inputs/${utils_conditional.mutation.element_identifier}.${utils_conditional.mutation.ext}'
            --gene_idx $utils_conditional.gene_idx
            --sample_idx $utils_conditional.sample_idx
            --out output
        #end if
    ]]></command>
    <configfiles>
        <!-- Python script rendered by Cheetah; only the branch of the selected utility is emitted. -->
        <configfile name="flexynesis_utils_config"><![CDATA[
import sys
## flexynesis_plot.py ships with the tool (see required_files), so the tool
## directory has to be importable.
sys.path.append('$__tool_directory__/')

import numpy as np
import pandas as pd
from flexynesis import (
    louvain_clustering,
    get_optimal_clusters,
    compute_ami_ari,
    k_means_clustering
)

from flexynesis_plot import (
    load_omics
)

#if $utils_conditional.util == "louvain_clustering":
label_data = load_omics('inputs/${utils_conditional.labels.element_identifier}.${utils_conditional.labels.ext}')
X = load_omics('inputs/${utils_conditional.X.element_identifier}.${utils_conditional.X.ext}')

## threshold and k are optional inputs; each is only passed on when it was set.
cluster_labels, G, partition = louvain_clustering(
    #if $utils_conditional.threshold != "":
    threshold=$utils_conditional.threshold,
    #end if
    #if $utils_conditional.k != "":
    k=$utils_conditional.k,
    #end if
    X=X)
## Attach the cluster assignment to the labels table, matching rows on the index.
cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['louvain_cluster'])
label_data = label_data.merge(cluster_df[['louvain_cluster']], left_index=True, right_index=True, how='left')

output_path = "output/clustered_labels.csv"
label_data.to_csv(output_path, index=True)

#else if $utils_conditional.util == "get_optimal_clusters":
label_data = load_omics('inputs/${utils_conditional.labels.element_identifier}.${utils_conditional.labels.ext}')
X = load_omics('inputs/${utils_conditional.X.element_identifier}.${utils_conditional.X.ext}')

kmeans_cluster_labels, optimal_k, silhouette_scores = get_optimal_clusters(
    data=X,
    min_k=$utils_conditional.min_k,
    max_k=$utils_conditional.max_k)

print(f"Optimal number of clusters: {optimal_k}\n")
print(f"Silhouette scores: \n{silhouette_scores}")

cluster_df = pd.DataFrame(data=kmeans_cluster_labels, index=X.index, columns=['optimal_kmeans_cluster'])
label_data = label_data.merge(cluster_df[['optimal_kmeans_cluster']], left_index=True, right_index=True, how='left')

output_path = "output/optimal_clusters_labels.csv"
label_data.to_csv(output_path, index=True)

#else if $utils_conditional.util == "k_means_clustering":
label_data = load_omics('inputs/${utils_conditional.labels.element_identifier}.${utils_conditional.labels.ext}')
X = load_omics('inputs/${utils_conditional.X.element_identifier}.${utils_conditional.X.ext}')

## k is a required input: it is interpolated unconditionally, so an empty
## value would render invalid Python.
cluster_labels, kmeans = k_means_clustering(
    data=X,
    k=$utils_conditional.k)

print(f"{kmeans}")
cluster_df = pd.DataFrame(data=cluster_labels, index=X.index, columns=['kmeans_cluster'])
label_data = label_data.merge(cluster_df[['kmeans_cluster']], left_index=True, right_index=True, how='left')

output_path = "output/kmeans_labels.csv"
label_data.to_csv(output_path, index=True)

#else if $utils_conditional.util == "compute_ami_ari":
label_data = load_omics('inputs/${utils_conditional.labels.element_identifier}.${utils_conditional.labels.ext}')

## Galaxy data_column values are 1-based. The offset of 2 presumably also
## accounts for load_omics using the first file column as the index (confirm
## against flexynesis_plot.load_omics). Selecting column 1 would give a
## negative index and silently pick the last column, so reject it.
true_idx = $utils_conditional.true_label - 2
predicted_idx = $utils_conditional.predicted_label - 2
if true_idx < 0 or predicted_idx < 0:
    sys.exit("Label columns must be chosen from column 2 onwards of the labels file.")

true_label = label_data.columns[true_idx]
predicted_label = label_data.columns[predicted_idx]

true_labels = label_data[true_label]
predicted_labels = label_data[predicted_label]

ami_ari = compute_ami_ari(labels1=true_labels, labels2=predicted_labels)

print(f"AMI: {ami_ari['ami']}")
print(f"ARI: {ami_ari['ari']}")

output_path = "output/ami_ari.txt"
with open(output_path, 'w') as f:
    f.write(f"AMI: {ami_ari['ami']}\n")
    f.write(f"ARI: {ami_ari['ari']}\n")

#end if
        ]]></configfile>
    </configfiles>
    <inputs>
        <expand macro="commercial_use_param"/>
        <conditional name="utils_conditional">
            <param name="util" type="select" label="Flexynesis utils">
                <option value="louvain_clustering">Louvain Clustering</option>
                <option value="get_optimal_clusters">Get Optimal Clusters</option>
                <option value="k_means_clustering">K-Means Clustering</option>
                <option value="compute_ami_ari">Compute AMI and ARI</option>
                <option value="split_data">Split data to train and test</option>
                <option value="binarize">Binarize mutation data</option>
            </param>
            <when value="louvain_clustering">
                <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/>
                <expand macro="plots_common_input"/>
                <param argument="--threshold" type="float" min="0" optional="true" label="Distance threshold to create an edge between two nodes"/>
                <param argument="--k" type="integer" min="0" optional="true" label="Number of nearest neighbors to connect for each node"/>
            </when>
            <when value="get_optimal_clusters">
                <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/>
                <expand macro="plots_common_input"/>
                <!-- The optimum is chosen by silhouette analysis (see help), which needs at least 2 clusters. -->
                <param argument="--min_k" type="integer" min="2" value="2" optional="false" label="Minimum number of clusters to try"/>
                <param argument="--max_k" type="integer" min="2" value="10" optional="false" label="Maximum number of clusters to try"/>
            </when>
            <when value="k_means_clustering">
                <param argument="--X" type="data" format="tabular,csv" label="Matrix" help="Input matrix, (samples, features)"/>
                <expand macro="plots_common_input"/>
                <!-- Required: the configfile always passes k to k_means_clustering. -->
                <param argument="--k" type="integer" min="1" value="2" optional="false" label="The number of clusters to form"/>
            </when>
            <when value="compute_ami_ari">
                <expand macro="plots_common_input"/>
                <param name="true_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the true labels" help="Choose column 2 or later"/>
                <param name="predicted_label" type="data_column" data_ref="labels" label="Column name in the labels file to use for the predicted labels" help="Choose column 2 or later"/>
            </when>
            <when value="split_data">
                <param argument="--clin" type="data" format="csv" optional="false" label="Clinical data" help="Samples in rows"/>
                <param argument="--omics" type="data" format="tabular,csv" optional="false" multiple="true" label="Omics data" help="Samples in columns"/>
                <param argument="--split" type="float" min="0" max="1" value="0.7" label="Training/Test split ratio" help="Proportion of data to use for training (e.g., 0.7 means 70% train, 30% test)"/>
            </when>
            <when value="binarize">
                <param argument="--mutation" type="data" format="tabular,csv" label="Mutation data" help="Mutation data in long format: every row holds both a gene and a sample"/>
                <param argument="--gene_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with genes"/>
                <param argument="--sample_idx" type="data_column" data_ref="mutation" label="Column in the mutation file with samples"/>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Single result file of every utility except split_data. -->
        <data name="util_out" auto_format="true" from_work_dir="output/*" label="${tool.name} on ${on_string}: ${utils_conditional.util}">
            <filter>utils_conditional['util'] != "split_data"</filter>
        </data>
        <!-- split_data only: one collection per partition, discovered from output/train and output/test. -->
        <collection name="train_out" type="list" label="${tool.name} on ${on_string}: train datasets">
            <discover_datasets pattern="__name_and_ext__" format="csv" directory="output/train"/>
            <filter>utils_conditional['util'] == "split_data"</filter>
        </collection>
        <collection name="test_out" type="list" label="${tool.name} on ${on_string}: test datasets">
            <discover_datasets pattern="__name_and_ext__" format="csv" directory="output/test"/>
            <filter>utils_conditional['util'] == "split_data"</filter>
        </collection>
    </outputs>
    <tests>
        <!-- test 1: Louvain clustering -->
        <test expect_num_outputs="1">
            <param name="non_commercial_use" value="True"/>
            <conditional name="utils_conditional">
                <param name="util" value="louvain_clustering"/>
                <param name="X" value="embeddings.csv"/>
                <param name="labels" value="labels_pr.csv"/>
                <param name="k" value="15"/>
            </conditional>
            <output name="util_out">
                <assert_contents>
                    <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,louvain_cluster"/>
                    <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,3.0"/>
                </assert_contents>
            </output>
        </test>
        <!-- test 2: Get optimal clusters -->
        <test expect_num_outputs="1">
            <param name="non_commercial_use" value="True"/>
            <conditional name="utils_conditional">
                <param name="util" value="get_optimal_clusters"/>
                <param name="X" value="embeddings.csv"/>
                <param name="labels" value="labels_pr.csv"/>
                <param name="min_k" value="2"/>
                <param name="max_k" value="10"/>
            </conditional>
            <assert_stdout>
                <has_text text="Optimal number of clusters: 2"/>
                <has_text text="Silhouette scores: "/>
            </assert_stdout>
            <output name="util_out">
                <assert_contents>
                    <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,optimal_kmeans_cluster"/>
                    <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,0.0"/>
                </assert_contents>
            </output>
        </test>
        <!-- test 3: K-Means clustering -->
        <test expect_num_outputs="1">
            <param name="non_commercial_use" value="True"/>
            <conditional name="utils_conditional">
                <param name="util" value="k_means_clustering"/>
                <param name="X" value="embeddings.csv"/>
                <param name="labels" value="labels_pr.csv"/>
                <param name="k" value="2"/>
            </conditional>
            <assert_stdout>
                <has_text text="KMeans(n_clusters=2, random_state=42)"/>
            </assert_stdout>
            <output name="util_out">
                <assert_contents>
                    <has_text text="sample_id,variable,class_label,probability,known_label,predicted_label,split,kmeans_cluster"/>
                    <has_text text="MB-4818,CLAUDIN_SUBTYPE,LumA,0.8582904,LumB,LumA,test,0.0"/>
                </assert_contents>
            </output>
        </test>
        <!-- test 4: Compute AMI and ARI -->
        <test expect_num_outputs="1">
            <param name="non_commercial_use" value="True"/>
            <conditional name="utils_conditional">
                <param name="util" value="compute_ami_ari"/>
                <param name="labels" value="labels.csv"/>
                <param name="true_label" value="5"/>
                <param name="predicted_label" value="6"/>
            </conditional>
            <assert_stdout>
                <has_text_matching expression="AMI: 0.5108[0-9]+"/>
                <has_text_matching expression="ARI: 0.5258[0-9]+"/>
            </assert_stdout>
            <output name="util_out">
                <assert_contents>
                    <has_text_matching expression="AMI: 0.5108[0-9]+"/>
                    <has_text_matching expression="ARI: 0.5258[0-9]+"/>
                </assert_contents>
            </output>
        </test>
        <!-- test 5: Split data to train and test -->
        <test expect_num_outputs="2">
            <param name="non_commercial_use" value="True"/>
            <conditional name="utils_conditional">
                <param name="util" value="split_data"/>
                <param name="clin" value="train/clin"/>
                <param name="omics" value="train/cnv,train/gex"/>
                <param name="split" value="0.7"/>
            </conditional>
            <output_collection name="train_out" type="list" count="3">
                <element name="clin">
                    <assert_contents>
                        <has_text_matching expression="sample"/>
                        <has_n_lines n="645"/>
                    </assert_contents>
                </element>
                <element name="cnv">
                    <assert_contents>
                        <has_text_matching expression="gene"/>
                        <has_n_lines n="25"/>
                    </assert_contents>
                </element>
                <element name="gex">
                    <assert_contents>
                        <has_text_matching expression="gene"/>
                        <has_n_lines n="25"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="test_out" type="list" count="3">
                <element name="clin">
                    <assert_contents>
                        <has_text_matching expression="sample"/>
                        <has_n_lines n="277"/>
                    </assert_contents>
                </element>
                <element name="cnv">
                    <assert_contents>
                        <has_text_matching expression="gene"/>
                        <has_n_lines n="25"/>
                    </assert_contents>
                </element>
                <element name="gex">
                    <assert_contents>
                        <has_text_matching expression="gene"/>
                        <has_n_lines n="25"/>
                    </assert_contents>
                </element>
            </output_collection>
        </test>
        <!-- test 6: Binarize mutation data -->
        <test expect_num_outputs="1">
            <param name="non_commercial_use" value="True"/>
            <conditional name="utils_conditional">
                <param name="util" value="binarize"/>
                <param name="mutation" value="mut.tabular"/>
                <param name="gene_idx" value="1"/>
                <param name="sample_idx" value="17"/>
            </conditional>
            <output name="util_out">
                <assert_contents>
                    <has_n_lines n="1611"/>
                    <has_text text="Hugo_Symbol"/>
                    <has_text text="AADACL2,0.0,0.0"/>
                    <has_text text="ABCB1,0.0,0.0,0.0,1.0"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
@COMMON_HELP@

Flexynesis Utils provides a collection of clustering, evaluation and data-preparation utilities for multi-omics data analysis. This tool offers six main functionalities:

1. **Louvain Clustering**: Community detection clustering algorithm that partitions data into clusters based on network modularity optimization
2. **Get Optimal Clusters**: Determines the optimal number of clusters using K-means clustering with silhouette score evaluation
3. **K-Means Clustering**: Standard K-means clustering algorithm for partitioning data into k clusters
4. **Compute AMI and ARI**: Calculates Adjusted Mutual Information (AMI) and Adjusted Rand Index (ARI) to evaluate clustering performance
5. **Split data to train and test**: Splits multi-omics data into training and testing sets based on a specified ratio
6. **Binarize mutation data**: Converts mutation data into a binary format, indicating presence or absence of mutations for each gene in each sample

**Louvain Clustering**
    Uses the Louvain algorithm for community detection in networks. This method builds a graph from the input data and finds communities (clusters) by optimizing modularity.

    Output: the original labels file with an added 'louvain_cluster' column.

**Get Optimal Clusters**
    Performs K-means clustering for a range of k values and determines the optimal number of clusters using silhouette analysis.

    Output: the original labels file with an added 'optimal_kmeans_cluster' column.
    The console output shows the optimal k value and the silhouette scores for each k.

**K-Means Clustering**
    Standard K-means clustering algorithm that partitions data into k clusters by minimizing within-cluster sum of squares.

    Output: the original labels file with an added 'kmeans_cluster' column.

**Compute AMI and ARI**
    Evaluates clustering performance by comparing true labels with predicted labels using two standard metrics:

    - *AMI (Adjusted Mutual Information)*: Measures mutual information between clusterings, adjusted for chance
    - *ARI (Adjusted Rand Index)*: Measures similarity between clusterings, adjusted for chance

    Output: a text file containing the AMI and ARI scores.
    For both metrics 1 indicates perfect agreement and values close to 0 indicate agreement no better than chance; because the scores are adjusted for chance they can also be slightly negative.
    The label columns must be chosen from column 2 onwards of the labels file.

**Searching cBioPortal**
    Searching cBioPortal is not part of this tool. Use `Flexynesis cBioPortal import` to fetch the available data files of a cBioPortal study by its study ID.

**Split data to train and test**
    Splits multi-omics data into training and testing sets based on a specified ratio. This is useful for preparing datasets for machine learning tasks.
    You can use `Flexynesis cBioPortal import` to fetch data from cBioPortal and then use this tool to split the data into training and testing sets.

**Binarize mutation data**
    Converts mutation data into a binary format, indicating presence or absence of mutations for each gene in each sample. This is useful for preparing mutation data for analysis.
    The output will be a comma-separated file with genes as rows and samples as columns, where each cell indicates whether a mutation is present (1) or absent (0).


.. _Documentation: https://bimsbstatic.mdc-berlin.de/akalin/buyar/flexynesis/site/
.. _copyright holders: https://github.com/BIMSBbioinfo/flexynesis
    ]]></help>
    <expand macro="creator"/>
    <expand macro="citations"/>
</tool>