Mercurial > repos > bgruening > flexynesis_plot
diff macros.xml @ 0:bb91bf19eb40 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/flexynesis commit b2463fb68d0ae54864d87718ee72f5e063aa4587
author | bgruening |
---|---|
date | Tue, 24 Jun 2025 05:55:50 +0000 |
parents | |
children | 262c2132b392 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/macros.xml Tue Jun 24 05:55:50 2025 +0000 @@ -0,0 +1,319 @@ +<macros> + <token name="@TOOL_VERSION@">0.2.20</token> + <token name="@VERSION_SUFFIX@">0</token> + <token name="@PROFILE@">24.1</token> + <xml name="requirements"> + <requirements> + <requirement type="package" version="@TOOL_VERSION@">flexynesis</requirement> + <yield/> + </requirements> + </xml> + <xml name="edam"> + <edam_topics> + <edam_topic>topic_0622</edam_topic> + <edam_topic>topic_3474</edam_topic> + <edam_topic>topic_2640</edam_topic> + </edam_topics> + <edam_operations> + <edam_operation>operation_3197</edam_operation> + <edam_operation>operation_2403</edam_operation> + <edam_operation>operation_2426</edam_operation> + </edam_operations> + </xml> + <xml name="sanitizer_printable"> + <sanitizer invalid_char=""> + <valid initial="string.printable"> + <remove value="'"/> + <remove value='"'/> + <remove value=" "/> + <yield/> + </valid> + </sanitizer> + </xml> + <xml name="sanitizer_letters"> + <sanitizer invalid_char=" "> + <valid initial="string.letters"> + <add value="_"/> + </valid> + </sanitizer> + </xml> + <token name="@CHECK_NON_COMMERCIAL_USE@"><![CDATA[ + #if not $non_commercial_use + >&2 echo "this tool is only available for non commercial use"; + exit 1; + #end if + ]]></token> + <xml name="commercial_use_param"> + <param name="non_commercial_use" label="I certify that I am not using this tool for commercial purposes." type="boolean" truevalue="NON_COMMERCIAL_USE" falsevalue="COMMERCIAL_USE" checked="False"> + <validator type="expression" message="This tool is only available for non-commercial use.">value == True</validator> + </param> + </xml> + <xml name="main_inputs"> + <param name="train_clin" type="data" format="csv" label="Training clinical data"/> + <param name="test_clin" type="data" format="csv" label="Test clinical data"/> + <param name="train_omics_main" type="data" format="csv" label="Training omics data"/> + <param name="test_omics_main" type="data" format="csv" label="Test omics data"/> + <param name="assay_main" type="text" optional="true" label="What type of assay is your input?" help="This would be used as output name."> + <expand macro="sanitizer_letters"/> + </param> + </xml> + <xml name="extra_inputs"> + <param name="train_omics" type="data" optional="true" format="csv" label="Training omics data"/> + <param name="test_omics" type="data" optional="true" format="csv" label="Test omics data"/> + <param name="assay" type="text" optional="true" label="What type of assay is your input?" help="This would be used as output name." > + <expand macro="sanitizer_letters"/> + </param> + </xml> + <xml name="advanced"> + <section name="advanced" title="Advanced Options"> + <param argument="--fusion_type" type="select" label="Fusion method" help="How to fuse the omics layers?"> + <option value="intermediate">intermediate</option> + <option value="early">early</option> + </param> + <param argument="--finetuning_samples" type="integer" min="0" value="0" label="Number of samples from the test dataset to use for fine-tuning the model." help="Set to 0 to disable fine-tuning."/> + <param argument="--variance_threshold" type="float" min="0" max="100" value="1" label="Variance threshold (as percentile) to drop low variance features." help="Set to 0 for no variance filtering."/> + <param argument="--correlation_threshold" type="float" min="0" max="1" value="0.8" label="Correlation threshold to drop highly redundant features." help="Set to 1 for no redundancy filtering."/> + <param argument="--subsample" type="integer" min="0" value="0" label="Downsample training set to randomly drawn N samples for training."/> + <param argument="--features_min" type="integer" min="0" value="500" label="Minimum number of features to retain after feature selection."/> + <param argument="--features_top_percentile" type="float" min="0" max="100" value="20" label="Top percentile features (among the features remaining after variance filtering and data cleanup) to retain after feature selection."/> + <param argument="--log_transform" type="boolean" truevalue="--log_transform True" falsevalue="" checked="false" label="Whether to apply log-transformation to input data matrices"/> + <param argument="--early_stop_patience" type="integer" min="-1" value="10" label="How many epochs to wait when no improvements in validation loss are observed." help="Set to -1 to disable early stopping."/> + <param argument="--hpo_iter" type="integer" min="1" value="100" label="Number of iterations for hyperparameter optimization."/> + <param argument="--val_size" type="float" min="0.0" max="1" value="0.2" label="Proportion of training data to be used as validation split"/> + <param argument="--hpo_patience" type="integer" min="0" value="10" label="How many hyperparameter optimization iterations to wait for when no improvements are observed." help="Set to 0 to disable early stopping."/> + <param argument="--use_cv" type="boolean" truevalue="--use_cv" falsevalue="" checked="false" label="Cross validation" help="If set, a 5-fold cross-validation training will be done. Otherwise, a single training on 80 percent of the dataset is done. "/> + <param argument="--use_loss_weighting" type="boolean" truevalue="--use_loss_weighting True" falsevalue="" checked="true" label="Whether to apply loss-balancing using uncertainty weights method."/> + <param argument="--evaluate_baseline_performance" type="boolean" truevalue="--evaluate_baseline_performance" falsevalue="" checked="false" label="Enable modeling also with Random Forest + SVMs to see the performance of off-the-shelf tools on the same dataset."/> + <param argument="--feature_importance_method" type="select" label="which method(s) to use to compute feature importance scores."> + <option value="Both" selected="true">Both</option> + <option value="IntegratedGradients">IntegratedGradients</option> + <option value="GradientShap">GradientShap</option> + </param> + </section> + </xml> + <xml name="plots_common_param"> + <yield/> + <param name="format" type="select" label="Output format"> + <option value="jpg" selected="true">jpg</option> + <option value="png">png</option> + <option value="pdf">pdf</option> + <option value="svg">svg</option> + </param> + <param name="dpi" type="integer" min="0" max="1200" value="300" label="DPI"/> + </xml> + <xml name="plots_common_input"> + <yield/> + <param argument="--labels" type="data" format="tabular,csv" label="Predicted labels" help="Generated by flexynesis"/> + </xml> + <token name="@PLOT_COMMON_CONFIG@"><![CDATA[ +label_data = load_labels('inputs/$plot_conditional.labels.element_identifier.$plot_conditional.labels.ext') + ]]></token> + <token name="@PR_ROC_BOX_CONFIG@"><![CDATA[ +@PLOT_COMMON_CONFIG@ + +# Check if this is a regression problem (no class probabilities) +non_na_probs = label_data['probability'].notna().sum() + +print(f" Non-NaN probabilities: {non_na_probs}/{len(label_data)}") + +# If most probabilities are NaN, this is likely a regression problem +if non_na_probs < len(label_data) * 0.1: # Less than 10% valid probabilities + raise ValueError(" Detected regression problem - precision-recall curves not applicable") + +# Debug: Check data quality +total_rows = len(label_data) +missing_labels = label_data['known_label'].isna().sum() +missing_probs = label_data['probability'].isna().sum() +unique_samples = label_data['sample_id'].nunique() +unique_classes = label_data['class_label'].nunique() + +print(f" Data summary: {total_rows} total rows, {unique_samples} unique samples, {unique_classes} unique classes") +print(f" Missing data: {missing_labels} missing known_label, {missing_probs} missing probability") + +if missing_labels > 0: + print(f" Warning: Found {missing_labels} missing known_label values") + missing_samples = label_data[label_data['known_label'].isna()]['sample_id'].unique()[:5] + print(f" Sample IDs with missing known_label: {list(missing_samples)}") + + # Remove rows with missing known_label + label_data = label_data.dropna(subset=['known_label']) + if label_data.empty: + raise ValueError("Error: No valid known_label data remaining") + + ]]></token> + <token name="@PR_ROC_CONFIG@"><![CDATA[ +@PR_ROC_BOX_CONFIG@ + +# 1. Pivot to wide format +prob_df = label_data.pivot(index='sample_id', columns='class_label', values='probability') + +print(f" After pivot: {prob_df.shape[0]} samples x {prob_df.shape[1]} classes") +print(f" Class columns: {list(prob_df.columns)}") + +# Check for NaN values in probability data +nan_counts = prob_df.isna().sum() +if nan_counts.any(): + print(f" NaN counts per class: {dict(nan_counts)}") + print(f" Samples with any NaN: {prob_df.isna().any(axis=1).sum()}/{len(prob_df)}") + + # Drop only rows where ALL probabilities are NaN + all_nan_rows = prob_df.isna().all(axis=1) + if all_nan_rows.any(): + print(f" Dropping {all_nan_rows.sum()} samples with all NaN probabilities") + prob_df = prob_df[~all_nan_rows] + + remaining_nans = prob_df.isna().sum().sum() + if remaining_nans > 0: + print(f" Warning: {remaining_nans} individual NaN values remain - filling with 0") + prob_df = prob_df.fillna(0) + + if prob_df.empty: + raise ValueError(f" Error: No valid probability data remaining for") + +# 2. Get true labels +true_labels_df = label_data.drop_duplicates('sample_id')[['sample_id', 'known_label']].set_index('sample_id') + +# 3. Align indices - only keep samples that exist in both datasets +common_indices = prob_df.index.intersection(true_labels_df.index) +if len(common_indices) == 0: + raise ValueError(f" Error: No common sample_ids between probability and true label data") + +print(f" Found {len(common_indices)} samples with both probability and true label data") + +# Filter both datasets to common indices +prob_df_aligned = prob_df.loc[common_indices] +y_true = true_labels_df.loc[common_indices]['known_label'] + +# 4. Final check for NaN values +if y_true.isna().any(): + raise ValueError(f" Error: True labels still contain NaN after alignment") + +if prob_df_aligned.isna().any().any(): + raise ValueError(f" Error: Probability data still contains NaN after alignment") + +# 5. Convert categorical labels to integer labels +# Create a mapping from class names to integers +class_names = list(prob_df_aligned.columns) +class_to_int = {class_name: i for i, class_name in enumerate(class_names)} + +print(f" Class mapping: {class_to_int}") + +# Convert true labels to integers +y_true_np = y_true.map(class_to_int).to_numpy() +y_probs_np = prob_df_aligned.to_numpy() + +print(f" Data shape: y_true={y_true_np.shape}, y_probs={y_probs_np.shape}") +print(f" Unique true labels (integers): {set(y_true_np)}") +print(f" Class labels (columns): {class_names}") +print(f" Label distribution: {dict(zip(*np.unique(y_true_np, return_counts=True)))}") + +# Check for any unmapped labels (will be NaN) +if pd.isna(y_true_np).any(): + raise ValueError(" Error: Some true labels could not be mapped to class columns") + + ]]></token> + <xml name="common_test"> + <param name="non_commercial_use" value="True"/> + <conditional name="training_type"> + <param name="model" value="s_train"/> + <param name="train_clin" value="train/clin" ftype="csv"/> + <param name="test_clin" value="test/clin" ftype="csv"/> + <param name="train_omics_main" value="train/gex" ftype="csv"/> + <param name="test_omics_main" value="test/gex" ftype="csv"/> + <param name="assay_main" value="bar"/> + <repeat name="omics"> + <param name="train_omics" value="train/cnv" ftype="csv"/> + <param name="test_omics" value="test/cnv" ftype="csv"/> + <param name="assay" value="foo"/> + </repeat> + <conditional name="model_class"> + <param name="model_class" value="DirectPred"/> + </conditional> + <param name="target_variables" value="Erlotinib"/> + <param name="surv_event_var" value="OS_STATUS"/> + <param name="surv_time_var" value="OS_MONTHS"/> + <section name="advanced"> + <param name="hpo_iter" value="1"/> + </section> + </conditional> + <yield/> + <output_collection name="results" type="list"> + <element name="job.embeddings_test"> + <assert_contents> + <has_n_lines n="50"/> + </assert_contents> + </element> + <element name="job.embeddings_train"> + <assert_contents> + <has_n_lines n="50"/> + </assert_contents> + </element> + <element name="job.feature_importance.GradientShap"> + <assert_contents> + <has_text_matching expression="Erlotinib,0,,bar,A2M,"/> + <has_text_matching expression="Erlotinib,0,,bar,ABCC4,"/> + <has_text_matching expression="GradientShap"/> + </assert_contents> + </element> + <element name="job.feature_importance.IntegratedGradients"> + <assert_contents> + <has_text_matching expression="Erlotinib,0,,bar,A2M,"/> + <has_text_matching expression="Erlotinib,0,,bar,ABCC4,"/> + <has_text_matching expression="IntegratedGradients"/> + </assert_contents> + </element> + <element name="job.feature_logs.bar"> + <assert_contents> + <has_n_lines n="25"/> + </assert_contents> + </element> + <element name="job.feature_logs.omics_foo"> + <assert_contents> + <has_n_lines n="25"/> + </assert_contents> + </element> + <element name="job.predicted_labels"> + <assert_contents> + <has_text_matching expression="source_dataset:A-704,Erlotinib,"/> + <has_text_matching expression="target_dataset:KMRC-20,Erlotinib,"/> + </assert_contents> + </element> + <element name="job.stats"> + <assert_contents> + <has_text_matching expression="DirectPred,Erlotinib,numerical,mse,"/> + <has_text_matching expression="DirectPred,Erlotinib,numerical,r2,"/> + <has_text_matching expression="DirectPred,Erlotinib,numerical,pearson_corr,"/> + </assert_contents> + </element> + </output_collection> + </xml> + <token name="@COMMON_HELP@"> + +.. class:: warningmark + +**WARNING: This tool is only available for NON-COMMERCIAL use. Permission is only granted for academic, research, and educational purposes. Before using, be sure to review, agree, and comply with the license.** + +Flexynesis is a deep-learning based multi-omics bulk sequencing data integration suite with a focus on (pre-)clinical endpoint prediction. +The package includes multiple types of deep learning architectures such as simple fully connected networks, supervised variational autoencoders, graph convolutional networks, multi-triplet networks different options of data layer fusion, and automates feature selection and hyperparameter optimization. + +For more information, please check the Documentation_ : + +For commercial use, please review the flexynesis license on GitHub and contact the `copyright holders`_ . + +----- + + </token> + <xml name="creator"> + <creator> + <organization name="European Galaxy Team" url="https://galaxyproject.org/eu/"/> + <person givenName="Amirhossein" familyName="Naghsh Nilchi" email="nilchia@informatik.uni-freiburg.de"/> + <yield/> + <person givenName="Björn" familyName="Grüning" email="gruening@informatik.uni-freiburg.de"/> + </creator> + </xml> + <xml name="citations"> + <citations> + <citation type="doi">10.1101/2024.07.16.603606</citation> + </citations> + </xml> +</macros>