diff semibin.xml @ 3:8673617e7e09 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/semibin commit 2c08a2e49a2844efe92340c5a9e9c8323e4a33d6
author iuc
date Tue, 28 Oct 2025 08:15:27 +0000
parents 9de6b5e570df
children
line wrap: on
line diff
--- a/semibin.xml	Tue Mar 25 15:52:22 2025 +0000
+++ b/semibin.xml	Tue Oct 28 08:15:27 2025 +0000
@@ -10,7 +10,15 @@
     <expand macro="version"/>
     <command detect_errors="exit_code"><![CDATA[
 #import re
-@BAM_FILES@
+#if $mode.select != "single":
+    #if $mode.align_select.align_select == "bam":
+        @BAM_FILES@
+    #else:
+        @STROBEALIGN_FILES@
+    #end if
+#else:
+    @BAM_FILES@
+#end if
 @FASTA_FILES@
 SemiBin2
 #if $mode.select == 'single' or $mode.select == 'co'
@@ -37,8 +45,16 @@
         #end for
     #end if
 #end if
-    --input-fasta 'contigs.fasta'
-    --input-bam *.bam
+    --input-fasta 'contigs.$input_fasta.ext'
+    #if $mode.select == "single":
+        --input-bam *.bam
+    #else:
+        #if $mode.align_select.align_select == "bam":
+            --input-bam *.bam
+        #else:
+            -a *.txt
+        #end if
+    #end if
     --output 'output'
     --cannot-name 'cannot'
     @MIN_LEN@
@@ -68,18 +84,34 @@
             <expand macro="mode_select"/>
             <when value="single">
                 <expand macro="input-fasta-single"/>
-                <expand macro="input-bam-single"/>
+                    <expand macro="input-bam-single"/>
                 <expand macro="ref-single"/>
                 <expand macro="environment"/>
             </when>
             <when value="co">
                 <expand macro="input-fasta-single"/>
-                <expand macro="input-bam-multi"/>
+                <conditional name="align_select">
+                    <expand macro="bam_or_strobealign"/>
+                    <when value="bam">
+                        <expand macro="input-bam-multi"/>
+                    </when>
+                    <when value="txt">
+                        <expand macro="input-txt"/>
+                    </when>
+                </conditional>
                 <expand macro="ref-single"/>
             </when>
             <when value="multi">
                 <expand macro="input-fasta-multi"/>
-                <expand macro="input-bam-multi"/>
+                <conditional name="align_select">
+                    <expand macro="bam_or_strobealign"/>
+                    <when value="bam">
+                        <expand macro="input-bam-multi"/>
+                    </when>
+                    <when value="txt">
+                        <expand macro="input-txt"/>
+                    </when>
+                </conditional>
                 <expand macro="ref-multi"/>
             </when>
         </conditional>
@@ -107,42 +139,42 @@
     </inputs>
     <outputs>
         <collection name="output_pre_recluster_bins" type="list" label="${tool.name} on ${on_string}: Reconstructed bins before reclustering">
-            <filter>mode["select"]!="multi" and extra_output and "pre_reclustering_bins" in extra_output</filter>
+            <filter>mode['select']!="multi" and extra_output and "pre_reclustering_bins" in extra_output</filter>
             <discover_datasets pattern="(?P&lt;designation&gt;.*).fa" format="fasta" directory="output/output_prerecluster_bins"/>
         </collection>
         <collection name="output_after_recluster_bins" type="list" label="${tool.name} on ${on_string}: Reconstructed bins after reclustering">
-            <filter>mode["select"]!="multi" and extra_output and "pre_reclustering_bins" in extra_output</filter>
+            <filter>mode['select']!="multi" and extra_output and "pre_reclustering_bins" in extra_output</filter>
             <discover_datasets pattern="(?P&lt;designation&gt;.*).fa" format="fasta" directory="output/output_recluster_bins"/>
         </collection>
         <collection name="output_bins" type="list" label="${tool.name} on ${on_string}: Reconstructed bins">
-            <filter>mode["select"]!="multi" and not "pre_reclustering_bins" in extra_output</filter>
+            <filter>mode['select']!="multi" and not "pre_reclustering_bins" in extra_output</filter>
             <discover_datasets pattern="(?P&lt;designation&gt;.*).fa" format="fasta" directory="output/output_bins"/>
         </collection>
         <collection name="multi_bins" type="list" label="${tool.name} on ${on_string}: Reconstructed bins before reclustering (multi_bins)">
-            <filter>mode["select"]=="multi"</filter>
+            <filter>mode['select']=="multi"</filter>
             <discover_datasets pattern="(?P&lt;designation&gt;.*).fa" format="fasta" directory="output/bins"/>
         </collection>
         <data name="single_data" format="csv" from_work_dir="output/data.csv" label="${tool.name} on ${on_string}: Training data">
-            <filter>(mode["select"]=="single" or mode["select"]=="co") and extra_output and "data" in extra_output</filter>
+            <filter>(mode['select']=="single" or mode['select']=="co") and extra_output and "data" in extra_output</filter>
         </data>
         <data name="single_data_split" format="csv" from_work_dir="output/data_split.csv" label="${tool.name} on ${on_string}: Split training data">
-            <filter>(mode["select"]=="single" or mode["select"]=="co") and extra_output and "data" in extra_output</filter>
+            <filter>(mode['select']=="single" or mode['select']=="co") and extra_output and "data" in extra_output</filter>
         </data>
         <collection name="multi_data" type="list" label="${tool.name} on ${on_string}: Training data per sample">
-            <filter>mode["select"]=="multi" and extra_output and "data" in extra_output</filter>
+            <filter>mode['select']=="multi" and extra_output and "data" in extra_output</filter>
             <discover_datasets pattern="(?P&lt;designation&gt;.*)\/data.csv" format="csv" directory="output/samples/" recurse="true" match_relative_path="true"/>
         </collection>
         <collection name="multi_data_split" type="list" label="${tool.name} on ${on_string}: Split training data per sample">
-            <filter>mode["select"]=="multi" and extra_output and "data" in extra_output</filter>
+            <filter>mode['select']=="multi" and extra_output and "data" in extra_output</filter>
             <discover_datasets pattern="(?P&lt;designation&gt;.*)\/data_split.csv" format="csv" directory="output/samples/" recurse="true" match_relative_path="true"/>
         </collection>
-        <expand macro="generate_sequence_features_extra_outputs"/>
+        <expand macro="generate_sequence_features_extra_outputs_main"/>
     </outputs>
     <tests>
         <test expect_num_outputs="5">
             <conditional name="mode">
                 <param name="select" value="single"/>
-                <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
+                <param name="input_fasta" ftype="fasta.gz" value="input_single.fasta.gz"/>
                 <param name="input_bam" ftype="bam" value="input_single.bam"/>
                 <conditional name="ref">
                     <param name="select" value="taxonomy"/>
@@ -166,7 +198,6 @@
                 <param name="minfasta_kbs" value="200"/>
             </section>
             <param name="extra_output" value="data,coverage,contigs"/>
-            <output_collection name="output_bins" count="0"/>
             <output name="single_data" ftype="csv">
                 <assert_contents>
                     <has_text text="g1k_0"/>
@@ -194,7 +225,7 @@
         <test expect_num_outputs="5">
             <conditional name="mode">
                 <param name="select" value="single"/>
-                <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
+                <param name="input_fasta" ftype="fasta.bz2" value="input_single.fasta.bz2"/>
                 <param name="input_bam" ftype="bam" value="input_single.bam"/>
                 <conditional name="ref">
                     <param name="select" value="ml"/>
@@ -217,7 +248,6 @@
                 <param name="minfasta_kbs" value="200"/>
             </section>
             <param name="extra_output" value="data,coverage,contigs"/>
-            <output_collection name="output_bins" count="0"/>
             <output name="single_data" ftype="csv">
                 <assert_contents>
                     <has_text text="g1k_0"/>
@@ -246,7 +276,10 @@
             <conditional name="mode">
                 <param name="select" value="co"/>
                 <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
-                <param name="input_bam" ftype="bam" value="input_coassembly_sorted1.bam,input_coassembly_sorted2.bam,input_coassembly_sorted3.bam,input_coassembly_sorted4.bam,input_coassembly_sorted5.bam"/>
+                <conditional name="align_select">
+                    <param name="align_select" value="bam"/>
+                    <param name="input_bam" ftype="bam" value="input_coassembly_sorted1.bam,input_coassembly_sorted2.bam,input_coassembly_sorted3.bam,input_coassembly_sorted4.bam,input_coassembly_sorted5.bam"/>
+                </conditional>
                 <conditional name="ref">
                     <param name="select" value="ml"/>
                 </conditional>
@@ -267,8 +300,7 @@
                 <param name="minfasta_kbs" value="200"/>
             </section>
             <param name="extra_output" value="coverage"/>
-            <output_collection name="output_bins" count="0"/>
-            <output_collection name="co_cov" count="5">
+            <output_collection name="co_cov_bam" count="5">
                 <element name="0" ftype="csv">
                     <assert_contents>
                         <has_text text="g1k_0"/>
@@ -288,7 +320,7 @@
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="co_split_cov" count="5">
+            <output_collection name="co_split_cov_bam" count="5">
                 <element name="0" ftype="csv">
                     <assert_contents>
                         <has_text text="g1k_0_1"/>
@@ -313,7 +345,10 @@
             <conditional name="mode">
                 <param name="select" value="co"/>
                 <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
-                <param name="input_bam" ftype="bam" value="input_coassembly_sorted1.bam,input_coassembly_sorted2.bam,input_coassembly_sorted3.bam,input_coassembly_sorted4.bam,input_coassembly_sorted5.bam"/>
+                <conditional name="align_select">
+                    <param name="align_select" value="bam"/>
+                    <param name="input_bam" ftype="bam" value="input_coassembly_sorted1.bam,input_coassembly_sorted2.bam,input_coassembly_sorted3.bam,input_coassembly_sorted4.bam,input_coassembly_sorted5.bam"/>
+                </conditional>
                 <conditional name="ref">
                     <param name="select" value="taxonomy"/>
                     <param name="taxonomy_annotation_table" value="taxonomy.tsv"/>
@@ -335,8 +370,7 @@
                 <param name="minfasta_kbs" value="200"/>
             </section>
             <param name="extra_output" value="coverage"/>
-            <output_collection name="output_bins" count="0"/>
-            <output_collection name="co_cov" count="5">
+            <output_collection name="co_cov_bam" count="5">
                 <element name="0" ftype="csv">
                     <assert_contents>
                         <has_text text="g1k_0"/>
@@ -356,7 +390,7 @@
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="co_split_cov" count="5">
+            <output_collection name="co_split_cov_bam" count="5">
                 <element name="0" ftype="csv">
                     <assert_contents>
                         <has_text text="g1k_0_1"/>
@@ -381,7 +415,10 @@
             <conditional name="mode">
                 <param name="select" value="co"/>
                 <param name="input_fasta" ftype="fasta" value="input_single.fasta"/>
-                <param name="input_bam" ftype="bam" value="input_coassembly_sorted1.bam,input_coassembly_sorted2.bam,input_coassembly_sorted3.bam,input_coassembly_sorted4.bam,input_coassembly_sorted5.bam"/>
+                <conditional name="align_select">
+                    <param name="align_select" value="bam"/>
+                    <param name="input_bam" ftype="bam" value="input_coassembly_sorted1.bam,input_coassembly_sorted2.bam,input_coassembly_sorted3.bam,input_coassembly_sorted4.bam,input_coassembly_sorted5.bam"/>
+                </conditional>
                 <conditional name="ref">
                     <param name="select" value="taxonomy"/>
                     <param name="taxonomy_annotation_table" value="taxonomy.tsv"/>
@@ -391,8 +428,11 @@
                 <param name="method" value="ratio"/>
                 <param name="ratio" value="0.05"/>
             </conditional>
-            <param name="orf_finder" value="fraggenescan"/>
+            <param name="orf_finder" value="fast-naive"/>
             <param name="random_seed" value="0"/>
+            <section name="annot">
+                <param name="ml_threshold" value="0"/>
+            </section>
             <section name="training">
                 <param name="epoches" value="20"/>
                 <param name="batch_size" value="2048"/>
@@ -403,8 +443,7 @@
                 <param name="minfasta_kbs" value="200"/>
             </section>
             <param name="extra_output" value="coverage"/>
-            <output_collection name="output_bins" count="0"/>
-            <output_collection name="co_cov" count="5">
+            <output_collection name="co_cov_bam" count="5">
                 <element name="0" ftype="csv">
                     <assert_contents>
                         <has_text text="g1k_0"/>
@@ -424,7 +463,7 @@
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="co_split_cov" count="5">
+            <output_collection name="co_split_cov_bam" count="5">
                 <element name="0" ftype="csv">
                     <assert_contents>
                         <has_text text="g1k_0_1"/>
@@ -459,8 +498,11 @@
                 <param name="method" value="ratio"/>
                 <param name="ratio" value="0.05"/>
             </conditional>
-            <param name="orf_finder" value="fraggenescan"/>
+            <param name="orf_finder" value="fast-naive"/>
             <param name="random_seed" value="0"/>
+            <section name="annot">
+                <param name="ml_threshold" value="0"/>
+            </section>
             <section name="training">
                 <param name="epoches" value="20"/>
                 <param name="batch_size" value="2048"/>
@@ -470,13 +512,6 @@
                 <param name="max_edges" value="200"/>
                 <param name="minfasta_kbs" value="200"/>
             </section>
-            <output_collection name="output_bins" count="1">
-                <element name="SemiBin_30" ftype="fasta">
-                    <assert_contents>
-                        <has_text text=">g3k_0"/>
-                    </assert_contents>
-                </element>
-            </output_collection>
         </test>
         <test expect_num_outputs="2">
             <conditional name="mode">
@@ -492,8 +527,11 @@
                 <param name="method" value="ratio"/>
                 <param name="ratio" value="0.05"/>
             </conditional>
-            <param name="orf_finder" value="fraggenescan"/>
+            <param name="orf_finder" value="fast-naive"/>
             <param name="random_seed" value="0"/>
+            <section name="annot">
+                <param name="ml_threshold" value="0"/>
+            </section>
             <section name="training">
                 <param name="epoches" value="20"/>
                 <param name="batch_size" value="2048"/>
@@ -504,7 +542,7 @@
                 <param name="minfasta_kbs" value="200"/>
             </section>
             <param name="extra_output" value="pre_reclustering_bins"/>
-            <output_collection name="output_pre_recluster_bins" count="3">
+            <output_collection name="output_pre_recluster_bins">
                 <element name="SemiBin_0" ftype="fasta">
                     <assert_contents>    
                         <has_text text="g1k_0"/>
@@ -536,7 +574,10 @@
                     <param name="select" value="concatenated"/>
                     <param name="input_fasta" ftype="fasta" value="input_multi.fasta.gz"/>
                 </conditional>
-                <param name="input_bam" ftype="bam" value="input_multi_sorted1.bam,input_multi_sorted2.bam,input_multi_sorted3.bam,input_multi_sorted4.bam,input_multi_sorted5.bam,input_multi_sorted6.bam,input_multi_sorted7.bam,input_multi_sorted8.bam,input_multi_sorted9.bam,input_multi_sorted10.bam"/>
+                <conditional name="align_select">
+                    <param name="align_select" value="bam"/>
+                    <param name="input_bam" ftype="bam" value="input_multi_sorted1.bam,input_multi_sorted2.bam,input_multi_sorted3.bam,input_multi_sorted4.bam,input_multi_sorted5.bam,input_multi_sorted6.bam,input_multi_sorted7.bam,input_multi_sorted8.bam,input_multi_sorted9.bam,input_multi_sorted10.bam"/>
+                </conditional>
                 <conditional name="ref">
                     <param name="select" value="taxonomy"/>
                     <param name="taxonomy_annotation_table" value="taxonomy.tsv,taxonomy_2.tsv,taxonomy_3.tsv,taxonomy_4.tsv,taxonomy_5.tsv,taxonomy_6.tsv,taxonomy_7.tsv,taxonomy_8.tsv,taxonomy_9.tsv,taxonomy_10.tsv"/>
@@ -546,7 +587,7 @@
                 <param name="method" value="ratio"/>
                 <param name="ratio" value="0.05"/>
             </conditional>
-            <param name="orf_finder" value="fraggenescan"/>
+            <param name="orf_finder" value="fast-naive"/>
             <param name="random_seed" value="0"/>
             <section name="training">
                 <param name="epoches" value="20"/>
@@ -573,29 +614,29 @@
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="multi_cov" count="10">
+            <output_collection name="multi_cov_bam" count="10">
                 <element name="8" ftype="csv">
                     <assert_contents>
                         <has_text text="S1:g1k_5,"/>
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="multi_cov_sample" count="10">
+            <output_collection name="multi_cov_sample_bam" count="10">
                 <element name="S8" ftype="csv">
                     <assert_contents>
                         <has_text text="g1k_3"/>
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="multi_split_cov" count="10">
+            <output_collection name="multi_split_cov_bam" count="10">
                 <element name="8" ftype="csv">
                     <assert_contents>
                         <has_text text="S1:g1k_5_1,0."/>
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="multi_split_cov_sample" count="10">
-                <element name="S8" ftype="csv">
+            <output_collection name="multi_split_cov_sample_bam" count="10">
+                <element name="8" ftype="csv">
                     <assert_contents>
                         <has_text text="g1k_3_1"/>
                     </assert_contents>
@@ -609,8 +650,116 @@
                 </element>
             </output_collection>
         </test>
+        <test expect_num_outputs="6">
+            <conditional name="mode">
+                <param name="select" value="co"/>
+                <param name="input_fasta" ftype="fasta" value="input_multi.fasta.gz"/>
+                <conditional name="align_select">
+                    <param name="align_select" value="txt"/>
+                    <param name="abundance" ftype="txt" value="strobealign_1.txt,strobealign_2.txt,strobealign_3.txt,strobealign_4.txt,strobealign_5.txt"/>
+                </conditional>
+                <conditional name="ref">
+                    <param name="select" value="taxonomy"/>
+                    <param name="taxonomy_annotation_table" value="taxonomy.tsv"/>
+                </conditional>
+            </conditional>
+            <conditional name="min_len">
+                <param name="method" value="ratio"/>
+                <param name="ratio" value="0.05"/>
+            </conditional>
+            <param name="orf_finder" value="fast-naive"/>
+            <param name="random_seed" value="0"/>
+            <section name="annot">
+                <param name="ml_threshold" value="0"/>
+            </section>
+            <section name="training">
+                <param name="epoches" value="20"/>
+                <param name="batch_size" value="2048"/>
+            </section>
+            <section name="bin">
+                <param name="max_node" value="0.15"/>
+                <param name="max_edges" value="20"/>
+                <param name="minfasta_kbs" value="20"/>
+            </section>
+            <param name="extra_output" value="data,coverage,contigs,pre_reclustering_bins"/>
+        </test> 
+        <test expect_num_outputs="6">
+            <conditional name="mode">
+                <param name="select" value="multi"/>
+                <conditional name="multi_fasta">
+                    <param name="select" value="concatenated"/>
+                    <param name="input_fasta" ftype="fasta" value="input_multi.fasta.gz"/>
+                </conditional>
+                <conditional name="align_select">
+                    <param name="align_select" value="txt"/>
+                    <param name="abundance" ftype="txt" value="strobealign_1.txt,strobealign_2.txt,strobealign_3.txt,strobealign_4.txt,strobealign_5.txt"/>
+                </conditional>
+                <conditional name="ref">
+                    <param name="select" value="taxonomy"/>
+                    <param name="taxonomy_annotation_table" value="taxonomy.tsv,taxonomy_2.tsv,taxonomy_3.tsv,taxonomy_4.tsv,taxonomy_5.tsv,taxonomy_6.tsv,taxonomy_7.tsv,taxonomy_8.tsv,taxonomy_9.tsv,taxonomy_10.tsv"/>
+                </conditional>
+            </conditional>
+            <conditional name="min_len">
+                <param name="method" value="ratio"/>
+                <param name="ratio" value="0.05"/>
+            </conditional>
+            <param name="orf_finder" value="fast-naive"/>
+            <param name="random_seed" value="0"/>
+            <section name="annot">
+                <param name="ml_threshold" value="0"/>
+            </section>
+            <section name="training">
+                <param name="epoches" value="20"/>
+                <param name="batch_size" value="2048"/>
+            </section>
+            <section name="bin">
+                <param name="max_node" value="0.15"/>
+                <param name="max_edges" value="30"/>
+                <param name="minfasta_kbs" value="30"/>
+            </section>
+            <param name="extra_output" value="data,coverage,contigs"/>
+            <output_collection name="multi_bins" count="10"/>
+            <output_collection name="multi_data" count="10">
+                <element name="S8" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_0,"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="multi_data_split" count="10">
+                <element name="S8" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_0_1,"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="multi_cov_txt" count="10">
+                <element name="S8" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_5,"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="multi_split_cov_txt" count="10">
+                <element name="S8" ftype="csv">
+                    <assert_contents>
+                        <has_text text="g1k_5_1,1."/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="multi_contigs" count="10">
+                <element name="S8" ftype="fasta">
+                    <assert_contents>
+                        <has_text text=">g1k_0"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+        </test>        
     </tests>
     <help><![CDATA[
+**Please note that there is a known issue with SemiBin2 where results may be inconsistent across different runs, even when a fixed random seed is set. This may cause issues with reproducibility.**
+For more information, see the corresponding issue in the SemiBin repository: https://github.com/BigDataBiology/SemiBin/issues/186
+
 @HELP_HEADER@
 
 Inputs