diff metaphlan.xml @ 8:1416b7c401a3 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/metaphlan/ commit 671a5fc6d4c02bd3eb830c1886a31ecffd134ceb
author iuc
date Sun, 11 Aug 2024 20:34:37 +0000
parents 11136e6b78f2
children f0ca613c512a
line wrap: on
line diff
--- a/metaphlan.xml	Thu Apr 20 11:25:18 2023 +0000
+++ b/metaphlan.xml	Sun Aug 11 20:34:37 2024 +0000
@@ -1,4 +1,4 @@
-<tool id="metaphlan" name="MetaPhlAn" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+<tool id="metaphlan" name="MetaPhlAn" version="@TOOL_VERSION@+galaxy1" profile="@PROFILE@">
     <description>to profile the composition of microbial communities</description>
     <macros>
         <import>macros.xml</import>
@@ -15,8 +15,7 @@
                     <option value="s">Species only</option>
                 </param>
                 <when value="a">
-                    <param name="split_levels" type='boolean' checked="false" truevalue='true' falsevalue='false' 
-                        label="Generate a report for each taxonomic level?" help="It will be in addition to the default output"/>
+                    <param name="split_levels" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate a report for each taxonomic level?" help="It will be in addition to the default output"/>
                 </when>
                 <when value="k"/>
                 <when value="p"/>
@@ -38,14 +37,12 @@
         #set full_ext=$inputs.in.raw_in.in.datatype.file_ext
         #if $full_ext.endswith("gz")
             #set $file_path="in"     
-zcat '$inputs.in.raw_in.in' > '$file_path'
-&&
+            zcat '$inputs.in.raw_in.in' > '$file_path' &&
         #else if $full_ext.endswith("bz2")
             #set $file_path="in"
-bzcat '$inputs.in.raw_in.in' > '$file_path'
-&&
+            bzcat '$inputs.in.raw_in.in' > '$file_path' &&
         #else
-            #set $file_path=$inputs.in.raw_in.in
+            #set $file_path="'%s'" % $inputs.in.raw_in.in
         #end if
     #else if $inputs.in.raw_in.selector == "multiple"
         #set full_ext=$inputs.in.raw_in.in[0].datatype.file_ext
@@ -53,45 +50,54 @@
         #set sep=""
         #for $i, $f in enumerate($inputs.in.raw_in.in)
             #if $f.datatype.file_ext != $full_ext
-echo "Different datatypes for input files"
-&&
-exit 1
+            echo "Different datatypes for input files" &&
+            exit 1
             #end if          
             #if $full_ext.endswith("gz")
                 #set fp="input_%s" % ($i)
-zcat '$f' > '$fp'
-&&
+                zcat '$f' > '$fp' &&
             #else if $full_ext.endswith("bz2")
                 #set fp="input_%s" % ($i)
-bzcat '$f' > '$fp'
-&&
+                bzcat '$f' > '$fp' &&
             #else
                 #set fp=$f
             #end if
-            #set $file_path+="%s%s" % ($sep, $fp)
+            #set $file_path+="'%s%s'" % ($sep, $fp)
             #set $sep=","
         #end for
     #else if $inputs.in.raw_in.selector == "paired"
         #set full_ext=$inputs.in.raw_in.in_f.datatype.file_ext
         #if $full_ext != $inputs.in.raw_in.in_r.datatype.file_ext
-echo "Different datatypes for input paired-end files"
-&&
-exit 1
+            echo "Different datatypes for input paired-end files" &&
+            exit 1
         #end if
         #if $full_ext.endswith("gz")
-zcat '$inputs.in.raw_in.in_f' > 'in_f'
-&&
-zcat '$inputs.in.raw_in.in_r' > 'in_r'
-&&
-            #set file_path="in_f,in_r"
+            zcat '$inputs.in.raw_in.in_f' > 'in_f' &&
+            zcat '$inputs.in.raw_in.in_r' > 'in_r' &&
+            #set file_path="-1 in_f -2 in_r"
         #else if $full_ext.endswith("bz2")
-bzcat '$inputs.in.raw_in.in_f' > 'in_f'
-&&
-bzcat '$inputs.in.raw_in.in_r' > 'in_r'
-&&
-            #set file_path="in_f,in_r"
+            bzcat '$inputs.in.raw_in.in_f' > 'in_f' && 
+            bzcat '$inputs.in.raw_in.in_r' > 'in_r' &&
+            #set file_path="-1 in_f -2 in_r"
         #else
-            #set file_path="%s,%s" % ($inputs.in.raw_in.in_f,$inputs.in.raw_in.in_r)
+            #set file_path="-1 '%s' -2 '%s'" % ($inputs.in.raw_in.in_f,$inputs.in.raw_in.in_r)
+        #end if
+    #else if $inputs.in.raw_in.selector == "paired_collection"
+        #set full_ext=$inputs.in.raw_in.in.forward.ext
+        #if $full_ext != $inputs.in.raw_in.in.reverse.ext
+            echo "Different datatypes for input paired-end files" &&
+            exit 1
+        #end if
+        #if $full_ext.endswith("gz")
+            zcat '$inputs.in.raw_in.in.forward' > 'in_f' &&
+            zcat '$inputs.in.raw_in.in.reverse' > 'in_r' &&
+            #set file_path="-1 in_f -2 in_r"
+        #else if $full_ext.endswith("bz2")
+            bzcat '$inputs.in.raw_in.in.forward' > 'in_f' &&
+            bzcat '$inputs.in.raw_in.in.reverse' > 'in_r' &&
+            #set file_path="-1 in_f -2 in_r"
+        #else
+            #set file_path="-1 '%s' -2 '%s'" % ($inputs.in.raw_in.in.forward,$inputs.in.raw_in.in.reverse)
         #end if
     #end if
 
@@ -105,20 +111,18 @@
 #end if
 
 #if $inputs.db.db_selector == "history"
-mkdir 'ref_db'
-&&
-bowtie2-build --large-index '$inputs.db.bowtie2db' 'ref_db/custom_db'
-&&
-python '$__tool_directory__/customizemetadata.py'
+mkdir 'ref_db' &&
+bowtie2-build --large-index '$inputs.db.bowtie2db' 'ref_db/custom_db' &&
+python
+    '$__tool_directory__/customizemetadata.py'
     transform_json_to_pkl
     --json '$inputs.db.mpa_pkl'
-    --pkl 'ref_db/custom_db.pkl'
-&&
+    --pkl 'ref_db/custom_db.pkl' &&
 #end if
 
 metaphlan
 #if $inputs.in.selector == "raw"
-    '$file_path'
+    $file_path
     --input_type '$ext'
     --read_min_len $inputs.in.read_min_len
     --bt2_ps '$inputs.in.mapping.bt2_ps'
@@ -180,12 +184,34 @@
     $out.use_group_representative
     $out.legacy_output
     $out.CAMI_format_output
-    $out.unknown_estimation
+    $out.unclassified_estimation
     -o '$output_file'
     --bowtie2out 'bowtie2out'
     -s '$sam_output_file'
     --biom '$biom_output_file'
     --nproc \${GALAXY_SLOTS:-4}
+#if $viral_analysis.profile_vsc
+    $viral_analysis.profile_vsc
+    --vsc_out '$vcs_breath_coverage'
+    --vsc_breadth $viral_analysis.vsc_breadth
+#end if
+
+#if $subsample.selector != "no"
+    #if $subsample.selector == "single"
+        --subsampling $subsample.subsampling
+    #else
+        --subsampling_paired $subsample.subsampling_paired
+    #end if
+    $subsample.mapping_subsampling
+    #if $subsample.subsampling_seed
+        --subsampling_seed $subsample.subsampling_seed
+    #end if
+    --subsampling_output subsampled.out
+#end if
+
+#if $test == "false"
+    --offline
+#end if
 
 #if $inputs.in.selector == "raw"
 &&
@@ -227,13 +253,17 @@
                         <param name="selector" type="select" label="Fasta/FastQ file(s) with microbiota reads">
                             <option value="single" selected="true">One single-end file</option>
                             <option value="multiple">Multiple single-end files</option>
+                            <option value="paired_collection">Paired-end collection</option>
                             <option value="paired">Paired-end files</option>
                         </param>
                         <when value="single">
                             <param name="in" type="data" format="@FILE_FORMATS@" label="Single-end Fasta/FastQ file with microbiota reads"/>
                         </when>
                         <when value="multiple">
-                            <param name="in" type="data" format="@FILE_FORMATS@" multiple="true" label="Single-end Fasta/FastQ files with microbiota reads"/>
+                            <param name="in" type="data" format="@FILE_FORMATS@" label="Single-end Fasta/FastQ files with microbiota reads" multiple="true"/>
+                        </when>
+                        <when value="paired_collection">
+                            <param name="in" type="data_collection" format="@FILE_FORMATS@" label="Paired-end Fasta/FastQ file with microbiota reads" collection_type="paired"/>
                         </when>
                         <when value="paired">
                             <param name="in_f" type="data" format="@FILE_FORMATS@" label="Forward paired-end Fasta/FastQ file with microbiota reads"/>
@@ -255,8 +285,7 @@
                     <param name="in" type="data" format="sam" label="Externally BowTie2-mapped SAM file" help="BowTie2 needs to be used first to map microbiota reads"/>
                 </when>
                 <when value="bowtie2out">
-                    <param name="in" type="data" format="tabular" label="Intermediary mapping file of the microbiota generated by a previous MetaPhlAn run" 
-                        help="File needs to be generated with MetaPhlAn versions >3.0"/>
+                    <param name="in" type="data" format="tabular" label="Intermediary mapping file of the microbiota generated by a previous MetaPhlAn run" help="File needs to be generated with MetaPhlAn versions &gt;3.0"/>
                 </when>
             </conditional>
             <conditional name="db">
@@ -286,7 +315,7 @@
                     <option value="reads_map">reads_map: Mapping from reads to clades (only reads hitting a marker)</option>
                     <option value="clade_profiles">clade_profiles: Normalized marker counts for clades with at least a non-null marker</option>
                     <option value="clade_specific_strain_tracker">clade_specific_strain_tracker: List of markers present for a specific clade and all its subclades</option>
-                    <option value="marker_ab_table">marker_ab_table: Normalized marker counts (only when > 0.0 and normalized by microbiota size if number of reads is specified)</option>
+                    <option value="marker_ab_table">marker_ab_table: Normalized marker counts (only when &gt; 0.0 and normalized by microbiota size if number of reads is specified)</option>
                     <option value="marker_counts">marker_counts: Non-normalized marker counts (use with extreme caution)</option>
                     <option value="marker_pres_table">marker_pres_table: List of markers present in the sample (threshold at 1.0 if not differently specified with --pres_th</option>
                 </param>
@@ -299,24 +328,20 @@
                 <when value="reads_map"/>
                 <when value="clade_profiles"/>
                 <when value="clade_specific_strain_tracker">
-                    <param argument="--clade" type="text" value="" label="Clade for which to extract list of markers present" 
-                        help="Markers are also extracted for subclades" />
+                    <param argument="--clade" type="text" value="" label="Clade for which to extract list of markers present" help="Markers are also extracted for subclades"/>
                     <param argument="--min_ab" type="float" optional="true" label="The minimum percentage abundance for the clade"/>
                 </when>
                 <when value="marker_ab_table">
-                    <param argument="--nreads" type="integer" optional="true" label="Total number of reads in the original microbiota" 
-                        help="It is used for normalizing the length-normalized counts with the microbiota size as well. No normalization applied if the value is not specified"/>
+                    <param argument="--nreads" type="integer" optional="true" label="Total number of reads in the original microbiota" help="It is used for normalizing the length-normalized counts with the microbiota size as well. No normalization applied if the value is not specified"/>
                 </when>
                 <when value="marker_counts"/>
                 <when value="marker_pres_table">
                     <param argument="--pres_th" type="integer" optional="true" label="Threshold for calling a marker present"/>
                 </when>
             </conditional>
-            <param argument="--min_cu_len" type="integer" value="2000" 
-                label="Minimum total nucleotide length for the markers in a clade for estimating the abundance without considering sub-clade abundances"/>
-            <param argument="--min_alignment_len" type="integer" optional="true" 
-                label="Sam records for aligned reads with the longest subalignment length smaller than this threshold will be discarded."/>
-            <param name="organism_profiling" type="select" multiple="true" optional="true" label="Organisms to profile">
+            <param argument="--min_cu_len" type="integer" value="2000" label="Minimum total nucleotide length for the markers in a clade for estimating the abundance without considering sub-clade abundances"/>
+            <param argument="--min_alignment_len" type="integer" optional="true" label="Sam records for aligned reads with the longest subalignment length smaller than this threshold will be discarded."/>
+            <param name="organism_profiling" type="select" optional="true" label="Organisms to profile" multiple="true">
                 <option value="add_viruses" selected="true">Profile viral organisms (add_viruses)</option>
                 <option value="ignore_eukaryotes">Ignore eukaryotic organisms (ignore_eukaryotes)</option>
                 <option value="ignore_bacteria">Ignore bacteria organisms (ignore_bacteria)</option>
@@ -334,48 +359,81 @@
             <param argument="--stat_q" type="float" value="0.2" label="Quantile value for the robust average"/>
             <param argument="--perc_nonzero" type="float" value="0.33" label="Percentage of markers with a non zero relative abundance for misidentify a species"/>
             <param argument="--ignore_markers" type="data" format="txt,tabular" optional="true" label="File containing a list of markers to ignore" help="One marker per line"/>
-            <param argument="--avoid_disqm" type='boolean' checked="true" truevalue='--avoid_disqm' falsevalue='' 
-                label="Deactivate the procedure of disambiguating the quasi-markers based on the marker abundance pattern found in the sample?"
-                help="It is generally recommended to keep the disambiguation procedure in order to minimize false positives"/>
+            <param argument="--avoid_disqm" type="boolean" truevalue="--avoid_disqm" falsevalue="" checked="true" label="Deactivate the procedure of disambiguating the quasi-markers based on the marker abundance pattern found in the sample?" help="It is generally recommended to keep the disambiguation procedure in order to minimize false positives"/>
         </section>
+        <conditional name="subsample">
+            <param name="selector" type="select" label="Subsample" help="Subsampling only works for fastq input">
+                <option value="no">No</option>
+                <option value="single">Yes: specify number of reads</option>
+                <option value="paired">Yes: specify number of paired reads</option>
+            </param>
+            <when value="no"/>
+            <when value="single">
+                <param argument="--subsampling" type="integer" min="1" value="" label="Subsample reads" help="Specify the number of reads to be considered"/>
+                <expand macro="subsample_common"/>
+            </when>
+            <when value="paired">
+                <param argument="--subsampling_paired" type="integer" min="1" value="" label="Subsample reads" help="Specify the number of paired reads to be considered. For N there will be floor(N/2) reads selected from the forward and reverse reads each."/>
+                <expand macro="subsample_common"/>
+            </when>
+        </conditional>
+        <conditional name="viral_analysis">
+            <param argument="--profile_vsc" type="select" label="Profile Viruses with VSCs approach">
+                <option value="--profile_vsc">Yes (requires FASTQ input and reference data with VSG fasta)</option>
+                <option value="" selected="true">No</option>
+            </param>
+            <when value="--profile_vsc">
+                <param argument="--vsc_breadth" type="float" min="0" max="1" value="0.75" label="Minimum Breadth of Coverage" help="Minimum coverage (fraction) for a Viral Group to be reported."/>
+            </when>
+            <when value=""/>
+        </conditional>
         <section name="out" title="Outputs" expanded="true">
             <param argument="--sample_id_key" type="text" value="SampleID" label="Sample ID key for this analysis"/>
             <param argument="--sample_id" type="text" value="Metaphlan_Analysis" label="Sample ID for this analysis"/>
-            <param argument="--use_group_representative" type='boolean' checked="false" truevalue='--use_group_representative' falsevalue='' 
-                label="Use a species as representative for species groups?"/>
-            <param argument="--legacy-output" type='boolean' checked="false" truevalue='--legacy-output' falsevalue='' 
-                label="Old MetaPhlAn2 two columns output?"/>
-            <param argument="--CAMI_format_output" type='boolean' checked="false" truevalue='--CAMI_format_output' falsevalue='' 
-                label="Report the profiling using the CAMI output format?"/>
-            <param argument="--unknown_estimation" type='boolean' checked="false" truevalue='--unknown_estimation' falsevalue='' 
-                label="Scale relative abundances to the number of reads mapping to known clades in order to estimate unknowness?"/>
-            <param name="krona_output" type='boolean' checked="false" truevalue='true' falsevalue='false' label="Output for Krona?"/>
+            <param argument="--use_group_representative" type="boolean" truevalue="--use_group_representative" falsevalue="" checked="false" label="Use a species as representative for species groups?"/>
+            <param argument="--legacy-output" type="boolean" truevalue="--legacy-output" falsevalue="" checked="false" label="Old MetaPhlAn2 two columns output?"/>
+            <param argument="--CAMI_format_output" type="boolean" truevalue="--CAMI_format_output" falsevalue="" checked="false" label="Report the profiling using the CAMI output format?"/>
+            <param argument="--unclassified_estimation" type="boolean" truevalue="--unclassified_estimation" falsevalue="" checked="false" label="Scale relative abundances to the number of reads mapping to known clades in order to estimate unknowness?"/>
+            <param name="krona_output" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Output for Krona?"/>
         </section>
+        <!-- enabling this in tests will allow metaphlan to download reference data (we do this only with the smallish TOY DB) -->
+        <param name="test" type="hidden" value="false"/>
     </inputs>
     <outputs>
-        <data name="output_file" format="tabular" label="${tool.name} on ${on_string}: Predicted taxon relative abundances" />
+        <data name="output_file" format="tabular" label="${tool.name} on ${on_string}: Predicted taxon relative abundances"/>
         <data name="bowtie2out" format="tabular" label="${tool.name} on ${on_string}: Bowtie2 output">
             <filter>inputs['in']['selector'] == "raw"</filter>
         </data>
         <data name="sam_output_file" format="sam" label="${tool.name} on ${on_string}: SAM file">
             <filter>inputs['in']['selector'] == "raw"</filter>
         </data>
-        <data name="biom_output_file" format="biom1" label="${tool.name} on ${on_string}: BIOM file" />
-        <collection name="levels" type="list" label="${tool.name} on ${on_string}: Predicted taxon relative abundances at each taxonomic levels" >
+        <data name="biom_output_file" format="biom1" label="${tool.name} on ${on_string}: BIOM file"/>
+        <collection name="levels" type="list" label="${tool.name} on ${on_string}: Predicted taxon relative abundances at each taxonomic levels">
             <discover_datasets pattern="(?P&lt;designation&gt;.+)" directory="split_levels/" format="tabular"/>
             <filter>analysis['analysis_type']['t'] in ['rel_ab', 'rel_ab_w_read_stats'] and analysis['analysis_type']['tax_lev']['tax_lev'] == "a" and analysis['analysis_type']['tax_lev']['split_levels']</filter>
         </collection>
         <data name="krona_output_file" format="tabular" label="${tool.name} on ${on_string}: Predicted taxon relative abundances for Krona">
             <filter>out['krona_output']</filter>
         </data>
+        <data name="vcs_breath_coverage" format="tabular" label="${tool.name} on ${on_string}: VSCs breadth-of-coverage">
+            <filter>viral_analysis['profile_vsc']</filter>
+        </data>
+        <data name="subsample_single" format="fastqsanger" from_work_dir="subsampled.out" label="${tool.name} on ${on_string}: Subsampled reads">
+            <filter>subsample['selector'] == 'single'</filter>
+        </data>
+        <collection name="subsample_paired" type="paired" label="${tool.name} on ${on_string}: Subsampled paired reads">
+            <data name="forward" format="fastqsanger" from_work_dir="subsampled.R1.out"/>
+            <data name="reverse" format="fastqsanger" from_work_dir="subsampled.R2.out"/>
+            <filter>subsample['selector'] == 'paired'</filter>
+        </collection>
     </outputs>
     <tests>
+        <!-- Single uncompressed FASTA file, Cached db -->
         <test expect_num_outputs="6">
             <section name="inputs">
                 <conditional name="in">
                     <param name="selector" value="raw"/>
                     <conditional name="raw_in">
-                        <!-- Single GZ file -->
                         <param name="selector" value="single"/>
                         <param name="in" value="no_taxon_input.fasta"/>
                     </conditional>
@@ -386,7 +444,6 @@
                     </section>
                 </conditional>
                 <conditional name="db">
-                    <!-- Cached db -->
                     <param name="db_selector" value="cached"/>
                     <param name="cached_db" value="test-db-20210409"/>
                 </conditional>
@@ -412,7 +469,7 @@
                 <param name="use_group_representative" value="false"/>
                 <param name="legacy_output" value="false"/>
                 <param name="CAMI_format_output" value="false"/>
-                <param name="unknown_estimation" value="false"/>
+                <param name="unclassified_estimation" value="false"/>
                 <param name="krona_output" value="true"/>
             </section>
             <output name="output_file" ftype="tabular">
@@ -516,13 +573,16 @@
                     <has_size value="1" delta="1"/>
                 </assert_contents>
             </output>
+            <assert_stderr>
+                <has_text text="Downloading" negate="true"/>
+            </assert_stderr>
         </test>
+        <!-- Single GZ file, Cached db -->
         <test expect_num_outputs="6">
             <section name="inputs">
                 <conditional name="in">
                     <param name="selector" value="raw"/>
                     <conditional name="raw_in">
-                        <!-- Single GZ file -->
                         <param name="selector" value="single"/>
                         <param name="in" value="SRS014464-Anterior_nares.fasta.gz"/>
                     </conditional>
@@ -533,7 +593,6 @@
                     </section>
                 </conditional>
                 <conditional name="db">
-                    <!-- Cached db -->
                     <param name="db_selector" value="cached"/>
                     <param name="cached_db" value="test-db-20210409"/>
                 </conditional>
@@ -559,7 +618,7 @@
                 <param name="use_group_representative" value="false"/>
                 <param name="legacy_output" value="false"/>
                 <param name="CAMI_format_output" value="false"/>
-                <param name="unknown_estimation" value="false"/>
+                <param name="unclassified_estimation" value="false"/>
                 <param name="krona_output" value="true"/>
             </section>
             <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size">
@@ -583,7 +642,7 @@
                     <has_text text="k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_lacunata"/>
                 </assert_contents>
             </output>
-            <output_collection name="levels" type="list" >
+            <output_collection name="levels" type="list">
                 <element name="all" ftype="tabular">
                     <assert_contents>
                         <has_text text="Gammaproteobacteria"/>
@@ -661,13 +720,16 @@
                     <has_n_columns n="9"/>
                 </assert_contents>
             </output>
+            <assert_stderr>
+                <has_text text="Downloading" negate="true"/>
+            </assert_stderr>
         </test>
+        <!-- Multiple GZ file, Local db-->
         <test expect_num_outputs="4">
             <section name="inputs">
                 <conditional name="in">
                     <param name="selector" value="raw"/>
                     <conditional name="raw_in">
-                        <!-- Multiple GZ file -->
                         <param name="selector" value="multiple"/>
                         <param name="in" value="SRS014464-Anterior_nares.fasta.gz,SRS014464-Anterior_nares.fasta.gz"/>
                     </conditional>
@@ -678,7 +740,6 @@
                     </section>
                 </conditional>
                 <conditional name="db">
-                    <!-- Local db -->
                     <param name="db_selector" value="history"/>
                     <param name="bowtie2db" value="test-db.fasta"/>
                     <param name="mpa_pkl" value="test-db.json"/>
@@ -705,7 +766,7 @@
                 <param name="use_group_representative" value="false"/>
                 <param name="legacy_output" value="false"/>
                 <param name="CAMI_format_output" value="false"/>
-                <param name="unknown_estimation" value="false"/>
+                <param name="unclassified_estimation" value="false"/>
                 <param name="krona_output" value="false"/>
             </section>
             <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size">
@@ -732,16 +793,19 @@
                     <has_text text="k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_lacunata"/>
                 </assert_contents>
             </output>
+            <assert_stderr>
+                <has_text text="Downloading" negate="true"/>
+            </assert_stderr>
         </test>
-        <test expect_num_outputs="4">
+        <!-- Paired GZ file, Cached db (note: subsample_paired and the included forward and reverse reads are counted separately, because they are all statically defined) -->
+        <test expect_num_outputs="7">
             <section name="inputs">
                 <conditional name="in">
                     <param name="selector" value="raw"/>
                     <conditional name="raw_in">
-                        <!-- Paired GZ file -->
                         <param name="selector" value="paired"/>
-                        <param name="in_f" value="SRS014464-Anterior_nares.fasta.gz"/>
-                        <param name="in_r" value="SRS014464-Anterior_nares.fasta.gz"/>
+                        <param name="in_f" value="SRS014464-Anterior_nares.fastq.gz"/>
+                        <param name="in_r" value="SRS014464-Anterior_nares.fastq.gz"/>
                     </conditional>
                     <param name="read_min_len" value="70"/>
                     <section name="mapping">
@@ -750,7 +814,183 @@
                     </section>
                 </conditional>
                 <conditional name="db">
-                    <!-- Cached db -->
+                    <param name="db_selector" value="cached"/>
+                    <param name="cached_db" value="test-db-20210409"/>
+                </conditional>
+            </section>
+            <section name="analysis">
+                <conditional name="analysis_type">
+                    <param name="t" value="rel_ab"/>
+                    <conditional name="tax_lev">
+                        <param name="tax_lev" value="a"/>
+                        <param name="split_levels" value="false"/>
+                    </conditional>
+                </conditional>
+                <param name="min_cu_len" value="2000"/>
+                <param name="organism_profiling" value="add_viruses"/>
+                <param name="stat" value="avg_g"/>
+                <param name="stat_q" value="0.2"/>
+                <param name="perc_nonzero" value="0.33"/>
+                <param name="avoid_disqm" value="true"/>
+            </section>
+            <conditional name="subsample">
+                <param name="selector" value="paired"/>
+                <param name="subsampling_paired" value="20257"/>
+                <param name="subsampling_seed" value="42"/>
+            </conditional>
+            <section name="out">
+                <param name="sample_id_key" value="SampleID"/>
+                <param name="sample_id" value="Metaphlan_Analysis"/>
+                <param name="use_group_representative" value="false"/>
+                <param name="legacy_output" value="false"/>
+                <param name="CAMI_format_output" value="false"/>
+                <param name="unclassified_estimation" value="false"/>
+                <param name="krona_output" value="false"/>
+            </section>
+            <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size">
+                <assert_contents>
+                    <has_text text="k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_accolens"/>
+                    <has_text text="relative_abundance"/>
+                    <has_text text="NCBI_tax_id"/>
+                    <has_text text="clade_name"/>
+                </assert_contents>
+            </output>
+            <output name="bowtie2out" ftype="tabular" file="SRS014464-Anterior_nares-two-inputs-bowtie2out.tabular" compare="sim_size">
+                <assert_contents>
+                    <has_text text="HWI-EAS109_102883399:3:107:9938:7093/1"/>
+                    <has_text text="90240__A0A378QWM4__NCTC12877_00123"/>
+                </assert_contents>
+            </output>
+            <output name="sam_output_file" ftype="sam">
+                <assert_contents>
+                    <has_size min="52400" max="52600"/>
+                    <has_text text="SN:13076__A0A2I1PE66__CYJ72_10760"/>
+                </assert_contents>
+            </output>
+            <output name="biom_output_file" ftype="biom1" file="SRS014464-Anterior_nares.biom" compare="sim_size">
+                <assert_contents>
+                    <has_text text="k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_accolens"/>
+                </assert_contents>
+            </output>
+            <output_collection name="subsample_paired" type="paired">
+                <element name="forward">
+                    <assert_contents>
+                        <has_line_matching expression="^@.*" n="10128"/>
+                    </assert_contents>
+                </element>
+                <element name="reverse">
+                    <assert_contents>
+                        <has_line_matching expression="^@.*" n="10128"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <assert_stderr>
+                <has_text text="Downloading" negate="true"/>
+            </assert_stderr>
+        </test>
+        <!-- Paired GZ file as collection, Cached db (note: subsample_paired and the included forward and reverse reads are counted separately, because they are all statically defined) -->
+        <test expect_num_outputs="7">
+            <section name="inputs">
+                <conditional name="in">
+                    <param name="selector" value="raw"/>
+                    <conditional name="raw_in">
+                        <param name="selector" value="paired_collection"/>
+                        <param name="in">
+                            <collection type="paired" name="pair">
+                                <element name="forward" value="SRS014464-Anterior_nares.fastq.gz"/>
+                                <element name="reverse" value="SRS014464-Anterior_nares.fastq.gz"/>
+                            </collection>
+                        </param>
+                    </conditional>
+                    <param name="read_min_len" value="70"/>
+                    <section name="mapping">
+                        <param name="bt2_ps" value="sensitive"/>
+                        <param name="min_mapq_val" value="5"/>
+                    </section>
+                </conditional>
+                <conditional name="db">
+                    <param name="db_selector" value="cached"/>
+                    <param name="cached_db" value="test-db-20210409"/>
+                </conditional>
+            </section>
+            <section name="analysis">
+                <conditional name="analysis_type">
+                    <param name="t" value="rel_ab"/>
+                    <conditional name="tax_lev">
+                        <param name="tax_lev" value="a"/>
+                        <param name="split_levels" value="false"/>
+                    </conditional>
+                </conditional>
+                <param name="min_cu_len" value="2000"/>
+                <param name="organism_profiling" value="add_viruses"/>
+                <param name="stat" value="avg_g"/>
+                <param name="stat_q" value="0.2"/>
+                <param name="perc_nonzero" value="0.33"/>
+                <param name="avoid_disqm" value="true"/>
+            </section>
+            <conditional name="subsample">
+                <param name="selector" value="paired"/>
+                <param name="subsampling_paired" value="20257"/>
+                <param name="subsampling_seed" value="42"/>
+            </conditional>
+            <section name="out">
+                <param name="sample_id_key" value="SampleID"/>
+                <param name="sample_id" value="Metaphlan_Analysis"/>
+                <param name="use_group_representative" value="false"/>
+                <param name="legacy_output" value="false"/>
+                <param name="CAMI_format_output" value="false"/>
+                <param name="unclassified_estimation" value="false"/>
+                <param name="krona_output" value="false"/>
+            </section>
+            <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size">
+                <assert_contents>
+                    <has_text text="k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_accolens"/>
+                    <has_text text="relative_abundance"/>
+                    <has_text text="NCBI_tax_id"/>
+                    <has_text text="clade_name"/>
+                </assert_contents>
+            </output>
+            <output name="bowtie2out" ftype="tabular" file="SRS014464-Anterior_nares-two-inputs-bowtie2out.tabular" compare="sim_size">
+                <assert_contents>
+                    <has_text text="HWI-EAS109_102883399:3:107:9938:7093/1"/>
+                    <has_text text="90240__A0A378QWM4__NCTC12877_00123"/>
+                </assert_contents>
+            </output>
+            <output name="sam_output_file" ftype="sam">
+                <assert_contents>
+                    <has_size min="52400" max="52600"/>
+                    <has_text text="SN:13076__A0A2I1PE66__CYJ72_10760"/>
+                </assert_contents>
+            </output>
+            <output name="biom_output_file" ftype="biom1" file="SRS014464-Anterior_nares.biom" compare="sim_size">
+                <assert_contents>
+                    <has_text text="k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_accolens"/>
+                </assert_contents>
+            </output>
+            <output_collection name="subsample_paired" type="paired">
+                <element name="forward">
+                    <assert_contents>
+                        <has_line_matching expression="^@.*" n="10128"/>
+                    </assert_contents>
+                </element>
+                <element name="reverse">
+                    <assert_contents>
+                        <has_line_matching expression="^@.*" n="10128"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <assert_stderr>
+                <has_text text="Downloading" negate="true"/>
+            </assert_stderr>
+        </test>
+        <!-- SAM, cached DB -->
+        <test expect_num_outputs="2">
+            <section name="inputs">
+                <conditional name="in">
+                    <param name="selector" value="sam"/>
+                    <param name="in" value="SRS014464-Anterior_nares.sam"/>
+                </conditional>
+                <conditional name="db">
                     <param name="db_selector" value="cached"/>
                     <param name="cached_db" value="test-db-20210409"/>
                 </conditional>
@@ -776,69 +1016,7 @@
                 <param name="use_group_representative" value="false"/>
                 <param name="legacy_output" value="false"/>
                 <param name="CAMI_format_output" value="false"/>
-                <param name="unknown_estimation" value="false"/>
-                <param name="krona_output" value="false"/>
-            </section>
-            <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size">
-                <assert_contents>
-                    <has_text text="k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Corynebacteriales|f__Corynebacteriaceae|g__Corynebacterium|s__Corynebacterium_accolens"/>
-                    <has_text text="relative_abundance"/>
-                    <has_text text="NCBI_tax_id"/>
-                    <has_text text="clade_name"/>
-                </assert_contents>
-            </output>
-            <output name="bowtie2out" ftype="tabular" file="SRS014464-Anterior_nares-two-inputs-bowtie2out.tabular" compare="sim_size">
-                <assert_contents>
-                    <has_text text="HWI-EAS109_102883399:3:104:7342:14360/1"/>
-                    <has_text text="37637__U2I1U8__N579_01580"/>
-                </assert_contents>
-            </output>
-            <output name="sam_output_file" ftype="sam" file="SRS014464-Anterior_nares-two-inputs.sam" compare="sim_size">
-                <assert_contents>
-                    <has_text text="SN:13076__A0A2I1PE66__CYJ72_10760"/>
-                </assert_contents>
-            </output>
-            <output name="biom_output_file" ftype="biom1" file="SRS014464-Anterior_nares.biom" compare="sim_size">
-                <assert_contents>
-                    <has_text text="k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_lacunata"/>
-                </assert_contents>
-            </output>
-        </test>
-        <test expect_num_outputs="2">
-            <section name="inputs">
-                <conditional name="in">
-                    <!-- SAM -->
-                    <param name="selector" value="sam"/>
-                    <param name="in" value="SRS014464-Anterior_nares.sam"/>
-                </conditional>
-                <conditional name="db">
-                    <!-- Cached db -->
-                    <param name="db_selector" value="cached"/>
-                    <param name="cached_db" value="test-db-20210409"/>
-                </conditional>
-            </section>
-            <section name="analysis">
-                <conditional name="analysis_type">
-                    <param name="t" value="rel_ab"/>
-                    <conditional name="tax_lev">
-                        <param name="tax_lev" value="a"/>
-                        <param name="split_levels" value="false"/>
-                    </conditional>
-                </conditional>
-                <param name="min_cu_len" value="2000"/>
-                <param name="organism_profiling" value="add_viruses"/>
-                <param name="stat" value="avg_g"/>
-                <param name="stat_q" value="0.2"/>
-                <param name="perc_nonzero" value="0.33"/>
-                <param name="avoid_disqm" value="true"/>
-            </section>
-            <section name="out">
-                <param name="sample_id_key" value="SampleID"/>
-                <param name="sample_id" value="Metaphlan_Analysis"/>
-                <param name="use_group_representative" value="false"/>
-                <param name="legacy_output" value="false"/>
-                <param name="CAMI_format_output" value="false"/>
-                <param name="unknown_estimation" value="false"/>
+                <param name="unclassified_estimation" value="false"/>
                 <param name="krona_output" value="false"/>
             </section>
             <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size">
@@ -854,16 +1032,18 @@
                     <has_text text="k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_lacunata"/>
                 </assert_contents>
             </output>
+            <assert_stderr>
+                <has_text text="Downloading" negate="true"/>
+            </assert_stderr>
         </test>
+        <!-- bowtie2out, cached DB -->
         <test expect_num_outputs="2">
             <section name="inputs">
                 <conditional name="in">
-                    <!-- bowtie2out -->
                     <param name="selector" value="bowtie2out"/>
                     <param name="in" value="SRS014464-Anterior_nares-bowtie2out.tabular"/>
                 </conditional>
                 <conditional name="db">
-                    <!-- Cached db -->
                     <param name="db_selector" value="cached"/>
                     <param name="cached_db" value="test-db-20210409"/>
                 </conditional>
@@ -897,7 +1077,7 @@
                 <param name="use_group_representative" value="false"/>
                 <param name="legacy_output" value="false"/>
                 <param name="CAMI_format_output" value="false"/>
-                <param name="unknown_estimation" value="false"/>
+                <param name="unclassified_estimation" value="false"/>
                 <param name="krona_output" value="false"/>
             </section>
             <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-abundances.tabular" compare="sim_size">
@@ -913,13 +1093,16 @@
                     <has_text text="k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_lacunata"/>
                 </assert_contents>
             </output>
+            <assert_stderr>
+                <has_text text="Downloading" negate="true"/>
+            </assert_stderr>
         </test>
+        <!-- Single FASTA file, Cached db -->
         <test expect_num_outputs="6">
             <section name="inputs">
                 <conditional name="in">
                     <param name="selector" value="raw"/>
                     <conditional name="raw_in">
-                        <!-- Single FASTA file -->
                         <param name="selector" value="single"/>
                         <param name="in" value="SRS014464-Anterior_nares.fasta"/>
                     </conditional>
@@ -930,7 +1113,6 @@
                     </section>
                 </conditional>
                 <conditional name="db">
-                    <!-- Cached db -->
                     <param name="db_selector" value="cached"/>
                     <param name="cached_db" value="test-db-20210409"/>
                 </conditional>
@@ -957,7 +1139,7 @@
                 <param name="use_group_representative" value="false"/>
                 <param name="legacy_output" value="true"/>
                 <param name="CAMI_format_output" value="false"/>
-                <param name="unknown_estimation" value="false"/>
+                <param name="unclassified_estimation" value="false"/>
                 <param name="krona_output" value="true"/>
             </section>
             <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-legacy-abundances.tabular" compare="sim_size">
@@ -983,7 +1165,7 @@
                     <has_text text="k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Moraxella|s__Moraxella_lacunata"/>
                 </assert_contents>
             </output>
-            <output_collection name="levels" type="list" >
+            <output_collection name="levels" type="list">
                 <element name="all" ftype="tabular">
                     <assert_contents>
                         <has_text text="Gammaproteobacteria"/>
@@ -1054,20 +1236,24 @@
                     <has_n_columns n="9"/>
                 </assert_contents>
             </output>
+            <assert_stderr>
+                <has_text text="Downloading" negate="true"/>
+            </assert_stderr>
         </test>
-        <!-- Check a non-default analysis mode -->
-        <test expect_num_outputs="4">
+        <!-- Check a non-default analysis mode 
+             and viral analysis -->
+        <test expect_num_outputs="6">
             <section name="inputs">
                 <conditional name="in">
                     <param name="selector" value="raw"/>
                     <conditional name="raw_in">
                         <param name="selector" value="single"/>
-                        <param name="in" value="SRS014464-Anterior_nares.fasta"/>
+                        <param name="in" value="SRS014464-Anterior_nares.fastq.gz"/>
                     </conditional>
                 </conditional>
                 <conditional name="db">
                     <param name="db_selector" value="cached"/>
-                    <param name="cached_db" value="test-db-20210409"/>
+                    <param name="cached_db" value="mpa_vJan21_TOY_CHOCOPhlAnSGB"/>
                 </conditional>
             </section>
             <section name="analysis">
@@ -1075,13 +1261,44 @@
                     <param name="t" value="marker_ab_table"/>
                 </conditional>
             </section>
+            <conditional name="viral_analysis">
+                <param name="profile_vsc" value="--profile_vsc"/>
+                <param name="vsc_out" value="true"/>
+            </conditional>
+            <conditional name="subsample">
+                <param name="selector" value="single"/>
+                <param name="subsampling" value="10000"/>
+                <param name="subsampling_seed" value="42"/>
+            </conditional>
+            <param name="test" value="true"/>
             <output name="output_file" ftype="tabular" file="SRS014464-Anterior_nares-legacy-abundances.tabular" compare="sim_size">
                 <assert_contents>
-                    <has_text text="29394__H3NC06__B8A41_08715"/>
+                    <has_text text="SGB7017__MKDPKOFL_00679"/>
                     <has_text text="SampleID"/>
                     <has_text text="Metaphlan_Analysis"/>
                 </assert_contents>
             </output>
+            <output name="subsample_single">
+                <assert_contents>
+                    <has_text text="@" n="10000"/>
+                </assert_contents>
+            </output>
+            <!-- reference data empty -> empty output -->
+            <output name="vcs_breath_coverage" ftype="tabular">
+                <assert_contents>
+                    <has_size size="0"/>
+                </assert_contents>
+            </output>
+            <assert_command>
+                <has_text text="--profile_vsc"/>
+                <has_text text="--vsc_breadth 0.75"/>
+                <has_text text="--vsc_out"/>
+            </assert_command>
+            <assert_stderr>
+                <has_text text="Downloading"/>
+                <!-- Due to test=true and the absence of the TOY reference DB, MetaPhlAn will download it (about 10MB) -->
+                <has_text text="No reads aligning to VSC markers"/>
+            </assert_stderr>
         </test>
     </tests>
     <help><![CDATA[