diff pick_otus.xml @ 0:c1bd0c560018 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/qiime commit bcbe76277f3e60303faf826f8ce7f018bc663a9a-dirty
author bebatut
date Tue, 02 Feb 2016 05:50:37 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pick_otus.xml	Tue Feb 02 05:50:37 2016 -0500
@@ -0,0 +1,412 @@
+<tool id="qiime_pick_otus" name="pick otus" version="1.9.1galaxy1">
+
+    <description>OTU picking</description>
+
+    <!-- macros.xml presumably defines the "requirements" and "citations" macros expanded in this file. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+
+    <expand macro="requirements"/>
+
+    <command>
+<![CDATA[
+        ## Cluster the input sequences into OTUs. pick_otus.py writes into the
+        ## "fastasplit" directory, which is where the outputs section looks.
+        pick_otus.py
+            -i '$input_seqs_filepath'
+            -o fastasplit
+
+            ## Resolve the selected method once; every section below tests it.
+            #set $method = str($methode.otu_picking_method)
+
+            #if $method != 'None':
+                -m $method
+            #end if
+
+            ## Options that only exist for the reference-based methods.
+            #if $method in ("uclust_ref", "usearch_ref"):
+                ## An optional dataset left empty renders as "None"; never pass
+                ## that literal string to -r (same guard as -f further down).
+                #if str($methode.refseqs_fp) != 'None':
+                    -r '$methode.refseqs_fp'
+                #end if
+                ## Checkbox offered in both reference sections of the inputs.
+                #if $methode.suppress_new_clusters:
+                    -C
+                #end if
+            #end if
+
+            #if $method in ("uclust", "uclust_ref", "usearch", "usearch_ref", "sumaclust"):
+                #if $methode.similarity:
+                    -s $methode.similarity
+                #end if
+            #end if
+
+            #if $method == "sumaclust":
+                #if $methode.sumaclust_exact:
+                    --sumaclust_exact
+                #end if
+            #end if
+
+            #if $method == "swarm":
+                #if $methode.swarm_resolution:
+                    --swarm_resolution=$methode.swarm_resolution
+                #end if
+            #end if
+
+            ## Options shared by the uclust and usearch families.
+            #if $method in ("uclust", "uclust_ref", "usearch", "usearch_ref"):
+                #if $methode.enable_rev_strand_match:
+                    -z
+                #end if
+                #if str($methode.max_accepts):
+                    --max_accepts=$methode.max_accepts
+                #end if
+                #if str($methode.max_rejects):
+                    --max_rejects=$methode.max_rejects
+                #end if
+            #end if
+
+            #if $method in ("uclust", "uclust_ref"):
+                #if $methode.stepwords:
+                    --stepwords=$methode.stepwords
+                #end if
+                #if $methode.suppress_presort_by_abundance_uclust:
+                    -D
+                #end if
+            #end if
+
+            #if $method == "uclust":
+                #if $methode.optimal_uclust:
+                    -A
+                #end if
+                #if $methode.exact_uclust:
+                    -E
+                #end if
+            #end if
+
+            #if $method == "usearch":
+                #if $methode.percent_id_err:
+                    -j $methode.percent_id_err
+                #end if
+                #if $methode.abundance_skew:
+                    -a $methode.abundance_skew
+                #end if
+                #if str($methode.db_filepath) != 'None':
+                    -f '$methode.db_filepath'
+                #end if
+                #if $methode.perc_id_blast:
+                    --perc_id_blast=$methode.perc_id_blast
+                #end if
+                #if $methode.suppress_de_novo_chimera_detection:
+                    -k
+                #end if
+                ## Checkbox offered in the usearch section of the inputs.
+                #if $methode.usearch_fast_cluster:
+                    --usearch_fast_cluster
+                #end if
+            #end if
+
+            #if $method in ("sumaclust", "swarm"):
+                #if str($methode.threads):
+                    --threads=$methode.threads
+                #end if
+            #end if
+
+            ## Method-independent options.
+            #if $prefix_prefilter_length:
+                -n $prefix_prefilter_length
+            #end if
+
+            #if $prefix_length:
+                -p $prefix_length
+            #end if
+
+            #if $suffix_length:
+                -u $suffix_length
+            #end if
+
+            #if str($non_chimeras_retention):
+                -F $non_chimeras_retention
+            #end if
+]]>
+    </command>
+    
+    <inputs>
+        <!-- Sequences to cluster; restricted to FASTA, the format pick_otus.py takes as input. -->
+        <param name="input_seqs_filepath" type="data" format="fasta" optional="false"
+            label="-i/--input_seqs_filepath: Path to input sequences file"/>
+        <conditional name="methode">
+            <!-- Method selector: each option below has a matching "when" section with its own options. -->
+            <param name="otu_picking_method" type="select" optional="false"
+                label="-m/--otu_picking_method: Method for picking OTUs"
+                help="Valid choices are: sortmerna, mothur, trie, uclust_ref, usearch, usearch_ref, blast, usearch61, usearch61_ref, sumaclust, swarm, prefix_suffix, cdhit, uclust. The mothur method requires an input file of aligned sequences. usearch will enable the usearch quality filtering pipeline. [default: uclust]">
+                <option value="uclust" selected="true">uclust</option>
+                <option value="uclust_ref">uclust_ref</option>
+                <option value="usearch">usearch</option>
+                <option value="usearch_ref">usearch_ref</option>
+                <option value="sumaclust">sumaclust</option>
+                <option value="swarm">swarm</option>
+            </param>
+            <when value="uclust_ref">
+                <!-- Options for reference-based uclust. Defaults are given with "value"/"checked", the attributes Galaxy reads. -->
+                <param name="refseqs_fp" type="data" format="fasta" optional="true"
+                    label="-r/--refseqs_fp: Path to reference sequences"
+                    help="Reference sequences to search against when using -m uclust_ref, -m usearch_ref"/>
+                <param name="similarity" type="float" value="0.97" optional="true"
+                    label="-s/--similarity: Sequence similarity threshold"
+                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna [default: 0.97]"/>
+                <param name="enable_rev_strand_match" type="boolean" checked="false"
+                    label="-z/--enable_rev_strand_match: Enable reverse strand matching"
+                    help="For uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking, will double the amount of memory used. [default: False]"/>
+                <param name="suppress_presort_by_abundance_uclust" type="boolean" checked="false"
+                    label="-D/--suppress_presort_by_abundance_uclust: Suppress presorting of sequences by abundance"
+                    help="When picking OTUs with uclust or uclust_ref [default: False]"/>
+                <param name="suppress_new_clusters" type="boolean" checked="false"
+                    label="-C/--suppress_new_clusters: Suppress creation of new clusters"
+                    help="Suppress creation of new clusters using seqs that don't match reference when using -m uclust_ref, -m usearch61_ref, or -m usearch_ref [default: False]"/>
+                <param name="max_accepts" type="text" value="default" optional="true"
+                    label="--max_accepts: max_accepts value"
+                    help="max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1) [default: default]"/>
+                <param name="max_rejects" type="text" value="default" optional="true"
+                    label="--max_rejects: max_rejects value"
+                    help="max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options) [default: default]"/>
+                <param name="stepwords" type="integer" value="8" optional="true"
+                    label="--stepwords: stepwords value to uclust and uclust_ref [default: 8]"/>
+            </when>
+            <when value="usearch_ref">
+                <!-- Options for reference-based usearch. Defaults are given with "value"/"checked", the attributes Galaxy reads. -->
+                <param name="refseqs_fp" type="data" format="fasta" optional="true"
+                    label="-r/--refseqs_fp: Path to reference sequences"
+                    help="Reference sequences to search against when using -m blast, -m sortmerna, -m uclust_ref, -m usearch_ref, or -m usearch61_ref"/>
+                <param name="similarity" type="float" value="0.97" optional="true"
+                    label="-s/--similarity: Sequence similarity threshold"
+                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna [default: 0.97]"/>
+                <param name="enable_rev_strand_match" type="boolean" checked="false"
+                    label="-z/--enable_rev_strand_match: Enable reverse strand matching"
+                    help="For uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking, will double the amount of memory used. [default: False]"/>
+                <param name="suppress_new_clusters" type="boolean" checked="false"
+                    label="-C/--suppress_new_clusters: Suppress creation of new clusters"
+                    help="Suppress creation of new clusters using seqs that don't match reference when using -m uclust_ref, -m usearch61_ref, or -m usearch_ref [default: False]"/>
+                <param name="max_accepts" type="text" value="default" optional="true"
+                    label="--max_accepts: max_accepts value"
+                    help="max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1) [default: default]"/>
+                <param name="max_rejects" type="text" value="default" optional="true"
+                    label="--max_rejects: max_rejects value"
+                    help="max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options) [default: default]"/>
+            </when>
+            <when value="usearch">
+                <!-- Options for de novo usearch. Defaults are given with "value"/"checked", the attributes Galaxy reads. -->
+                <param name="similarity" type="float" value="0.97" optional="true"
+                    label="-s/--similarity: Sequence similarity threshold"
+                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna [default: 0.97]"/>
+                <param name="enable_rev_strand_match" type="boolean" checked="false"
+                    label="-z/--enable_rev_strand_match: Enable reverse strand matching"
+                    help="For uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking, will double the amount of memory used. [default: False]"/>
+                <param name="max_accepts" type="text" value="default" optional="true"
+                    label="--max_accepts: max_accepts value"
+                    help="max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1) [default: default]"/>
+                <param name="max_rejects" type="text" value="default" optional="true"
+                    label="--max_rejects: max_rejects value"
+                    help="max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options) [default: default]"/>
+                <param name="percent_id_err" type="float" value="0.97" optional="true"
+                    label="-j/--percent_id_err: Percent identity threshold for cluster error detection"
+                    help="Percent identity threshold for cluster error detection with usearch, expressed as a fraction between 0 and 1. [default: 0.97]"/>
+                <param name="abundance_skew" type="float" value="2.0" optional="true"
+                    label="-a/--abundance_skew: Abundance skew setting"
+                    help="Abundance skew setting for de novo chimera detection with usearch. [default: 2.0]"/>
+                <!-- No default: leaving this empty means no reference database is passed. -->
+                <param name="db_filepath" type="data" format="fasta" optional="true"
+                    label="-f/--db_filepath: Reference database of fasta sequences"
+                    help="Reference database of fasta sequences for reference based chimera detection with usearch. [default: None]"/>
+                <param name="perc_id_blast" type="float" value="0.97" optional="true"
+                    label="--perc_id_blast: Percent ID for mapping OTUs back to sequence IDs"
+                    help="Percent ID for mapping OTUs created by usearch back to original sequence IDs [default: 0.97]"/>
+                <param name="suppress_de_novo_chimera_detection" type="boolean" checked="false"
+                    label="-k/--suppress_de_novo_chimera_detection: Suppress de novo chimera detection in usearch. [default: False]"/>
+                <param name="usearch_fast_cluster" type="boolean" checked="false"
+                    label="--usearch_fast_cluster: Use fast clustering option"
+                    help="Use fast clustering option for usearch or usearch61_ref with new clusters. --enable_rev_strand_match can not be enabled with this option, and the only valid option for usearch61_sort_method is 'length'. This option uses more memory than the default option for de novo clustering. [default: False]"/>
+            </when>
+            <when value="sumaclust">
+                <!-- Options for sumaclust. Defaults are given with "value"/"checked", the attributes Galaxy reads. -->
+                <param name="similarity" type="float" value="0.97" optional="true"
+                    label="-s/--similarity: Sequence similarity threshold"
+                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna [default: 0.97]"/>
+                <param name="sumaclust_exact" type="boolean" checked="false"
+                    label="--sumaclust_exact: Assign to the best matching seed"
+                    help="A sequence is assigned to the best matching seed rather than the first matching seed passing the similarity threshold [default: False]"/>
+                <!-- Integer rather than free text so that only a thread count can be entered. -->
+                <param name="threads" type="integer" value="1" min="1" optional="true"
+                    label="--threads: Number of threads"
+                    help="Specify number of threads (1 thread per core) to be used for usearch61, sortmerna, sumaclust and swarm commands that utilize multithreading. [default: 1]"/>
+            </when>
+            <when value="swarm">
+                <!-- Options for swarm. Defaults are given with "value", the attribute Galaxy reads. -->
+                <param name="swarm_resolution" type="integer" value="1" optional="true"
+                    label="--swarm_resolution: Maximum number of differences allowed between two amplicons"
+                    help="Two amplicons will be grouped if they have integer (or less) differences (see Swarm manual at https://github.com/torognes/swarm for more details). [default: 1]"/>
+                <!-- Integer rather than free text so that only a thread count can be entered. -->
+                <param name="threads" type="integer" value="1" min="1" optional="true"
+                    label="--threads: Number of threads"
+                    help="Specify number of threads (1 thread per core) to be used for usearch61, sortmerna, sumaclust and swarm commands that utilize multithreading. [default: 1]"/>
+            </when>
+            <when value="uclust">
+                <!-- Options for de novo uclust. Defaults are given with "value"/"checked", the attributes Galaxy reads. -->
+                <param name="similarity" type="float" value="0.97" optional="true"
+                    label="-s/--similarity: Sequence similarity threshold"
+                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna [default: 0.97]"/>
+                <param name="enable_rev_strand_match" type="boolean" checked="false"
+                    label="-z/--enable_rev_strand_match: Enable reverse strand matching"
+                    help="For uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking, will double the amount of memory used. [default: False]"/>
+                <param name="optimal_uclust" type="boolean" checked="false"
+                    label="-A/--optimal_uclust: Pass the --optimal flag to uclust for uclust otu picking. [default: False]"/>
+                <param name="suppress_presort_by_abundance_uclust" type="boolean" checked="false"
+                    label="-D/--suppress_presort_by_abundance_uclust: Suppress presorting of sequences by abundance"
+                    help="When picking OTUs with uclust or uclust_ref [default: False]"/>
+                <param name="max_accepts" type="text" value="default" optional="true"
+                    label="--max_accepts: max_accepts value"
+                    help="max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1) [default: default]"/>
+                <param name="max_rejects" type="text" value="default" optional="true"
+                    label="--max_rejects: max_rejects value"
+                    help="max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options) [default: default]"/>
+                <param name="stepwords" type="integer" value="8" optional="true"
+                    label="--stepwords: stepwords value to uclust and uclust_ref [default: 8]"/>
+                <param name="exact_uclust" type="boolean" checked="false"
+                    label="-E/--exact_uclust: Pass the --exact flag to uclust for uclust otu picking. [default: False]"/>
+            </when>
+        </conditional>
+
+        <!-- Method-independent options. Defaults are given with "value", the attribute Galaxy reads. -->
+        <!-- Left empty on purpose: the documented default is None, i.e. no prefiltering. -->
+        <param name="prefix_prefilter_length" type="integer" optional="true"
+            label="-n/--prefix_prefilter_length: Prefilter length"
+            help="Prefilter data so seqs with identical first prefix_prefilter_length are automatically grouped into a single OTU. This is useful for large sequence collections where OTU picking doesn't scale well [default: None; 100 is a good value]"/>
+        <param name="prefix_length" type="integer" value="50" optional="true"
+            label="-p/--prefix_length: Prefix length when using the prefix_suffix otu picker"
+            help="WARNING: CURRENTLY DIFFERENT FROM prefix_prefilter_length (-n)! [default: 50]"/>
+        <param name="suffix_length" type="integer" value="50" optional="true"
+            label="-u/--suffix_length: Suffix length when using the prefix_suffix otu picker [default: 50]"/>
+        <!-- Kept as free text so previously saved (possibly empty) values still load. -->
+        <param name="non_chimeras_retention" type="text" value="union" optional="true"
+            label="-F/--non_chimeras_retention: Subset of non-chimeras to retain"
+            help="Selects subsets of sequences detected as non-chimeras to retain after de novo and reference based chimera detection. Options are intersection or union. union will retain sequences that are flagged as non-chimeric from either filter, while intersection will retain only those sequences that are flagged as non-chimeras from both detection methods. [default: union]"/>
+    </inputs>
+    <outputs>
+        <!-- Each dataset is picked up from "fastasplit", the directory handed to pick_otus.py with -o. -->
+        <data name="pick_otus.txt" label="pick_otus.txt" format="txt"
+            from_work_dir="fastasplit/*_otus.txt"/>
+        <data name="pick_otus.log" label="pick_otus.log" format="txt"
+            from_work_dir="fastasplit/*_otus.log"/>
+        <data name="pick_otus_failures.txt" label="pick_otus_failures.txt" format="txt"
+            from_work_dir="fastasplit/*_failures.txt"/>
+    </outputs>
+
+    <tests>
+        <!-- Placeholder only: no inputs or expected outputs are defined yet. -->
+        <test/>
+    </tests>
+
+    <help><![CDATA[
+**What it does**
+
+The OTU picking step assigns similar sequences to operational taxonomic units, or OTUs, by clustering sequences based on a user-defined similarity threshold. Sequences which are similar at or above the threshold level are taken to represent the presence of a taxonomic unit (e.g., a genus, when the similarity threshold is set at 0.94) in the sequence collection.
+
+Currently, the following clustering methods have been implemented in QIIME:
+
+1.  uclust, creates "seeds" of sequences which generate clusters based on percent identity.
+
+2.  uclust_ref, as uclust, but takes a reference database to use as seeds.  New clusters can be toggled on or off.
+
+3.  usearch, creates "seeds" of sequences which generate clusters based on percent identity, filters low abundance clusters, performs de novo and reference based chimera detection.
+
+4.  usearch_ref, as usearch, but takes a reference database to use as seeds.  New clusters can be toggled on or off.
+
+5. sumaclust, creates "seeds" of sequences which generate clusters based on similarity threshold.
+
+6. swarm, creates "seeds" of sequences which generate clusters based on a resolution threshold.
+
+
+Chimera checking with usearch 6.X is implemented in identify_chimeric_seqs.py.  Chimera checking should be done first with usearch 6.X, and the filtered resulting fasta file can then be clustered.
+
+
+The primary inputs for pick_otus.py are:
+
+1. A FASTA file containing sequences to be clustered.
+
+2. An OTU threshold (default is 0.97, roughly corresponding to species-level OTUs).
+
+3. The method to be applied for clustering sequences into OTUs.
+
+pick_otus.py takes a standard fasta file as input.
+
+
+The output consists of two files (i.e. seqs_otus.txt and seqs_otus.log). The .txt file is composed of tab-delimited lines, where the first field on each line corresponds to an (arbitrary) cluster identifier, and the remaining fields correspond to sequence identifiers assigned to that cluster. Sequence identifiers correspond to those provided in the input FASTA file.  Usearch (i.e. usearch quality filter) can additionally have log files for each intermediate call to usearch.
+
+Example lines from the resulting .txt file:
+
+=   ====    ====    ====
+0   seq1    seq5
+1   seq2
+2   seq3
+3   seq4    seq6    seq7
+=   ====    ====    ====
+
+This result implies that four clusters were created based on 7 input sequences. 
+The first cluster (cluster id 0) contains two sequences, sequence ids seq1 and 
+seq5; the second cluster (cluster id 1) contains one sequence, sequence id seq2; 
+the third cluster (cluster id 2) contains one sequence, sequence id seq3, and the 
+final cluster (cluster id 3) contains three sequences, sequence ids seq4, seq6, 
+and seq7.
+
+The resulting .log file contains a list of parameters passed to the pick_otus.py 
+script along with the output location of the resulting .txt file.
+    ]]>
+    </help>
+
+    <citations>
+        <!-- The "citations" macro is expanded first; the DOIs below are listed in addition to it. -->
+        <expand macro="citations"/>
+        <citation type="doi">10.1093/bioinformatics/btv231</citation>
+        <citation type="doi">10.1093/bioinformatics/btq461</citation>
+        <citation type="doi">10.1093/bioinformatics/bts611</citation>
+        <citation type="doi">10.7287/peerj.preprints.386v1/supp-1</citation>
+    </citations>
+</tool>