view pick_otus.xml @ 0:c1bd0c560018 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/qiime commit bcbe76277f3e60303faf826f8ce7f018bc663a9a-dirty
author bebatut
date Tue, 02 Feb 2016 05:50:37 -0500
parents
children
line wrap: on
line source

<tool id="qiime_pick_otus" name="pick otus" version="1.9.1galaxy1">

    <!-- Short summary displayed next to the tool name. -->
    <description>OTU picking</description>
    
    <!-- Shared macro definitions are pulled in from macros.xml. -->
    <macros>
        <import>macros.xml</import>
    </macros>

    <!-- Expands the "requirements" macro (presumably defined in macros.xml; confirm there). -->
    <expand macro="requirements" />

    <!--
        Cheetah template that assembles the pick_otus.py command line.

        - Results are written to the "fastasplit" directory.
        - Method-specific options are emitted only for the method selected in
          the "methode" conditional, because the corresponding parameters only
          exist inside that branch of the form.
        - Dataset paths and free-text values are single-quoted so that they
          reach pick_otus.py as one shell word each.
    -->
    <command>
<![CDATA[
        pick_otus.py
            -i '$input_seqs_filepath'
            -o fastasplit

            #set $method = str($methode.otu_picking_method)

            #if $method != 'None':
                -m $method
            #end if

            ## Options shared by the two reference-based methods.
            #if $method in ("uclust_ref", "usearch_ref"):
                ## An unselected optional dataset renders as the string "None"
                ## (same convention as the db_filepath check below); skip -r in
                ## that case instead of passing "None" as a path.
                #if str($methode.refseqs_fp) != 'None':
                    -r '$methode.refseqs_fp'
                #end if
                ## The form exposes this flag for both reference-based methods.
                #if $methode.suppress_new_clusters:
                    -C
                #end if
            #end if

            #if $method in ("uclust", "uclust_ref", "usearch", "usearch_ref", "sumaclust"):
                #if $methode.similarity:
                    -s $methode.similarity
                #end if
            #end if

            #if $method == "sumaclust":
                #if $methode.sumaclust_exact:
                    --sumaclust_exact
                #end if
            #end if

            #if $method == "swarm":
                #if $methode.swarm_resolution:
                    --swarm_resolution=$methode.swarm_resolution
                #end if
            #end if

            #if $method in ("uclust", "uclust_ref", "usearch", "usearch_ref"):
                #if $methode.enable_rev_strand_match:
                    -z
                #end if
                ## max_accepts / max_rejects are free text (they accept the
                ## literal word "default"), hence the quoting.
                #if str($methode.max_accepts):
                    --max_accepts='$methode.max_accepts'
                #end if

                #if str($methode.max_rejects):
                    --max_rejects='$methode.max_rejects'
                #end if
            #end if

            #if $method in ("uclust", "uclust_ref"):
                #if $methode.stepwords:
                    --stepwords=$methode.stepwords
                #end if
                #if $methode.suppress_presort_by_abundance_uclust:
                    -D
                #end if
            #end if

            #if $method == "uclust":
                #if $methode.optimal_uclust:
                    -A
                #end if
                #if $methode.exact_uclust:
                    -E
                #end if
            #end if

            #if $method == "usearch":
                #if $methode.percent_id_err:
                    -j $methode.percent_id_err
                #end if
                #if $methode.abundance_skew:
                    -a $methode.abundance_skew
                #end if
                #if str($methode.db_filepath) != 'None':
                    -f '$methode.db_filepath'
                #end if
                #if $methode.perc_id_blast:
                    --perc_id_blast=$methode.perc_id_blast
                #end if
                #if $methode.suppress_de_novo_chimera_detection:
                    -k
                #end if
                ## The form exposes this flag in the usearch branch.
                #if $methode.usearch_fast_cluster:
                    --usearch_fast_cluster
                #end if
            #end if

            #if $method in ("sumaclust", "swarm"):
                #if str($methode.threads):
                    --threads='$methode.threads'
                #end if
            #end if

            ## Method-independent options defined at the top level of the form.
            #if $prefix_prefilter_length:
                -n $prefix_prefilter_length
            #end if

            #if $prefix_length:
                -p $prefix_length
            #end if

            #if $suffix_length:
                -u $suffix_length
            #end if

            #if str($non_chimeras_retention):
                -F '$non_chimeras_retention'
            #end if
]]>
    </command>
    
    <!--
        Tool form.

        Parameter names are referenced by the <command> template and must stay
        unchanged. Notes on the attribute conventions used here:
        - Initial values are given with "value" and boolean state with
          "checked", which are the attributes Galaxy reads for parameters.
        - Every initial value repeats the pick_otus.py default quoted in the
          option description, so sending it explicitly does not change results.
        - Dataset parameters cannot default to a filesystem path, so the
          site-specific reference path is no longer listed; they declare the
          expected FASTA format instead.
        - Short text goes in "label", the long description in "help", keeping
          attribute values on a single line.
    -->
    <inputs>
        <param name="input_seqs_filepath" type="data" format="fasta" optional="false"
            label="-i/--input_seqs_filepath: Path to input sequences file"/>

        <conditional name="methode">
            <param name="otu_picking_method" type="select" optional="false"
                label="-m/--otu_picking_method: Method for picking OTUs"
                help="usearch will enable the usearch quality filtering pipeline. [default: uclust]">
                <option value="uclust" selected="true">uclust</option>
                <option value="uclust_ref">uclust_ref</option>
                <option value="usearch">usearch</option>
                <option value="usearch_ref">usearch_ref</option>
                <option value="sumaclust">sumaclust</option>
                <option value="swarm">swarm</option>
            </param>

            <when value="uclust_ref">
                <param name="refseqs_fp" type="data" format="fasta" optional="true"
                    label="-r/--refseqs_fp: Path to reference sequences to search against"
                    help="Used with -m uclust_ref and -m usearch_ref."/>
                <param name="similarity" type="float" value="0.97" optional="true"
                    label="-s/--similarity: Sequence similarity threshold"
                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna. [default: 0.97]"/>
                <param name="enable_rev_strand_match" type="boolean" checked="false"
                    label="-z/--enable_rev_strand_match: Enable reverse strand matching"
                    help="For uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking; will double the amount of memory used. [default: False]"/>
                <param name="suppress_presort_by_abundance_uclust" type="boolean" checked="false"
                    label="-D/--suppress_presort_by_abundance_uclust: Suppress presorting of sequences by abundance"
                    help="Applies when picking OTUs with uclust or uclust_ref. [default: False]"/>
                <param name="suppress_new_clusters" type="boolean" checked="false"
                    label="-C/--suppress_new_clusters: Suppress creation of new clusters"
                    help="Suppress creation of new clusters using seqs that don't match reference when using -m uclust_ref, -m usearch61_ref, or -m usearch_ref. [default: False]"/>
                <param name="max_accepts" type="text" value="default" optional="true"
                    label="--max_accepts: max_accepts value"
                    help="max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1). [default: default]"/>
                <param name="max_rejects" type="text" value="default" optional="true"
                    label="--max_rejects: max_rejects value"
                    help="max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options). [default: default]"/>
                <param name="stepwords" type="integer" value="8" optional="true"
                    label="--stepwords: stepwords value"
                    help="stepwords value to uclust and uclust_ref. [default: 8]"/>
            </when>

            <when value="usearch_ref">
                <param name="refseqs_fp" type="data" format="fasta" optional="true"
                    label="-r/--refseqs_fp: Path to reference sequences to search against"
                    help="Used with -m blast, -m sortmerna, -m uclust_ref, -m usearch_ref, or -m usearch61_ref."/>
                <param name="similarity" type="float" value="0.97" optional="true"
                    label="-s/--similarity: Sequence similarity threshold"
                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna. [default: 0.97]"/>
                <param name="enable_rev_strand_match" type="boolean" checked="false"
                    label="-z/--enable_rev_strand_match: Enable reverse strand matching"
                    help="For uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking; will double the amount of memory used. [default: False]"/>
                <param name="suppress_new_clusters" type="boolean" checked="false"
                    label="-C/--suppress_new_clusters: Suppress creation of new clusters"
                    help="Suppress creation of new clusters using seqs that don't match reference when using -m uclust_ref, -m usearch61_ref, or -m usearch_ref. [default: False]"/>
                <param name="max_accepts" type="text" value="default" optional="true"
                    label="--max_accepts: max_accepts value"
                    help="max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1). [default: default]"/>
                <param name="max_rejects" type="text" value="default" optional="true"
                    label="--max_rejects: max_rejects value"
                    help="max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options). [default: default]"/>
            </when>

            <when value="usearch">
                <param name="similarity" type="float" value="0.97" optional="true"
                    label="-s/--similarity: Sequence similarity threshold"
                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna. [default: 0.97]"/>
                <param name="enable_rev_strand_match" type="boolean" checked="false"
                    label="-z/--enable_rev_strand_match: Enable reverse strand matching"
                    help="For uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking; will double the amount of memory used. [default: False]"/>
                <param name="max_accepts" type="text" value="default" optional="true"
                    label="--max_accepts: max_accepts value"
                    help="max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1). [default: default]"/>
                <param name="max_rejects" type="text" value="default" optional="true"
                    label="--max_rejects: max_rejects value"
                    help="max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options). [default: default]"/>
                <param name="percent_id_err" type="float" value="0.97" optional="true"
                    label="-j/--percent_id_err: Percent identity threshold for cluster error detection"
                    help="Used with usearch, expressed as a fraction between 0 and 1. [default: 0.97]"/>
                <param name="abundance_skew" type="float" value="2.0" optional="true"
                    label="-a/--abundance_skew: Abundance skew setting"
                    help="Abundance skew setting for de novo chimera detection with usearch. [default: 2.0]"/>
                <param name="db_filepath" type="data" format="fasta" optional="true"
                    label="-f/--db_filepath: Reference database of fasta sequences"
                    help="Reference database for reference based chimera detection with usearch. [default: None]"/>
                <param name="perc_id_blast" type="float" value="0.97" optional="true"
                    label="--perc_id_blast: Percent ID for mapping OTUs back to original sequence IDs"
                    help="Percent ID for mapping OTUs created by usearch back to original sequence IDs. [default: 0.97]"/>
                <param name="suppress_de_novo_chimera_detection" type="boolean" checked="false"
                    label="-k/--suppress_de_novo_chimera_detection: Suppress de novo chimera detection"
                    help="Suppress de novo chimera detection in usearch. [default: False]"/>
                <param name="usearch_fast_cluster" type="boolean" checked="false"
                    label="--usearch_fast_cluster: Use fast clustering option"
                    help="Use fast clustering option for usearch or usearch61_ref with new clusters. --enable_rev_strand_match can not be enabled with this option, and the only valid option for usearch61_sort_method is 'length'. This option uses more memory than the default option for de novo clustering. [default: False]"/>
            </when>

            <when value="sumaclust">
                <param name="similarity" type="float" value="0.97" optional="true"
                    label="-s/--similarity: Sequence similarity threshold"
                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna. [default: 0.97]"/>
                <param name="sumaclust_exact" type="boolean" checked="false"
                    label="--sumaclust_exact: Assign to the best matching seed"
                    help="A sequence is assigned to the best matching seed rather than the first matching seed passing the similarity threshold. [default: False]"/>
                <param name="threads" type="integer" value="1" min="1" optional="true"
                    label="--threads: Number of threads"
                    help="Specify number of threads (1 thread per core) to be used for usearch61, sortmerna, sumaclust and swarm commands that utilize multithreading. [default: 1]"/>
            </when>

            <when value="swarm">
                <param name="swarm_resolution" type="integer" value="1" optional="true"
                    label="--swarm_resolution: Maximum number of differences allowed between two amplicons"
                    help="Two amplicons will be grouped if they have integer (or less) differences (see Swarm manual at https://github.com/torognes/swarm for more details). [default: 1]"/>
                <param name="threads" type="integer" value="1" min="1" optional="true"
                    label="--threads: Number of threads"
                    help="Specify number of threads (1 thread per core) to be used for usearch61, sortmerna, sumaclust and swarm commands that utilize multithreading. [default: 1]"/>
            </when>

            <when value="uclust">
                <param name="similarity" type="float" value="0.97" optional="true"
                    label="-s/--similarity: Sequence similarity threshold"
                    help="For blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna. [default: 0.97]"/>
                <param name="enable_rev_strand_match" type="boolean" checked="false"
                    label="-z/--enable_rev_strand_match: Enable reverse strand matching"
                    help="For uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking; will double the amount of memory used. [default: False]"/>
                <param name="optimal_uclust" type="boolean" checked="false"
                    label="-A/--optimal_uclust: Pass the --optimal flag to uclust"
                    help="Pass the --optimal flag to uclust for uclust otu picking. [default: False]"/>
                <param name="suppress_presort_by_abundance_uclust" type="boolean" checked="false"
                    label="-D/--suppress_presort_by_abundance_uclust: Suppress presorting of sequences by abundance"
                    help="Applies when picking OTUs with uclust or uclust_ref. [default: False]"/>
                <param name="max_accepts" type="text" value="default" optional="true"
                    label="--max_accepts: max_accepts value"
                    help="max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1). [default: default]"/>
                <param name="max_rejects" type="text" value="default" optional="true"
                    label="--max_rejects: max_rejects value"
                    help="max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options). [default: default]"/>
                <param name="stepwords" type="integer" value="8" optional="true"
                    label="--stepwords: stepwords value"
                    help="stepwords value to uclust and uclust_ref. [default: 8]"/>
                <param name="exact_uclust" type="boolean" checked="false"
                    label="-E/--exact_uclust: Pass the --exact flag to uclust"
                    help="Pass the --exact flag to uclust for uclust otu picking. [default: False]"/>
            </when>
        </conditional>

        <!-- Left empty by default: the option is only sent when a value is entered. -->
        <param name="prefix_prefilter_length" type="integer" optional="true"
            label="-n/--prefix_prefilter_length: Prefilter length"
            help="Prefilter data so seqs with identical first prefix_prefilter_length are automatically grouped into a single OTU. This is useful for large sequence collections where OTU picking doesn't scale well. [default: None; 100 is a good value]"/>
        <param name="prefix_length" type="integer" value="50" optional="true"
            label="-p/--prefix_length: Prefix length"
            help="Prefix length when using the prefix_suffix otu picker; WARNING: CURRENTLY DIFFERENT FROM prefix_prefilter_length (-n)! [default: 50]"/>
        <param name="suffix_length" type="integer" value="50" optional="true"
            label="-u/--suffix_length: Suffix length"
            help="Suffix length when using the prefix_suffix otu picker. [default: 50]"/>
        <!-- Restricted to the two values the option description allows. -->
        <param name="non_chimeras_retention" type="select" optional="false"
            label="-F/--non_chimeras_retention: Subsets of non-chimeric sequences to retain"
            help="Selects subsets of sequences detected as non-chimeras to retain after de novo and reference based chimera detection. union will retain sequences that are flagged as non-chimeric from either filter, while intersection will retain only those sequences that are flagged as non-chimeras from both detection methods. [default: union]">
            <option value="union" selected="true">union</option>
            <option value="intersection">intersection</option>
        </param>
    </inputs>
    <!--
        Result datasets, each taken from the "fastasplit" directory that the
        command passes to pick_otus.py via -o.
        NOTE(review): the from_work_dir values contain wildcards; confirm that
        the target Galaxy release resolves patterns here rather than requiring
        a literal relative path.
    -->
    <outputs>
        <!-- OTU map. -->
        <data name="pick_otus.txt"
            label="pick_otus.txt"
            format="txt"
            from_work_dir="fastasplit/*_otus.txt"/>
        <!-- Run log. -->
        <data name="pick_otus.log"
            label="pick_otus.log"
            format="txt"
            from_work_dir="fastasplit/*_otus.log"/>
        <!-- Failures file. -->
        <data name="pick_otus_failures.txt"
            label="pick_otus_failures.txt"
            format="txt"
            from_work_dir="fastasplit/*_failures.txt"/>
    </outputs>

    <!--
        Functional tests.
        TODO: the single test below is an empty placeholder; it declares no
        input parameters and no expected outputs, so it verifies nothing yet.
    -->
    <tests>
        <test>
        </test>
    </tests>

    <help><![CDATA[
**What it does**

The OTU picking step assigns similar sequences to operational taxonomic units, or OTUs, by clustering sequences based on a user-defined similarity threshold. Sequences which are similar at or above the threshold level are taken to represent the presence of a taxonomic unit (e.g., a genus, when the similarity threshold is set at 0.94) in the sequence collection.

Currently, the following clustering methods have been implemented in QIIME:

1.  uclust, creates "seeds" of sequences which generate clusters based on percent identity.

2.  uclust_ref, as uclust, but takes a reference database to use as seeds.  New clusters can be toggled on or off.

3.  usearch, creates "seeds" of sequences which generate clusters based on percent identity, filters low abundance clusters, performs de novo and reference based chimera detection.

4.  usearch_ref, as usearch, but takes a reference database to use as seeds.  New clusters can be toggled on or off.

5. sumaclust, creates "seeds" of sequences which generate clusters based on similarity threshold.

6. swarm, creates "seeds" of sequences which generate clusters based on a resolution threshold.


Chimera checking with usearch 6.X is implemented in identify_chimeric_seqs.py.  Chimera checking should be done first with usearch 6.X, and the filtered resulting fasta file can then be clustered.


The primary inputs for pick_otus.py are:

1. A FASTA file containing sequences to be clustered

2. An OTU threshold (default is 0.97, roughly corresponding to species-level OTUs);

3. The method to be applied for clustering sequences into OTUs.

pick_otus.py takes a standard fasta file as input.


The output consists of two files (i.e. seqs_otus.txt and seqs_otus.log). The .txt file is composed of tab-delimited lines, where the first field on each line corresponds to an (arbitrary) cluster identifier, and the remaining fields correspond to sequence identifiers assigned to that cluster. Sequence identifiers correspond to those provided in the input FASTA file.  Usearch (i.e. usearch quality filter) can additionally have log files for each intermediate call to usearch.

Example lines from the resulting .txt file:

=   ====    ====    ====
0   seq1    seq5
1   seq2
2   seq3
3   seq4    seq6    seq7
=   ====    ====    ====

This result implies that four clusters were created based on 7 input sequences. 
The first cluster (cluster id 0) contains two sequences, sequence ids seq1 and 
seq5; the second cluster (cluster id 1) contains one sequence, sequence id seq2; 
the third cluster (cluster id 2) contains one sequence, sequence id seq3, and the 
final cluster (cluster id 3) contains three sequences, sequence ids seq4, seq6, 
and seq7.

The resulting .log file contains a list of parameters passed to the pick_otus.py 
script along with the output location of the resulting .txt file.
    ]]>
    </help>

    <!--
        References shown with the tool: the shared "citations" macro plus four
        DOI entries specific to this wrapper.
        NOTE(review): the last DOI ends in "/supp-1", which looks like a
        supplementary-file identifier rather than the article itself; confirm
        it resolves to citable metadata.
    -->
    <citations>
        <expand macro="citations" />
        <citation type="doi">10.1093/bioinformatics/btv231</citation>
        <citation type="doi">10.1093/bioinformatics/btq461</citation>
        <citation type="doi">10.1093/bioinformatics/bts611</citation>
        <citation type="doi">10.7287/peerj.preprints.386v1/supp-1</citation>
    </citations>
</tool>