comparison pick_otus.xml @ 0:c1bd0c560018 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/qiime commit bcbe76277f3e60303faf826f8ce7f018bc663a9a-dirty
author bebatut
date Tue, 02 Feb 2016 05:50:37 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c1bd0c560018
1 <tool id="qiime_pick_otus" name="pick otus" version="1.9.1galaxy1">
2
3 <description>OTU picking</description>
4
5 <macros>
6 <import>macros.xml</import>
7 </macros>
8
9 <expand macro="requirements" />
10
<command>
<![CDATA[
## Builds the pick_otus.py command line.
## Results are written to the fastasplit/ directory inside the job working directory.
## Every $methode.* reference below is guarded by the selected method, because each
## <when> branch of the "methode" conditional declares a different set of parameters.
#set $method = str($methode.otu_picking_method)

pick_otus.py
    -i '$input_seqs_filepath'
    -o fastasplit
    ## The method selector is a non-optional select, so -m is always emitted.
    -m $method

## Reference-based methods.
#if $method in ("uclust_ref", "usearch_ref")
    ## refseqs_fp is optional: emit -r only when a dataset was actually chosen,
    ## otherwise the literal string "None" would be handed to pick_otus.py.
    #if str($methode.refseqs_fp) != 'None'
        -r '$methode.refseqs_fp'
    #end if
    #if $methode.suppress_new_clusters
        -C
    #end if
#end if

## Similarity threshold (every offered method except swarm).
#if $method in ("uclust", "uclust_ref", "usearch", "usearch_ref", "sumaclust")
    #if $methode.similarity
        -s $methode.similarity
    #end if
#end if

## Options shared by the uclust and usearch families.
#if $method in ("uclust", "uclust_ref", "usearch", "usearch_ref")
    #if $methode.enable_rev_strand_match
        -z
    #end if
    ## max_accepts / max_rejects are free-text fields: skip them when left empty.
    #if str($methode.max_accepts)
        --max_accepts='$methode.max_accepts'
    #end if
    #if str($methode.max_rejects)
        --max_rejects='$methode.max_rejects'
    #end if
#end if

## Options shared by uclust and uclust_ref.
#if $method in ("uclust", "uclust_ref")
    #if $methode.stepwords
        --stepwords=$methode.stepwords
    #end if
    #if $methode.suppress_presort_by_abundance_uclust
        -D
    #end if
#end if

## uclust-only options.
#if $method == "uclust"
    #if $methode.optimal_uclust
        -A
    #end if
    #if $methode.exact_uclust
        -E
    #end if
#end if

## usearch-only options.
#if $method == "usearch"
    #if $methode.percent_id_err
        -j $methode.percent_id_err
    #end if
    #if $methode.abundance_skew
        -a $methode.abundance_skew
    #end if
    #if str($methode.db_filepath) != 'None'
        -f '$methode.db_filepath'
    #end if
    #if $methode.perc_id_blast
        --perc_id_blast=$methode.perc_id_blast
    #end if
    #if $methode.suppress_de_novo_chimera_detection
        -k
    #end if
    ## NOTE(review): the parameter label says this flag cannot be combined with
    ## --enable_rev_strand_match; that constraint is not enforced here.
    #if $methode.usearch_fast_cluster
        --usearch_fast_cluster
    #end if
#end if

## sumaclust-only option.
#if $method == "sumaclust"
    #if $methode.sumaclust_exact
        --sumaclust_exact
    #end if
#end if

## swarm-only option.
#if $method == "swarm"
    #if $methode.swarm_resolution
        --swarm_resolution=$methode.swarm_resolution
    #end if
#end if

## Multithreading (sumaclust and swarm).
#if $method in ("sumaclust", "swarm")
    #if str($methode.threads)
        --threads=$methode.threads
    #end if
#end if

## Method-independent options.
#if $prefix_prefilter_length
    -n $prefix_prefilter_length
#end if

#if $prefix_length
    -p $prefix_length
#end if

#if $suffix_length
    -u $suffix_length
#end if

#if str($non_chimeras_retention)
    -F '$non_chimeras_retention'
#end if
]]>
</command>
115
<inputs>
    <!--
        Parameter names are kept exactly as the command template references them.
        Defaults are declared with value="..." (numbers, text) and checked="..."
        (booleans) so that they actually appear in the tool form.
    -->

    <!-- Sequences to cluster (-i). -->
    <param name="input_seqs_filepath" type="data" format="fasta" optional="false"
           label="-i/--input_seqs_filepath: Path to input sequences file"/>

    <!-- Method selector: each <when> exposes only the options relevant to that method. -->
    <conditional name="methode">
        <param name="otu_picking_method" type="select" optional="false"
               label="-m/--otu_picking_method: Method for picking OTUs. Valid choices are: sortmerna, mothur, trie, uclust_ref, usearch, usearch_ref, blast, usearch61, usearch61_ref, sumaclust, swarm, prefix_suffix, cdhit, uclust. The mothur method requires an input file of aligned sequences. usearch will enable the usearch quality filtering pipeline. [default: uclust]">
            <option value="uclust" selected="true">uclust</option>
            <option value="uclust_ref">uclust_ref</option>
            <option value="usearch">usearch</option>
            <option value="usearch_ref">usearch_ref</option>
            <option value="sumaclust">sumaclust</option>
            <option value="swarm">swarm</option>
        </param>

        <when value="uclust_ref">
            <!-- Optional: a dataset has no literal default; the site-specific path was dropped. -->
            <param name="refseqs_fp" type="data" format="fasta" optional="true"
                   label="-r/--refseqs_fp: Path to reference sequences to search against when using -m uclust_ref, -m usearch_ref [default: the gg_13_8_otus/rep_set/97_otus.fasta file from the qiime_default_reference package]"/>
            <param name="similarity" type="float" value="0.97" min="0" max="1" optional="true"
                   label="-s/--similarity: Sequence similarity threshold (for blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna) [default: 0.97]"/>
            <param name="enable_rev_strand_match" type="boolean" checked="false"
                   label="-z/--enable_rev_strand_match: Enable reverse strand matching for uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking, will double the amount of memory used. [default: False]"/>
            <param name="suppress_presort_by_abundance_uclust" type="boolean" checked="false"
                   label="-D/--suppress_presort_by_abundance_uclust: Suppress presorting of sequences by abundance when picking OTUs with uclust or uclust_ref [default: False]"/>
            <param name="suppress_new_clusters" type="boolean" checked="false"
                   label="-C/--suppress_new_clusters: Suppress creation of new clusters using seqs that don't match reference when using -m uclust_ref, -m usearch61_ref, or -m usearch_ref [default: False]"/>
            <param name="max_accepts" type="text" value="default" optional="true"
                   label="--max_accepts: max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1) [default: default]"/>
            <param name="max_rejects" type="text" value="default" optional="true"
                   label="--max_rejects: max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options) [default: default]"/>
            <param name="stepwords" type="integer" value="8" optional="true"
                   label="--stepwords: stepwords value to uclust and uclust_ref [default: 8]"/>
        </when>

        <when value="usearch_ref">
            <!-- Optional: a dataset has no literal default; the site-specific path was dropped. -->
            <param name="refseqs_fp" type="data" format="fasta" optional="true"
                   label="-r/--refseqs_fp: Path to reference sequences to search against when using -m blast, -m sortmerna, -m uclust_ref, -m usearch_ref, or -m usearch61_ref [default: the gg_13_8_otus/rep_set/97_otus.fasta file from the qiime_default_reference package]"/>
            <param name="similarity" type="float" value="0.97" min="0" max="1" optional="true"
                   label="-s/--similarity: Sequence similarity threshold (for blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna) [default: 0.97]"/>
            <param name="enable_rev_strand_match" type="boolean" checked="false"
                   label="-z/--enable_rev_strand_match: Enable reverse strand matching for uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking, will double the amount of memory used. [default: False]"/>
            <param name="suppress_new_clusters" type="boolean" checked="false"
                   label="-C/--suppress_new_clusters: Suppress creation of new clusters using seqs that don't match reference when using -m uclust_ref, -m usearch61_ref, or -m usearch_ref [default: False]"/>
            <param name="max_accepts" type="text" value="default" optional="true"
                   label="--max_accepts: max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1) [default: default]"/>
            <param name="max_rejects" type="text" value="default" optional="true"
                   label="--max_rejects: max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options) [default: default]"/>
        </when>

        <when value="usearch">
            <param name="similarity" type="float" value="0.97" min="0" max="1" optional="true"
                   label="-s/--similarity: Sequence similarity threshold (for blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna) [default: 0.97]"/>
            <param name="enable_rev_strand_match" type="boolean" checked="false"
                   label="-z/--enable_rev_strand_match: Enable reverse strand matching for uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking, will double the amount of memory used. [default: False]"/>
            <param name="max_accepts" type="text" value="default" optional="true"
                   label="--max_accepts: max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1) [default: default]"/>
            <param name="max_rejects" type="text" value="default" optional="true"
                   label="--max_rejects: max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options) [default: default]"/>
            <param name="percent_id_err" type="float" value="0.97" min="0" max="1" optional="true"
                   label="-j/--percent_id_err: Percent identity threshold for cluster error detection with usearch, expressed as a fraction between 0 and 1. [default: 0.97]"/>
            <param name="abundance_skew" type="float" value="2.0" optional="true"
                   label="-a/--abundance_skew: Abundance skew setting for de novo chimera detection with usearch. [default: 2.0]"/>
            <param name="db_filepath" type="data" format="fasta" optional="true"
                   label="-f/--db_filepath: Reference database of fasta sequences for reference based chimera detection with usearch. [default: None]"/>
            <param name="perc_id_blast" type="float" value="0.97" optional="true"
                   label="--perc_id_blast: Percent ID for mapping OTUs created by usearch back to original sequence IDs [default: 0.97]"/>
            <param name="suppress_de_novo_chimera_detection" type="boolean" checked="false"
                   label="-k/--suppress_de_novo_chimera_detection: Suppress de novo chimera detection in usearch. [default: False]"/>
            <param name="usearch_fast_cluster" type="boolean" checked="false"
                   label="--usearch_fast_cluster: Use fast clustering option for usearch or usearch61_ref with new clusters. --enable_rev_strand_match can not be enabled with this option, and the only valid option for usearch61_sort_method is 'length'. This option uses more memory than the default option for de novo clustering. [default: False]"/>
        </when>

        <when value="sumaclust">
            <param name="similarity" type="float" value="0.97" min="0" max="1" optional="true"
                   label="-s/--similarity: Sequence similarity threshold (for blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna) [default: 0.97]"/>
            <param name="sumaclust_exact" type="boolean" checked="false"
                   label="--sumaclust_exact: A sequence is assigned to the best matching seed rather than the first matching seed passing the similarity threshold [default: False]"/>
            <!-- Thread count is numeric, so it is typed as an integer rather than free text. -->
            <param name="threads" type="integer" value="1" min="1" optional="true"
                   label="--threads: Specify number of threads (1 thread per core) to be used for usearch61, sortmerna, sumaclust and swarm commands that utilize multithreading. [default: 1]"/>
        </when>

        <when value="swarm">
            <param name="swarm_resolution" type="integer" value="1" optional="true"
                   label="--swarm_resolution: Maximum number of differences allowed between two amplicons, meaning that two amplicons will be grouped if they have integer (or less) differences (see Swarm manual at https://github.com/torognes/swarm for more details). [default: 1]"/>
            <!-- Thread count is numeric, so it is typed as an integer rather than free text. -->
            <param name="threads" type="integer" value="1" min="1" optional="true"
                   label="--threads: Specify number of threads (1 thread per core) to be used for usearch61, sortmerna, sumaclust and swarm commands that utilize multithreading. [default: 1]"/>
        </when>

        <when value="uclust">
            <param name="similarity" type="float" value="0.97" min="0" max="1" optional="true"
                   label="-s/--similarity: Sequence similarity threshold (for blast, cdhit, uclust, uclust_ref, usearch, usearch_ref, usearch61, usearch61_ref, sumaclust or sortmerna) [default: 0.97]"/>
            <param name="enable_rev_strand_match" type="boolean" checked="false"
                   label="-z/--enable_rev_strand_match: Enable reverse strand matching for uclust, uclust_ref, usearch, usearch_ref, usearch61, or usearch61_ref otu picking, will double the amount of memory used. [default: False]"/>
            <param name="optimal_uclust" type="boolean" checked="false"
                   label="-A/--optimal_uclust: Pass the --optimal flag to uclust for uclust otu picking. [default: False]"/>
            <param name="suppress_presort_by_abundance_uclust" type="boolean" checked="false"
                   label="-D/--suppress_presort_by_abundance_uclust: Suppress presorting of sequences by abundance when picking OTUs with uclust or uclust_ref [default: False]"/>
            <param name="max_accepts" type="text" value="default" optional="true"
                   label="--max_accepts: max_accepts value to uclust, uclust_ref, usearch61, and usearch61_ref. By default, will use value suggested by method (uclust: 1, usearch61: 1) [default: default]"/>
            <param name="max_rejects" type="text" value="default" optional="true"
                   label="--max_rejects: max_rejects value for uclust, uclust_ref, usearch61, and usearch61_ref. With default settings, will use value recommended by clustering method used (uclust: 8, usearch61: 8 for usearch_fast_cluster option, 32 for reference and smallmem options) [default: default]"/>
            <param name="stepwords" type="integer" value="8" optional="true"
                   label="--stepwords: stepwords value to uclust and uclust_ref [default: 8]"/>
            <param name="exact_uclust" type="boolean" checked="false"
                   label="-E/--exact_uclust: Pass the --exact flag to uclust for uclust otu picking. [default: False]"/>
        </when>
    </conditional>

    <!-- Method-independent options. -->
    <!-- No value on purpose: the documented default is None, so the flag is omitted unless set. -->
    <param name="prefix_prefilter_length" type="integer" optional="true"
           label="-n/--prefix_prefilter_length: Prefilter data so seqs with identical first prefix_prefilter_length are automatically grouped into a single OTU. This is useful for large sequence collections where OTU picking doesn't scale well [default: None; 100 is a good value]"/>
    <param name="prefix_length" type="integer" value="50" optional="true"
           label="-p/--prefix_length: Prefix length when using the prefix_suffix otu picker; WARNING: CURRENTLY DIFFERENT FROM prefix_prefilter_length (-n)! [default: 50]"/>
    <param name="suffix_length" type="integer" value="50" optional="true"
           label="-u/--suffix_length: Suffix length when using the prefix_suffix otu picker [default: 50]"/>
    <!-- Only two values are meaningful, so this is a select instead of free text. -->
    <param name="non_chimeras_retention" type="select"
           label="-F/--non_chimeras_retention: Selects subsets of sequences detected as non-chimeras to retain after de novo and reference based chimera detection. Options are intersection or union. union will retain sequences that are flagged as non-chimeric from either filter, while intersection will retain only those sequences that are flagged as non-chimeras from both detection methods. [default: union]">
        <option value="union" selected="true">union</option>
        <option value="intersection">intersection</option>
    </param>
</inputs>
<outputs>
    <!-- Each dataset is collected from fastasplit/, the directory the command passes to -o. -->
    <data name="pick_otus.txt"
          label="pick_otus.txt"
          format="txt"
          from_work_dir="fastasplit/*_otus.txt"/>
    <data name="pick_otus.log"
          label="pick_otus.log"
          format="txt"
          from_work_dir="fastasplit/*_otus.log"/>
    <data name="pick_otus_failures.txt"
          label="pick_otus_failures.txt"
          format="txt"
          from_work_dir="fastasplit/*_failures.txt"/>
</outputs>
342
<tests>
    <!-- Placeholder: this test sets no parameters and asserts no outputs. -->
    <test/>
</tests>
347
<!--
    Tool help, rendered as reStructuredText. The text is kept at column 0 because
    leading indentation is significant in reStructuredText. Quotes are written
    literally: entity references are not decoded inside a CDATA section.
-->
<help><![CDATA[
**What it does**

The OTU picking step assigns similar sequences to operational taxonomic units, or OTUs, by clustering sequences based on a user-defined similarity threshold. Sequences which are similar at or above the threshold level are taken to represent the presence of a taxonomic unit (e.g., a genus, when the similarity threshold is set at 0.94) in the sequence collection.

Currently, the following clustering methods have been implemented in QIIME:

1. uclust, creates "seeds" of sequences which generate clusters based on percent identity.

2. uclust_ref, as uclust, but takes a reference database to use as seeds. New clusters can be toggled on or off.

3. usearch, creates "seeds" of sequences which generate clusters based on percent identity, filters low abundance clusters, performs de novo and reference based chimera detection.

4. usearch_ref, as usearch, but takes a reference database to use as seeds. New clusters can be toggled on or off.

5. sumaclust, creates "seeds" of sequences which generate clusters based on similarity threshold.

6. swarm, creates "seeds" of sequences which generate clusters based on a resolution threshold.


Chimera checking with usearch 6.X is implemented in identify_chimeric_seqs.py. Chimera checking should be done first with usearch 6.X, and the filtered resulting fasta file can then be clustered.


The primary inputs for pick_otus.py are:

1. A FASTA file containing sequences to be clustered

2. An OTU threshold (default is 0.97, roughly corresponding to species-level OTUs);

3. The method to be applied for clustering sequences into OTUs.

pick_otus.py takes a standard fasta file as input.


The output consists of two files (i.e. seqs_otus.txt and seqs_otus.log). The .txt file is composed of tab-delimited lines, where the first field on each line corresponds to an (arbitrary) cluster identifier, and the remaining fields correspond to sequence identifiers assigned to that cluster. Sequence identifiers correspond to those provided in the input FASTA file. Usearch (i.e. usearch quality filter) can additionally have log files for each intermediate call to usearch. This tool also exposes a third dataset, pick_otus_failures.txt, which is collected from the "_failures.txt" file in the output directory.

Example lines from the resulting .txt file:

= ==== ==== ====
0 seq1 seq5
1 seq2
2 seq3
3 seq4 seq6 seq7
= ==== ==== ====

This result implies that four clusters were created based on 7 input sequences.
The first cluster (cluster id 0) contains two sequences, sequence ids seq1 and
seq5; the second cluster (cluster id 1) contains one sequence, sequence id seq2;
the third cluster (cluster id 2) contains one sequence, sequence id seq3, and the
final cluster (cluster id 3) contains three sequences, sequence ids seq4, seq6,
and seq7.

The resulting .log file contains a list of parameters passed to the pick_otus.py
script along with the output location of the resulting .txt file.
]]></help>
404
<citations>
    <!-- The shared "citations" macro is expanded first; the DOIs listed after it are declared by this tool. -->
    <expand macro="citations"/>
    <citation type="doi">10.1093/bioinformatics/btv231</citation>
    <citation type="doi">10.1093/bioinformatics/btq461</citation>
    <citation type="doi">10.1093/bioinformatics/bts611</citation>
    <citation type="doi">10.7287/peerj.preprints.386v1/supp-1</citation>
</citations>
412 </tool>