comparison gottcha.xml @ 0:2569a83977f5 draft

planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/gottcha commit 5d24210279e26623ae6c98f7551e3565fdc9bc48
author jdv
date Mon, 30 Jan 2017 19:07:21 -0500
parents
children 87efdde6105f
comparison
equal deleted inserted replaced
-1:000000000000 0:2569a83977f5
1 <tool id="gottcha" name="GOTTCHA" version="0.0.1">
2
3 <description>Read-based metagenome characterization</description>
4
5 <!-- ***************************************************************** -->
6
7 <requirements>
8 <requirement type="package" version="1.0b-564cf3b">gottcha</requirement>
9 </requirements>
10
11 <!-- ***************************************************************** -->
12
13 <version_command>gottcha.pl -h | perl -wnE'print "$1\n" for /VERSION: (\S+)/g'</version_command> <!-- Reports the tool version: pipes the output of "gottcha.pl -h" through Perl and prints the non-whitespace token that follows "VERSION: " -->
14
15 <!-- ***************************************************************** -->
16
17 <command detect_errors="aggressive">
18 <![CDATA[
19 ## Run the GOTTCHA wrapper script; output files are written to the working directory with the prefix "results"
20 gottcha.pl
21 ## The thread count is double-quoted on purpose: inside single quotes the shell would pass the GALAXY_SLOTS expression through unexpanded
22 --input '${fn_in}'
23 --database '${db.fields.path}'
24 --threads "\${GALAXY_SLOTS:-1}"
25 --outdir './'
26 --prefix results
27
28 ##--General Options------------------------------
29
30 --relAbu ${general.rel_abund}
31 --mode ${general.output_full}
32 ${general.filt_plasmid}
33
34 ##--Split-trim Options---------------------------
35
36 --minQ ${split.min_qual}
37 --fixL ${split.fixed_len}
38 --ascii ${split.qual_offset}
39
40 ##--Filtering Options----------------------------
41
42 --minCov ${filter.min_cov}
43 --minMLHL ${filter.min_mlhl}
44 --cCov ${filter.c_cov}
45 --minLen ${filter.min_len}
46 --minHits ${filter.min_hits}
47
48 ]]>
49 </command>
50
51 <!-- ***************************************************************** -->
52
53 <inputs>
54 <!-- Required inputs: the reads to profile and a reference database chosen from the gottcha_indices data table -->
55 <param name="fn_in" label="Input reads" type="data" format="fastq" help="--input"/>
56 <param name="db" label="Select a reference database" type="select" help="--database">
57 <options from_data_table="gottcha_indices">
58 <filter column="2" type="sort_by"/>
59 <validator message="No indexes are available for the selected input dataset" type="no_options"/>
60 </options>
61 </param>
62 <!-- General options: abundance field, report mode and plasmid filtering (shown expanded) -->
63 <section name="general" title="General Options" expanded="True">
64 <param name="rel_abund" label="Abundance field" type="select" help="--relAbu">
65 <option selected="true" value="LINEAR_DOC">Linear DOC</option>
66 <option value="LINEAR_LENGTH">Linear length</option>
67 <option value="TOTAL_BP_MAPPED">Total bp mapped</option>
68 <option value="HIT_COUNT">Hit count</option>
69 </param>
70 <param name="output_full" label="Output full report" type="boolean" checked="no" truevalue="full" falsevalue="summary" help="--mode full"/>
71 <param name="filt_plasmid" label="Filter plasmid hits" type="boolean" checked="no" truevalue="--noPlasmidHit" falsevalue="" help="If true, ignore alignments to plasmids"/>
72 </section>
73 <!-- Split-trim options: read quality threshold, trim length and quality encoding offset -->
74 <section name="split" title="Split-trim Options" expanded="False">
75 <param name="min_qual" label="Minimum quality" type="integer" value="20" min="0" max="41" size="4" help="Minimum quality for a read to be considered valid (0-41)"/>
76 <param name="fixed_len" label="Trim length" type="integer" value="30" min="1" size="4" help="Fixed length to which each read will be trimmed"/>
77 <param name="qual_offset" label="Quality offset" type="select" help="Base call quality offset for ASCII encoding">
78 <option selected="true" value="33">33</option>
79 <option value="64">64</option>
80 </param>
81 </section>
82 <!-- Filtering options: thresholds applied to the profiling result -->
83 <section name="filter" title="Filtering Options" expanded="False">
84 <param name="min_cov" label="Minimum coverage" type="float" value="0.005" min="0" size="5" help="Minimum linear coverage to be considered valid in abundance calculation"/>
85 <param name="min_mlhl" label="Minimum MLHL" type="integer" value="5" min="0" size="4" help="Minimum mean-linear-hit-length to be considered valid in abundance calculations"/>
86 <param name="c_cov" label="Critical coverage for MLHL" type="float" value="0.006" min="0" size="5" help="Critical coverage below which Minimum MLHL will cause an organism to fail"/>
87 <param name="min_len" label="Minimum length" type="integer" value="100" min="0" size="4" help="Minimum unique length to be considered valid in abundance calculation"/>
88 <param name="min_hits" label="Minimum hits" type="integer" value="10" min="0" size="4" help="Minimum number of hits to be considered valid in abundance calculation"/>
89 </section>
90
91 </inputs>
92
93 <!-- ***************************************************************** -->
94
95 <outputs>
96 <!-- Log and summary are always collected. The full report is kept only when "Output full report" is enabled; that parameter lives in the "general" section, so the filter must address it through the section -->
97 <data name="out_log" format="txt" label="GOTTCHA on ${on_string}: Log" from_work_dir="results.gottcha.log" />
98 <data name="out_tsv" format="txt" label="GOTTCHA on ${on_string}: Summary" from_work_dir="results.gottcha.tsv" />
99 <data name="out_full" format="txt" label="GOTTCHA on ${on_string}: Full Report" from_work_dir="results.gottcha_full.tsv">
100 <filter>general['output_full']</filter>
101 </data>
102
103 </outputs>
104
105 <!-- ***************************************************************** -->
106
107 <tests>
108 <test> <!-- Summary-mode run (full report disabled) on test.fq against test_db, with the minimum hit count lowered to 1 -->
109 <param value="test_db" name="db"/>
110 <param value="test.fq" name="fn_in" ftype="fastq"/>
111 <param value="no" name="output_full"/>
112 <param value="1" name="min_hits"/>
113 <output file="test_02.tsv" name="out_tsv"/>
114 <output file="test_02.log" name="out_log" delta="2000" compare="sim_size"/> <!-- the log is compared by size only, with a tolerance of 2000 -->
115 </test>
116 </tests>
117
118 <!-- ***************************************************************** -->
119
120 <help>
121 <![CDATA[
122
123 .. class:: infomark
124
125 Description
126 --------------------
127
128 Genomic Origin Through Taxonomic CHAllenge (GOTTCHA) is an
129 annotation-independent and signature-based metagenomic taxonomic profiling
130 tool that has significantly smaller FDR than other profiling tools. This Perl
131 script is a wrapper to run the GOTTCHA profiling tool with pre-computed
132 signature databases. The procedure includes 3 major steps: split-trimming the
133 input data, mapping reads to a GOTTCHA database using BWA, and profiling/filtering
134 the results.
135
136 Options
137 --------------------
138 ::
139
140 --relAbu|r <STRING> The field will be used to calculate relative
141 abundance. You can specify one of the following
142 fields: "LINEAR_LENGTH", "TOTAL_BP_MAPPED",
143 "HIT_COUNT", "LINEAR_DOC".
144 [default: LINEAR_DOC]
145 --mode|m <STRING> You can specify one of the following output modes:
146 "summary" : this mode will report a summary of
147 profiling result to *.gottcha.tsv file.
148 "full" : other than a summary, this mode will
149 report unfiltered result to
150 *.gottcha_full.tsv with more detail.
151 "all" : other than two tables, this mode will
152 keep all output files that were
153 generated by each profiling step.
154 [default: summary]
155 --noPlasmidHit|n Ignore alignments that hit to plasmids
156 [default: null]
157
158 *** OPTIONS FOR SPLIT-TRIMMING READS ***
159
160 --minQ <INT> Minimum quality for a read to be considered valid
161 (0-41) [default: 20]
162 --fixL <INT> Fixed length to which each trimmed read will be cut
163 down to [default: 30]
164 --ascii <INT> ASCII encoding of quality score (33 or 64) [default:
165 33]
166
167 *** OPTIONS FOR FILTERING PROFILING RESULT ***
168
169 --minCov <FLOAT> Minimum linear coverage to be considered valid in
170 abundance calculation [default: 0.005]
171 --minMLHL <INT> Minimum Mean-Linear-Hit-Length to be considered valid
172 in abundance calculation [default: 5]
173 --cCov <FLOAT> Critical coverage below which --minMLHL will cause an
174 organism to fail [default: 0.006]
175 --minLen <INT> Minimum unique length to be considered valid in
176 abundance calculation [default: 100]
177 --minHits <INT> Minimum number of hits to be considered valid in
178 abundance calculation [default: 10]
179
180 Interpreting Results
181 --------------------
182
183 GOTTCHA reports profiling results in a neat summary table
184 by default. The tsv file will list the organism(s) at all taxonomic
185 levels from STRAIN to PHYLUM, their linear length, total bases mapped,
186 linear depth of coverage, and the normalized linear depth of coverage. The
187 linear depth of coverage (LINEAR_DOC) is used to calculate relative
188 abundance of each organism or taxonomic name in the sample.
189
190 Summary table:
191
192 ================= ==============================
193 Column Description
194 ================= ==============================
195 LEVEL taxonomic rank
196 NAME taxonomic name
197 REL_ABUNDANCE relative abundance (equivalent to NORM_COV by default)
198 LINEAR_LENGTH number of non-overlapping bases covering the signatures
199 TOTAL_BP_MAPPED sum total of all hit lengths recruited to signatures
200 HIT_COUNT number of hits recruited to signatures
201 HIT_COUNT_PLASMID number of hits recruited to plasmid signatures
202 READ_COUNT number of reads recruited to signatures
203 LINEAR_DOC linear depth-of-coverage (TOTAL_BP_MAPPED / LINEAR_LENGTH)
204 NORM_COV normalized linear depth-of-coverage (LINEAR_DOC / SUM(LINEAR_DOC in certain level))
205 ================= ==============================
206
207 In addition to the summary table, "full" report mode produces a table with more
208 detailed information from the unfiltered results. The explanation of each column in
209 the full report is as follows:
210
211 ================================== ==========================
212 Column Description
213 ================================== ==========================
214 RANKNAME (REPLICON) = replicon name (source + plasmid/chr)<br>(STRAIN) = strain name<br>(SPECIES) = species name<br>(GENUS) = genus name<br>...
215 NUM_SUBRANKS no. of distinct subranks for the current rank<br>(E.g. the no. of SPECIES under the current GENUS)
216 GPROJ_ENTRIES no. of genome projects (i.e. STRAINS) under this RANK NAME
217 LINEAR_LENGTH N/O_LENGTH<br>= non-overlapping length <br>= no. of non-overlapping bases covering the unique DB
218 UNIQUE_DB_LENGTH no. of unique bases for this organism
219 FULL_REFDB_LENGTH no. of bases in full reference
220 LINEAR_COV LINEAR_LENGTH / UNIQUE_DB_LENGTH
221 HIT_COUNT no. of hits recruited to genome
222 HIT_COUNT_PLASMID no. of hits recruited to plasmid
223 READ_COUNT no. of reads recruited to genome
224 FULL_HIT_COUNT no. of full-length read hits recruited to genome
225 TOTAL_BP_MAPPED sum total of all hit lengths recruited to genome<br>= hit1.length + hit2.length + ... hitX.length<br>[formerly FOLD_COV_UNIQUE_SAMPLE]
226 LINEAR_DOC linear depth-of-coverage<br>= fold coverage of sample's LINEAR_LENGTH <br>= TOTAL_BP_MAPPED / LINEAR_LENGTH<br>[formerly FOLD_COV_UNIQUE_REFDB]
227 UREF_DOC unique reference's depth-of-coverage<br>= fold coverage of reference's UNIQUE_DB_LENGTH<br>= TOTAL_BP_MAPPED / UNIQUE_DB_LENGTH
228 UREF_CMAX MAX COVERAGE OF REFDB POSSIBLE, GIVEN SAMPLE INPUT BASES<br>= Cmax = L0/l0 <br>= TOTAL_INPUT_BASES / UNIQUE_DB_LENGTH
229 FRAC_HITS_POSSIBLE HIT_COUNT / TOTAL_INPUT_READS
230 FRAC_BASES_POSSIBLE TOTAL_BP_MAPPED / TOTAL_INPUT_BASES
231 MEAN_HIT_LENGTH TOTAL_BP_MAPPED / HIT_COUNT
232 MEAN_LINEAR_HIT_LENGTH LINEAR_LENGTH / HIT_COUNT
233 best_SUBRANK name of the best subrank (determined by the highest LINEAR_COV)
234 best_NUM_SUBRANKS no. of subranks supporting current "SUBRANK"<br>{SS} = no. of GI entries supporting this strain<br>{S} = no. of strains supporting this species<br>{G} = no. of species supporting this genus<br>{F} = no. of genera supporting this family<br>{O} = no. of families supporting this order <br>{C} = no. of orders supporting this class<br>{P} = no. of classes supporting this phylum
235 best_GPROJ_ENTRIES no. of genome projects (i.e. STRAINS) under this best_SUBRANK<br>{SS} = no. of genome projects supporting this strain = 1<br>{S} = no. of genome projects supporting this species<br>{G} = no. of genome projects supporting this genus<br>{F} = no. of genome projects supporting this family<br>{O} = no. of genome projects supporting this order<br>{C} = no. of genome projects supporting this class<br>{P} = no. of genome projects supporting this phylum
236 best_LINEAR_LENGTH
237 best_UNIQUE_DB_LENGTH
238 best_FULL_REFDB_LENGTH
239 best_LINEAR_COV
240 best_HIT_COUNT
241 best_FULL_HIT_COUNT
242 best_TOTAL_BP_MAPPED
243 best_LINEAR_DOC (a.k.a. Abundance)
244 best_UREF_DOC
245 best_UREF_CMAX
246 best_FRAC_HITS_POSSIBLE
247 best_FRAC_BASES_POSSIBLE
248 best_MEAN_HIT_LENGTH
249 best_MEAN_LINEAR_HIT_LENGTH
250 CONTIG_COUNT no. of contiguous fragments<br> (after mapping & generating non-overlapping fragments)
251 CONTIG_MEAN_LEN mean length of contigs (bp)
252 CONTIG_STDEV_LEN standard deviation of contig lengths (bp)
253 CONTIG_MINLEN length of smallest contig
254 CONTIG_MAXLEN length of largest contig
255 CONTIG_HISTOGRAM(LEN:FREQ) contig Length Histogram<br> (in the format contigLength:frequency)
256 ================================== ==========================
257
258 ]]>
259 </help>
260
261 <!-- ***************************************************************** -->
262
263 <citations>
264 <citation type="doi">10.1093/nar/gkv180</citation>
265 </citations>
266
267 </tool>