0
|
1 <tool id="cg_calldiff" name="calldiff(beta) 1.6" version="1.0.1">
|
|
2 <!--
|
|
3 This tool creates a GUI for the calldiff function of cgatools from Complete Genomics, Inc.
|
|
4 written 6-18-2012 by bcrain@completegenomics.com
|
|
5 updated 8-14-2012 by bcrain@completegenomics.com
|
|
6 -->
|
|
7
|
|
8 <description>compares two Complete Genomics variant files.</description>
|
|
9
|
|
<command>
    <!-- Print the cgatools version to STDOUT: only the first line written by a bare cgatools call is kept. -->
    cgatools | head -1;

    <!-- Print the command line to STDOUT.
         The back-quoted pipeline builds the comma-separated report list: echo joins the
         selected report names with spaces (an unselected report has an empty value and
         expands to nothing), then tr -s turns each run of spaces into a single comma.
         The previous sed pattern (a space followed by a star) also matched the empty
         string, which put a comma between every character of the list.
         NOTE(review): when no report is selected at all the list is empty; confirm how
         cgatools reacts to that. -->
    echo "cgatools calldiff --beta
        --reference ${crr.fields.path}
        --variantsA $data_sources.inputA
        --variantsB $data_sources.inputB
        $validation
        $diploid
        --locus-stats-column-count $column
        --max-hypothesis-count $hypothesis
        --output-prefix cg_
        --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | tr -s ' ' ','`
        #if $somatic.report6 == "SomaticOutput"
            --genome-rootA $somatic.genomeA
            --genome-rootB $somatic.genomeB
            --calibration-root $somatic.calibration
        #end if
    ";

    <!-- Execute cgatools with the same arguments that were echoed above.
         The genome and calibration directories are passed only when the SomaticOutput
         report was requested. -->
    cgatools calldiff --beta
        --reference ${crr.fields.path}
        --variantsA $data_sources.inputA
        --variantsB $data_sources.inputB
        $validation
        $diploid
        --locus-stats-column-count $column
        --max-hypothesis-count $hypothesis
        --output-prefix cg_
        --reports `echo ${report1} ${report2} ${report3} ${report4} ${report5} ${somatic.report6} | tr -s ' ' ','`
        #if $somatic.report6 == "SomaticOutput"
            --genome-rootA $somatic.genomeA
            --genome-rootB $somatic.genomeB
            --calibration-root $somatic.calibration
        #end if
</command>
|
|
49
|
|
<outputs>
    <!-- One tabular dataset per report. Each dataset is taken from the file that
         cgatools writes under the cg_ output prefix in the working directory, and
         carries a filter expression naming the report selection it belongs to. -->

    <!-- superlocus reports -->
    <data name="output1" format="tabular"
          label="${tool.name} SuperlocusOutput"
          from_work_dir="cg_SuperlocusOutput.tsv">
        <filter>(report1 == 'SuperlocusOutput')</filter>
    </data>
    <data name="output2" format="tabular"
          label="${tool.name} SuperlocusStats"
          from_work_dir="cg_SuperlocusStats.tsv">
        <filter>(report2 == 'SuperlocusStats')</filter>
    </data>

    <!-- locus reports -->
    <data name="output3" format="tabular"
          label="${tool.name} LocusOutput"
          from_work_dir="cg_LocusOutput.tsv">
        <filter>(report3 == 'LocusOutput')</filter>
    </data>
    <data name="output4" format="tabular"
          label="${tool.name} LocusStats"
          from_work_dir="cg_LocusStats.tsv">
        <filter>(report4 == 'LocusStats')</filter>
    </data>

    <!-- the VariantOutput report yields two files, one per input variant file -->
    <data name="output5a" format="tabular"
          label="${tool.name} VariantsA"
          from_work_dir="cg_VariantsA.tsv">
        <filter>(report5 == 'VariantOutput')</filter>
    </data>
    <data name="output5b" format="tabular"
          label="${tool.name} VariantsB"
          from_work_dir="cg_VariantsB.tsv">
        <filter>(report5 == 'VariantOutput')</filter>
    </data>

    <!-- somatic report; its selector lives inside the "somatic" conditional -->
    <data name="output6" format="tabular"
          label="${tool.name} SomaticOutput"
          from_work_dir="cg_SomaticOutput.tsv">
        <filter>(somatic['report6'] == 'SomaticOutput')</filter>
    </data>
</outputs>
|
|
73
|
|
<inputs>
    <!-- Reference genome (.crr file); the choices are drawn from the cg_crr_files data table. -->
    <param name="crr" type="select" label="Reference genome (.crr file)">
        <options from_data_table="cg_crr_files" />
    </param>

    <!-- Variant file input: either Galaxy datasets or file paths entered as text. -->
    <conditional name="data_sources">
        <param name="data_source" type="select" label="Where are the input varfiles?">
            <option value="in" selected="true">imported into Galaxy</option>
            <option value="out">located outside Galaxy (data on server or mounted drive)</option>
        </param>

        <!-- "in": select two cg_var datasets; each is validated against cg_crr_files.loc by dbkey. -->
        <when value="in">
            <param name="inputA" type="data" format="cg_var" label="Var file A">
                <validator type="dataset_ok_validator" />
                <validator type="dataset_metadata_in_file"
                           filename="cg_crr_files.loc"
                           metadata_name="dbkey"
                           metadata_column="1"
                           message="cgatools is not currently available for this build." />
            </param>
            <param name="inputB" type="data" format="cg_var" label="Var file B">
                <validator type="dataset_ok_validator" />
                <validator type="dataset_metadata_in_file"
                           filename="cg_crr_files.loc"
                           metadata_name="dbkey"
                           metadata_column="1"
                           message="cgatools is not currently available for this build." />
            </param>
        </when>

        <!-- "out": enter the two variant file paths as text; neither may be empty. -->
        <when value="out">
            <param name="inputA" type="text" label="Variant file A (/path/varfile)" size="300" help="Variant file can be compressed (gz, bz2), e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000/ASM/var-GS00000YYYY-ASM.tsv.bz2">
                <validator type="empty_field" message="You must supply a var file" />
            </param>
            <param name="inputB" type="text" label="Variant file B (/path/varfile)" size="300" help="Variant file can be compressed (gz, bz2), e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000/ASM/var-GS00000YYYY-ASM.tsv.bz2.">
                <validator type="empty_field" message="You must supply a var file" />
            </param>
        </when>
    </conditional>

    <!-- Other parameters. For the two flag-style selects the option value is the
         literal command-line flag, or an empty string when the flag is not wanted. -->
    <param name="diploid" type="select" label="Use diploid variant model" help="Uses varScoreEAF instead of varScoreVAF in somatic score computations. Also, uses diploid variant model instead of variable allele mixture model.">
        <option value="">no</option>
        <option value="--diploid">yes</option>
    </param>

    <param name="column" type="integer" label="Number of columns for locus compare classification in the locus stats file (default 15)" value="15">
        <validator type="empty_field" message="You must enter a value, the default is 15" />
    </param>

    <param name="hypothesis" type="integer" label="Maximum number of possible phasings to consider for a superlocus (default 32)" value="32">
        <validator type="empty_field" message="You must enter a value, the default is 32" />
    </param>

    <param name="validation" type="select" label="Reference cover validation (default on)" help="Turns on/off validation that all bases of a chromosome are covered by calls of the variant file.">
        <option value="">on</option>
        <option value="--no-reference-cover-validation">off</option>
    </param>

    <!-- Output report selectors. "yes" carries the report name expected by cgatools,
         "no" carries an empty string so the report is left out of the report list. -->
    <param name="report1" type="select" label="Create report SuperlocusOutput">
        <option value="">no</option>
        <option value="SuperlocusOutput">yes</option>
    </param>
    <param name="report2" type="select" label="Create report SuperlocusStats">
        <option value="">no</option>
        <option value="SuperlocusStats">yes</option>
    </param>
    <param name="report3" type="select" label="Create report LocusOutput">
        <option value="">no</option>
        <option value="LocusOutput">yes</option>
    </param>
    <param name="report4" type="select" label="Create report LocusStats">
        <option value="">no</option>
        <option value="LocusStats">yes</option>
    </param>
    <param name="report5" type="select" label="Create report VariantOutput" help="Both variant files annotated by comparison results. If the somatic output report is requested, file A is also annotated with the same score ranks as produced in that report.">
        <option value="">no</option>
        <option value="VariantOutput">yes</option>
    </param>

    <!-- Somatic report selector together with the extra directories it needs. -->
    <conditional name="somatic">
        <param name="report6" type="select" label="Create report SomaticOutput" help="This report can only be generated on local Galaxy instances. Report for the list of simple variations that are present only in file 'A', annotated with the score that indicates the probability of the variation being truly somatic. Note: generating this report slows calldiff by 10x-20x.">
            <option value="">no</option>
            <option value="SomaticOutput">yes</option>
        </param>

        <!-- Explicit empty case for the "no" option, so that every selectable value
             has a matching when block; no extra inputs are needed in this case. -->
        <when value="" />

        <when value="SomaticOutput">
            <param name="genomeA" type="text" size="300" label="Directory for genome A (/path/dir)" help="The 'A' genome directory, e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories.">
                <validator type="empty_field" message="You must supply the genome root directory for this sample" />
            </param>
            <param name="genomeB" type="text" size="300" label="Directory for genome B (/path/dir)" help="The 'B' genome directory, e.g. /harddrive/GS00000XXXX-DID/GS00000YYYY-ASM/GS00123-DNA_G01_2000; this directory is expected to contain ASM/REF and ASM/EVIDENCE subdirectories.">
                <validator type="empty_field" message="You must supply the genome root directory for this sample" />
            </param>
            <param name="calibration" type="text" size="300" label="Directory containing calibration data (/path/dir)" help="The directory containing calibration data. For example, there should exist a file calibration-root/0.0.0/metrics.tsv. Calibration data can be downloaded from ftp://ftp.completegenomics.com/ScoreCalibrationFiles/var-calibration-v1.tgz">
                <validator type="empty_field" message="You must supply the directory containing the calibration data" />
            </param>
        </when>
    </conditional>

</inputs>
|
|
176
|
|
177 <help>
|
|
178
|
|
179 **What it does**
|
|
180
|
|
181 This tool uses cgatools calldiff to compare two Complete Genomics variant files.
|
|
182
|
|
183 **cgatools 1.6.0 Documentation**
|
|
184
|
|
185 User guide: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-user-guide.pdf
|
|
186
|
|
187 Release notes: http://cgatools.sourceforge.net/docs/1.6.0/cgatools-release-notes.pdf
|
|
188
|
|
189 **Command line reference**::
|
|
190
|
|
191 COMMAND NAME
|
|
192 calldiff - Compares two Complete Genomics variant files.
|
|
193
|
|
194 DESCRIPTION
|
|
195 Compares two Complete Genomics variant files. Divides the genome up into
|
|
196 superloci of nearby variants, then compares the superloci. Also refines the
|
|
197 comparison to determine per-call or per-locus comparison results.
|
|
198
|
|
199 Comparison results are usually described by a semi-colon separated string,
|
|
200 one per allele. Each allele's comparison result is one of the following
|
|
201 classifications:
|
|
202
|
|
203 ref-identical The alleles of the two variant files are identical, and
|
|
204 they are consistent with the reference.
|
|
205 alt-identical The alleles of the two variant files are identical, and
|
|
206 they are inconsistent with the reference.
|
|
207 ref-consistent The alleles of the two variant files are consistent,
|
|
208 and they are consistent with the reference.
|
|
209 alt-consistent The alleles of the two variant files are consistent,
|
|
210 and they are inconsistent with the reference.
|
|
211 onlyA The alleles of the two variant files are inconsistent,
|
|
212 and only file A is inconsistent with the reference.
|
|
213 onlyB The alleles of the two variant files are inconsistent,
|
|
214 and only file B is inconsistent with the reference.
|
|
215 mismatch The alleles of the two variant files are inconsistent,
|
|
216 and they are both inconsistent with the reference.
|
|
217 phase-mismatch The two variant files would be consistent if the
|
|
218 hapLink field had been empty, but they are
|
|
219 inconsistent.
|
|
220 ploidy-mismatch The superlocus did not have uniform ploidy.
|
|
221
|
|
222 In some contexts, this classification is rolled up into a simplified
|
|
223 classification, which is one of "identical", "consistent", "onlyA",
|
|
224 "onlyB", or "mismatch".
|
|
225
|
|
226 A good place to start looking at the results is the superlocus-output file.
|
|
227 It has columns defined as follows:
|
|
228
|
|
229 SuperlocusId An identifier given to the superlocus.
|
|
230 Chromosome The name of the chromosome.
|
|
231 Begin The 0-based offset of the start of the superlocus.
|
|
232 End The 0-based offset of the base one past the end of the
|
|
233 superlocus.
|
|
234 Classification The match classification of the superlocus.
|
|
235 Reference The reference sequence.
|
|
236 AllelesA A semicolon-separated list of the alleles (one per
|
|
237 haplotype) for variant file A, for the phasing with the
|
|
238 best comparison result.
|
|
239 AllelesB A semicolon-separated list of the alleles (one per
|
|
240 haplotype) for variant file B, for the phasing with the
|
|
241 best comparison result.
|
|
242
|
|
243 The locus-output file contains, for each locus in file A and file B that is
|
|
244 not consistent with the reference, an annotated set of calls for the locus.
|
|
245 The calls are annotated with the following columns:
|
|
246
|
|
247 SuperlocusId The id of the superlocus containing the locus.
|
|
248 File The variant file (A or B).
|
|
249 LocusClassification The locus classification is determined by the
|
|
250 varType column of the call that is inconsistent
|
|
251 with the reference, concatenated with a
|
|
252 modifier that describes whether the locus is
|
|
253 heterozygous, homozygous, or contains no-calls.
|
|
254 If there is no one variant in the locus (i.e.,
|
|
255 it is heterozygous alt-alt), the locus
|
|
256 classification begins with "other".
|
|
257 LocusDiffClassification The match classification for the locus. This is
|
|
258 defined to be the best of the comparison of the
|
|
259 locus to the same region in the other file, or
|
|
260 the comparison of the superlocus.
|
|
261
|
|
262 The somatic output file contains a list of putative somatic variations of
|
|
263 genome A. The output includes only those loci that can be classified as
|
|
264 snp, del, ins or sub in file A, and are called reference in the file B.
|
|
265 Every locus is annotated with the following columns:
|
|
266
|
|
267 VarCvgA The totalReadCount from file A for this locus
|
|
268 (computed on the fly if file A is not a
|
|
269 masterVar file).
|
|
270 VarScoreA The varScoreVAF from file A, or varScoreEAF if
|
|
271 the "--diploid" option is used.
|
|
272 RefCvgB The maximum of the uniqueSequenceCoverage
|
|
273 values for the locus in genome B.
|
|
274 RefScoreB Minimum of the reference scores of the locus in
|
|
275 genome B.
|
|
276 SomaticCategory The category used for determining the
|
|
277 calibrated scores and the SomaticRank.
|
|
278 VarScoreACalib The calibrated variant score of file A, under
|
|
279 the model selected by using or not using the
|
|
280 "--diploid" option, and corrected for the count
|
|
281 of heterozygous variants observed in this
|
|
282 genome. See user guide for more information.
|
|
283 VarScoreBCalib The calibrated reference score of file B, under
|
|
284 the model selected by using or not using the
|
|
285 "--diploid" option, and corrected for the count
|
|
286 of heterozygous variants observed in this
|
|
287 genome. See user guide for more information.
|
|
288 SomaticRank The estimated rank of this somatic mutation,
|
|
289 amongst all true somatic mutations within this
|
|
290 SomaticCategory. The value is a number between
|
|
291 0 and 1; a value of 0.012 means, for example,
|
|
292 that an estimated 1.2% of the true somatic
|
|
293 mutations in this somaticCategory have a
|
|
294 somaticScore less than the somaticScore for
|
|
295 this mutation. See user guide for more
|
|
296 information.
|
|
297 SomaticScore An integer that provides a total order on
|
|
298 quality for all somatic mutations. It is equal
|
|
299 to -10*log10( P(false)/P(true) ), under the
|
|
300 assumption that this genome has a rate of
|
|
301 somatic mutation equal to 1/Mb for
|
|
302 SomaticCategory snp, 1/10Mb for SomaticCategory
|
|
303 ins, 1/10Mb for SomaticCategory del, and 1/20Mb
|
|
304 for SomaticCategory sub. The computation is
|
|
305 based on the assumptions described in the user
|
|
306 guide, and is affected by choice of variant
|
|
307 model selected by using or not using the
|
|
308 "--diploid" option.
|
|
309 SomaticQuality Equal to VQHIGH for all somatic mutations where
|
|
310 SomaticScore >= -10. Otherwise, this column is
|
|
311 empty.
|
|
312
|
|
313 OPTIONS
|
|
314 -h [ --help ]
|
|
315 Print this help message.
|
|
316
|
|
317 --reference arg
|
|
318 The input crr file.
|
|
319
|
|
320 --variantsA arg
|
|
321 The "A" input variant file.
|
|
322
|
|
323 --variantsB arg
|
|
324 The "B" input variant file.
|
|
325
|
|
326 --output-prefix arg
|
|
327 The path prefix for all output reports.
|
|
328
|
|
329 --reports arg (=SuperlocusOutput,SuperlocusStats,LocusOutput,LocusStats)
|
|
330 Comma-separated list of reports to generate. (Beware any reports whose
|
|
331 name begins with "Debug".) A report is one of:
|
|
332 SuperlocusOutput Report for superlocus classification.
|
|
333 SuperlocusStats Report for superlocus classification stats.
|
|
334 LocusOutput Report for locus classification.
|
|
335 LocusStats Report for locus stats.
|
|
336 VariantOutput Both variant files annotated by comparison
|
|
337 results. If the somatic output report is
|
|
338 requested, file A is also annotated with the
|
|
339 same score ranks as produced in that report.
|
|
340 SomaticOutput Report for the list of simple variations that
|
|
341 are present only in file "A", annotated with
|
|
342 the score that indicates the probability of
|
|
343 the variation being truly somatic. Requires
|
|
344 beta, genome-rootA, and genome-rootB options
|
|
345 to be provided as well. Note: generating this
|
|
346 report slows calldiff by 10x-20x.
|
|
347 DebugCallOutput Report for call classification.
|
|
348 DebugSuperlocusOutput Report for debug superlocus information.
|
|
349 DebugSomaticOutput Report for distribution estimates used for
|
|
350 somatic rescoring. Only produced if
|
|
351 SomaticOutput is also turned on.
|
|
352
|
|
353 --diploid
|
|
354 Uses varScoreEAF instead of varScoreVAF in somatic score computations.
|
|
355 Also, uses diploid variant model instead of variable allele mixture
|
|
356 model.
|
|
357
|
|
358 --locus-stats-column-count arg (=15)
|
|
359 The number of columns for locus compare classification in the locus
|
|
360 stats file.
|
|
361
|
|
362 --max-hypothesis-count arg (=32)
|
|
363 The maximum number of possible phasings to consider for a superlocus.
|
|
364
|
|
365 --no-reference-cover-validation
|
|
366 Turns off validation that all bases of a chromosome are covered by
|
|
367 calls of the variant file.
|
|
368
|
|
369 --genome-rootA arg
|
|
370 The "A" genome directory, for example /data/GS00118-DNA_A01; this
|
|
371 directory is expected to contain ASM/REF and ASM/EVIDENCE
|
|
372 subdirectories.
|
|
373
|
|
374 --genome-rootB arg
|
|
375 The "B" genome directory.
|
|
376
|
|
377 --calibration-root arg
|
|
378 The directory containing calibration data. For example, there should
|
|
379 exist a file calibration-root/0.0.0/metrics.tsv.
|
|
380
|
|
381 --beta
|
|
382 This flag enables the SomaticOutput report, which is beta
|
|
383 functionality.
|
|
384
|
|
385 SUPPORTED FORMAT_VERSION
|
|
386 0.3 or later
|
|
387 </help>
|
|
388 </tool>
|