comparison dada2_learnErrors.xml @ 3:10141f4eaae9 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit 5b1603bbcd3f139cad5c876be83fcb39697b5613-dirty
author matthias
date Mon, 29 Apr 2019 08:56:03 -0400
parents 57eb7437f646
children 9aeea74a1fc9
comparison
equal deleted inserted replaced
2:57eb7437f646 3:10141f4eaae9
4 <import>macros.xml</import> 4 <import>macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="requirements"/> 6 <expand macro="requirements"/>
7 <expand macro="version_command"/> 7 <expand macro="version_command"/>
8 <command detect_errors="exit_code"><![CDATA[ 8 <command detect_errors="exit_code"><![CDATA[
9 ## TODO check to remove ln, instead maybe rename output to ids
10 #for $read in $reads:
11 ln -s '$read' '$read.element_identifier' &&
12 #end for
13 mkdir '$errors.extra_files_path' &&
14 Rscript '$dada2_script' \${GALAXY_SLOTS:-1} 9 Rscript '$dada2_script' \${GALAXY_SLOTS:-1}
15 ]]></command> 10 ]]></command>
16 <configfiles> 11 <configfiles>
17 <configfile name="dada2_script"><![CDATA[ 12 <configfile name="dada2_script"><![CDATA[
18 library(ggplot2, quietly=T) 13 library(ggplot2, quietly=T)
21 args <- commandArgs(trailingOnly = TRUE) 16 args <- commandArgs(trailingOnly = TRUE)
22 nthreads <- as.integer(args[1]) 17 nthreads <- as.integer(args[1])
23 18
24 files <- c() 19 files <- c()
25 #for $read in $reads: 20 #for $read in $reads:
26 files <- c(files, '$read.element_identifier') 21 files <- c(files, '$read')
27 #end for 22 #end for
28 23
29 err <- learnErrors(files, nbases = 10**$nbases, 24 err <- learnErrors(files, nbases = 10**$nbases,
30 errorEstimationFunction = $advanced.errfoo, multithread = nthreads, 25 errorEstimationFunction = $advanced.errfoo, multithread = nthreads,
31 randomize = $advanced.randomize, MAX_CONSIST = $advanced.maxconsist, OMEGA_C = $advanced.omegac) 26 randomize = $advanced.randomize, MAX_CONSIST = $advanced.maxconsist, OMEGA_C = $advanced.omegac)
32 27
33 write.table(err\$err_out, file = '$errors', quote = F, sep = "\t", row.names = T, col.names = F) 28 ## write.table(err\$err_out, file = '$errors', quote = F, sep = "\t", row.names = T, col.names = F)
34 saveRDS(err, file=file.path('$errors.extra_files_path', "Rdata")) 29 saveRDS(err, file='$errors')
35
36 30
37 ## generate error plots 31 ## generate error plots
38 plot <- plotErrors(err, obs = $plotopt.obs, err_out = $plotopt.errout, err_in = $plotopt.errin, nominalQ = $plotopt.nominalQ) 32 plot <- plotErrors(err, obs = $plotopt.obs, err_out = $plotopt.errout, err_in = $plotopt.errin, nominalQ = $plotopt.nominalQ)
39 ggsave('output.pdf', plot, width = 20,height = 15,units = c("cm")) 33 ggsave('plot.pdf', plot, width = 20,height = 15,units = c("cm"))
40 ]]></configfile> 34 ]]></configfile>
41 </configfiles> 35 </configfiles>
42 <inputs> 36 <inputs>
43 <param name="reads" type="data" multiple="true" format="fastqsanger,fastqsanger.gz" label="Short read data" help="forward or reverse reads should be processed separately"/> 37 <param name="reads" type="data" multiple="true" format="fastqsanger,fastqsanger.gz" label="Short read data" help="forward or reverse reads should be processed separately"/>
44 <param name="nbases" type="integer" value="8" min="0" label="Magnitide of number of bases to use for learning"/> 38 <param argument="nbases" type="integer" value="8" min="0" label="Magnitide of number of bases to use for learning"/>
45 <section name="advanced" title="Advanced Option"> 39 <section name="advanced" title="Advanced Option">
46 <expand macro="errorEstimationFunction"/> 40 <expand macro="errorEstimationFunction"/>
47 <param name="randomize" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Randomize samples"/> 41 <param argument="randomize" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Randomize samples"/>
48 <param name="maxconsist" type="integer" value="10" min="0" label="Maximum number of times to step through the selfconsistency loop" help=""/> 42 <param name="maxconsist" argument="MAX_CONSIST" type="integer" value="10" min="0" label="Maximum number of times to step through the selfconsistency loop" help=""/>
49 <param name="omegac" type="integer" value="0" min="0" label="Threshold at which unique sequences inferred to contain errors are corrected" help=""/> 43 <param name="omegac" argument="OMEGA_C" type="integer" value="0" min="0" label="Threshold at which unique sequences inferred to contain errors are corrected" help=""/>
50 </section> 44 </section>
51 <section name="plotopt" title="Plotting Option"> 45 <section name="plotopt" title="Plotting Option">
52 <param name="obs" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot observed error rates"/> 46 <param name="obs" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot observed error rates"/>
53 <param name="errout" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot output error rates"/> 47 <param name="errout" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot output error rates"/>
54 <param name="errin" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Plot input error rates"/> 48 <param name="errin" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Plot input error rates"/>
55 <param name="nominalQ" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot expected error rates"/> 49 <param name="nominalQ" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot expected error rates"/>
56 </section> 50 </section>
57 </inputs> 51 </inputs>
58 <outputs> 52 <outputs>
59 <data name="errors" format="dada2_errorrates" label="${tool.name} on ${on_string}"/> 53 <data name="errors" format="dada2_errorrates" label="${tool.name} on ${on_string}"/>
60 <data name="output" format="pdf" from_work_dir="output.pdf" label="${tool.name} on ${on_string}: error rates plot"/> 54 <data name="plot" format="pdf" from_work_dir="plot.pdf" label="${tool.name} on ${on_string}: error rates plot"/>
61 </outputs> 55 </outputs>
62 <tests> 56 <tests>
63 <test> 57 <test>
64 <param name="reads" value="filterAndTrim_paired_F3D0_R1.fq.gz,filterAndTrim_paired_F3D141_R1.fq.gz" ftype="fastqsanger.gz"/> 58 <param name="reads" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/>
65 <output name="errors" value="learnErrors_forward.tab" ftype="dada_errorrates"> 59 <output name="errors" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates"/>
66 <extra_files type="rdata" name="Rdata" value="learnErrors_forward.Rdata" /> 60 <output name="plot" value="learnErrors_F3D0_R1.pdf" ftype="pdf" />
67 </output>
68 <output name="output" value="learnErrors_forward.pdf" ftype="pdf" />
69 </test> 61 </test>
70 <!-- test for creating input for dada results for reverse, not needed for testing --> 62 <!-- test for creating input for dada results for reverse, not needed for testing -->
71 <test> 63 <test>
72 <param name="reads" value="filterAndTrim_paired_F3D0_R2.fq.gz,filterAndTrim_paired_F3D141_R2.fq.gz" ftype="fastqsanger.gz"/> 64 <param name="reads" value="filterAndTrim_F3D0_R2.fq.gz" ftype="fastqsanger.gz"/>
73 <output name="errors" value="learnErrors_reverse.tab" ftype="dada_errorrates"> 65 <output name="errors" value="learnErrors_F3D0_R2.Rdata" ftype="dada2_errorrates"/>
74 <extra_files type="rdata" name="Rdata" value="learnErrors_reverse.Rdata" /> 66 <output name="plot" value="learnErrors_F3D0_R2.pdf" ftype="pdf" />
75 </output>
76 <output name="output" value="learnErrors_reverse.pdf" ftype="pdf" />
77 </test> 67 </test>
68 <!-- test w non-default parameters -->
78 <test> 69 <test>
79 <param name="reads" value="filterAndTrim_paired_F3D0_R1.fq.gz,filterAndTrim_paired_F3D141_R1.fq.gz" ftype="fastqsanger.gz"/> 70 <param name="reads" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/>
80 <param name="plotopt|obs" value="FALSE" /> 71 <param name="plotopt|obs" value="FALSE" />
81 <param name="plotopt|errout" value="FALSE" /> 72 <param name="plotopt|errout" value="FALSE" />
82 <param name="plotopt|errin" value="TRUE" /> 73 <param name="plotopt|errin" value="TRUE" />
83 <param name="plotopt|nominalQ" value="FALSE"/> 74 <param name="plotopt|nominalQ" value="FALSE"/>
84 <output name="errors" value="learnErrors_reverse.tab" ftype="dada_errorrates" /> 75 <output name="errors" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates" />
85 <output name="output" value="learnErrors_reverse.pdf" ftype="pdf" /> 76 <output name="output" value="learnErrors_F3D0_R1.pdf" ftype="pdf" compare="sim_size" />
86 </test> 77 </test>
78 <!-- TODO test w multiple inputs -->
87 </tests> 79 </tests>
88 <help><![CDATA[ 80 <help><![CDATA[
89 Description 81 Description
90 ........... 82 ...........
91 83
92 Error rates are learned by alternating between sample inference and error rate estimation until convergence. Additionally a plot is generated that shows the observed frequency of each transition (eg. A->C) as a function of the associated quality score, the final estimated error rates (if they exist), the initial input rates, and the expected error rates under the nominal definition of quality scores. 84 Error rates are learned by alternating between sample inference and error rate estimation until convergence. Additionally a plot is generated that shows the observed frequency of each transition (eg. A->C) as a function of the associated quality score, the final estimated error rates (if they exist), the initial input rates, and the expected error rates under the nominal definition of quality scores.
93 85
94 Usage 86 Usage
95 ----- 87 .....
96 88
97 **Input** are the FASTQ datasets containing the filtered and trimmed reads of the samples. 89 **Input** are the FASTQ datasets containing the filtered and trimmed reads of the samples.
98 90
99 The main **output** of type dada2_errorrates shows the numeric matrix with the learned error rates. 91 **Output** is a dataset of type *dada2_errorrates* (an RData file containing the output of dada2's learnErrors function) and a **plot** showing the error rates for each possible transition (A→C, A→G,...)
100 92
101 The **plot** shows the error rates for each possible transition (A→C, A→G,...) 93 - Points are the observed error rates for each consensus quality score.
102 94 - The black line shows the estimated error rates after convergence of the machine-learning algorithm.
103 - Points are the observed error rates for each consensus quality score. 95 - The red line shows the error rates expected under the nominal definition of the Q-score.
104 - The black line shows the estimated error rates after convergence of the machine-learning algorithm.
105 - The red line shows the error rates expected under the nominal definition of the Q-score.
106 96
107 The learned error rates are input to the *dada2: dada* tool. 97 The learned error rates are input to the *dada2: dada* tool.
108 98
109 Details 99 Details
110 ....... 100 .......
111 101
112 The learnErrors method learns a parametric this error model from the data, by alternating estimation of the error rates and inference of sample composition until they converge on a jointly consistent solution. As in many machine-learning problems, the algorithm must begin with an initial guess, for which the maximum possible error rates in this data are used (the error rates if only the most abundant sequence is correct and all the rest are errors). 102 The learnErrors method learns a parametric error model from the data, by alternating estimation of the error rates and inference of sample composition until they converge on a jointly consistent solution. As in many machine-learning problems, the algorithm must begin with an initial guess, for which the maximum possible error rates in this data are used (the error rates if only the most abundant sequence is correct and all the rest are errors).
113 103
114 It is expected that the estimated error rates (black lines in the plot) are a good fit to the observed rates (points in the plot), and that the error rates drop with increased quality. Try to increase the **number of bases to use for learning** if this is not the case. 104 It is expected that the estimated error rates (black lines in the plot) are a good fit to the observed rates (points in the plot), and that the error rates drop with increased quality. Try to increase the **number of bases to use for learning** if this is not the case.
115
116 105
117 Everything looks reasonable and we proceed with confidence. 106 @HELP_OVERVIEW@
118 ]]></help> 107 ]]></help>
119 <expand macro="citations"/> 108 <expand macro="citations"/>
120 </tool> 109 </tool>