Mercurial > repos > matthias > dada2_learnerrors
changeset 5:9aeea74a1fc9 draft
planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit 990192685955e9cda0282e348c28ef6462d88a38
| author | matthias |
|---|---|
| date | Sun, 05 May 2019 12:22:22 -0400 |
| parents | 9f888de151d1 |
| children | f9040a48d23f |
| files | README.md dada2_learnErrors.xml macros.xml test-data/.reference.fa.swp test-data/.reference_species.fa.swp test-data/assignTaxonomyAddspecies_F3D0_boot.tab test-data/filterAndTrim_F3D0.tab test-data/makeSequenceTable_F3D0.pdf test-data/makeSequenceTable_F3D0.tab test-data/mergePairs_F3D0_nondefault.Rdata test-data/qualityProfile.pdf test-data/removeBimeraDenovo_F3D0.tab test-data/removeBimeraDenovo_F3D0_dada_uniques.tab test-data/removeBimeraDenovo_F3D0_derep_uniques.tab test-data/removeBimeraDenovo_F3D0_mergepairs.Rdata test-data/seqCounts_F3D0_dadaF.tab test-data/seqCounts_F3D0_derepF.tab test-data/seqCounts_F3D0_filter.tab test-data/seqCounts_F3D0_merge.tab test-data/seqCounts_F3D0_nochim.tab test-data/seqCounts_F3D0_seqtab.tab todo.txt |
| diffstat | 22 files changed, 2710 insertions(+), 39 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README.md Sun May 05 12:22:22 2019 -0400 @@ -0,0 +1,39 @@ +Wrappers for the core functionality of the dada2 package https://benjjneb.github.io/dada2/index.html. + +- filterAndTrim +- derep +- learnErrors +- dada +- mergePairs +- makeSequenceTable +- removeBimeraDenovo + +Datatypes +========= + +The dada2 Galaxy wrappers use a few extra data types to ensure that only inputs of the correct type can be used. + +For the outputs of derep, dada, learnErrors, and mergePairs the following datatypes are used that derive from Rdata (which contains the named list that is returned from the corresponding dada function): + +- dada2_derep (Rdata: named list see docs for derep-class) +- dada2_dada (Rdata: named list, see docs for dada-class) +- dada2_errorrates (Rdata: named list, see docs for learnErrors) +- dada2_mergepairs (Rdata: named list, see docs for mergePairs) + +For the outputs of makeSequenceTable and removeBimeraDenovo the following data types are used which derive from tabular: + +- dada2_uniques +-- in R a named integer vector (names are the unique sequences) +-- in Galaxy written as a table (each row corresponding to a unique sequence, column 1: the sequence, column 2: the count) +- dada2_sequencetable +-- in R a named integer matrix (rows = samples, columns = unique sequences) +-- in Galaxy written as a table (rows = unique sequences, columns = samples) + +Note the difference between the R and Galaxy representations! The main motivation is that the dada2_sequencetable is analogous to OTU tables as produced for instance by qiime (and it seemed natural to extend this to the uniques which are essentially a sequencetables of single samples). + + +TODOs +===== + +- implememt getUniques tool to view intermediate results? +- implement tests for cached reference data
--- a/dada2_learnErrors.xml Mon Apr 29 09:48:19 2019 -0400 +++ b/dada2_learnErrors.xml Sun May 05 12:22:22 2019 -0400 @@ -17,7 +17,7 @@ nthreads <- as.integer(args[1]) files <- c() -#for $read in $reads: +#for $read in $fls: files <- c(files, '$read') #end for @@ -29,24 +29,26 @@ saveRDS(err, file='$errors') ## generate error plots -plot <- plotErrors(err, obs = $plotopt.obs, err_out = $plotopt.errout, err_in = $plotopt.errin, nominalQ = $plotopt.nominalQ) +plot <- plotErrors(err, obs = $plotopt.obs, err_out = $plotopt.err_out, err_in = $plotopt.err_in, nominalQ = $plotopt.nominalQ) ggsave('plot.pdf', plot, width = 20,height = 15,units = c("cm")) ]]></configfile> </configfiles> <inputs> - <param name="reads" type="data" multiple="true" format="fastqsanger,fastqsanger.gz" label="Short read data" help="forward or reverse reads should be processed separately"/> + <param argument="fls" type="data" multiple="true" format="fastqsanger,fastqsanger.gz" label="Short read data" help="forward or reverse reads should be processed separately"/> <param argument="nbases" type="integer" value="8" min="0" label="Magnitide of number of bases to use for learning"/> <section name="advanced" title="Advanced Option"> <expand macro="errorEstimationFunction"/> - <param argument="randomize" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Randomize samples"/> - <param name="maxconsist" argument="MAX_CONSIST" type="integer" value="10" min="0" label="Maximum number of times to step through the selfconsistency loop" help=""/> - <param name="omegac" argument="OMEGA_C" type="integer" value="0" min="0" label="Threshold at which unique sequences inferred to contain errors are corrected" help=""/> + <param argument="randomize" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Randomize samples" help="Pick samples at random, otherwise samples are read in the provided order until enough reads are obtained (default)."/> + <param name="maxconsist" argument="MAX_CONSIST" type="integer" value="10" min="0" label="Maximum number of times to step through the selfconsistency loop" help="If convergence was not reached in MAX_CONSIST steps, the estimated error rates in the last step are returned."/> + <param name="omegac" argument="OMEGA_C" type="float" value="0" min="0" label="Threshold at which unique sequences inferred to contain errors are corrected" help="For reasons of convergence, and because it is +more conservative, it is recommended to set this value to 0, which means that +all reads are counted and contribute to estimating the error rates."/> </section> <section name="plotopt" title="Plotting Option"> - <param name="obs" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot observed error rates"/> - <param name="errout" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot output error rates"/> - <param name="errin" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Plot input error rates"/> - <param name="nominalQ" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot expected error rates"/> + <param argument="obs" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot observed error rates"/> + <param argument="err_out" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot output error rates"/> + <param argument="err_in" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="Plot input error rates"/> + <param argument="nominalQ" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE" label="Plot expected error rates"/> </section> </inputs> <outputs> @@ -55,25 +57,30 @@ </outputs> <tests> <test> - <param name="reads" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/> + <param name="fls" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/> <output name="errors" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates"/> <output name="plot" value="learnErrors_F3D0_R1.pdf" ftype="pdf" /> </test> <!-- test for creating input for dada results for reverse, not needed for testing --> <test> - <param name="reads" value="filterAndTrim_F3D0_R2.fq.gz" ftype="fastqsanger.gz"/> + <param name="fls" value="filterAndTrim_F3D0_R2.fq.gz" ftype="fastqsanger.gz"/> <output name="errors" value="learnErrors_F3D0_R2.Rdata" ftype="dada2_errorrates"/> <output name="plot" value="learnErrors_F3D0_R2.pdf" ftype="pdf" /> </test> <!-- test w non-default parameters --> <test> - <param name="reads" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/> + <param name="fls" value="filterAndTrim_F3D0_R1.fq.gz" ftype="fastqsanger.gz"/> + <param name="nbases" value="6" /> + <param name="advanced|errfoo" value="noqualErrfun" /> + <param name="advanced|randomize" value="TRUE" /> + <param name="advanced|maxconsist" value="5" /> + <param name="advanced|omegac" value="1e-10" /> <param name="plotopt|obs" value="FALSE" /> - <param name="plotopt|errout" value="FALSE" /> - <param name="plotopt|errin" value="TRUE" /> + <param name="plotopt|err_out" value="FALSE" /> + <param name="plotopt|err_in" value="TRUE" /> <param name="plotopt|nominalQ" value="FALSE"/> - <output name="errors" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates" /> - <output name="output" value="learnErrors_F3D0_R1.pdf" ftype="pdf" compare="sim_size" /> + <output name="errors" value="learnErrors_F3D0_R1.Rdata" ftype="dada2_errorrates" compare="sim_size" delta="14000"/> + <output name="plot" value="learnErrors_F3D0_R1.pdf" ftype="pdf" compare="sim_size" /> </test> <!-- TODO test w multiple inputs --> </tests> @@ -83,6 +90,8 @@ Error rates are learned by alternating between sample inference and error rate estimation until convergence. Additionally a plot is generated that shows the observed frequency of each transition (eg. A->C) as a function of the associated quality score, the final estimated error rates (if they exist), the initial input rates, and the expected error rates under the nominal definition of quality scores. +In addition a plot is generated (with plotErrors) that shows the observed frequency of each transition (eg. A->C) as a function of the associated quality score. Also the final estimated error rates (if they exist) are shown. Optionally also the initial input rates and the expected error rates under the nominal definition of quality scores can be added to the plot. + Usage ..... @@ -103,6 +112,12 @@ It is expected that the estimated error rates (black lines in the plot) are in a good fit to the observed rates (points in the plot), and that the error rates drop with increased quality. Try to increase the **number of bases to use for learning** if this is not the case. +Error functions: + +- loessErrfun: accepts a matrix of observed transitions, with each transition corresponding to a row (eg. row 2 = A->C) and each column to a quality score (eg. col 31 = Q30). It returns a matrix of estimated error rates of the same shape. Error rates are estimates by a loess fit of the observed rates of each transition as a function of the quality score. Self-transitions (i.e. A->A) are taken to be the left-over probability. +- noqualErrfun: accepts a matrix of observed transitions, groups together all observed transitions regardless of quality scores, and estimates the error rate for that transition as the observed fraction of those transitions. The effect is that quality scores will be effectively ignored. +- PacBioErrfun: This function accepts a matrix of observed transitions from PacBio CCS amplicon sequencing data, with each transition corresponding to a row (eg. row 2 = A->C) and each column to a quality score (eg. col 31 = Q30). It returns a matrix of estimated error rates of the same shape. Error rates are estimates by loessErrfun for quality scores 0-92, and individually by the maximum likelihood estimate for the maximum quality score of 93. + @HELP_OVERVIEW@ ]]></help> <expand macro="citations"/>
--- a/macros.xml Mon Apr 29 09:48:19 2019 -0400 +++ b/macros.xml Sun May 05 12:22:22 2019 -0400 @@ -25,23 +25,44 @@ <token name="@DADA_UNIQUES@">dada2_derep,dada2_dada,dada2_mergepairs</token> + <!-- function to read dada2 data types + - derep, dada, and mergepairs are simply read as RDS + - sequence_table is a named integer matrix (rows=samples, columns=ASVs) + - uniques is a named integer vector (columns=ASVs, only one rows)--> <token name="@READ_FOO@"><![CDATA[ + read.uniques <- function ( fname ) { + p <- read.table(fname, header=F, sep="\t") + n <-x[,2] + names(n)<-x[,1] + } #def read_data($dataset) - #if $dataset.is_of_type('dada2_derep') - readRDS('$dataset) - #else if $dataset.is_of_type('dada2_dada') + #if $dataset.is_of_type('dada2_sequencetable') + t(as.matrix( read.table('$dataset', header=T, sep="\t", row.names=1) )) + #else if $dataset.is_of_type('dada2_uniques') + read.uniques('$dataset') + #else if $dataset.is_of_type('tabular') + read.table('$dataset', header=T, sep="\t", row.names=1) + #else readRDS('$dataset') - #else if $dataset.is_of_type('dada2_sequencetable') - as.matrix( read.table('$dataset', header=T, sep="\t", row.names=1) ) - #else if $dataset.is_of_type('dada2_mergepairs') - readRDS('$dataset') - #else if $dataset.is_of_type('tabular') - read.table('$dataset', header=T, sep="\t", row.names=1 ) - #else - #raise Exception("error: unknown input type") #end if #end def ]]></token> + <!-- function to write dada2 data types (the content or the R variable 'out' is written) + - derep, dada, and mergepairs are written as RDS + - sequence_table is a named integer matrix (rows=samples, columns=ASVs) + - uniques is a named integer vector (columns=ASVs, only one rows)--> + <token name="@WRITE_FOO@"><![CDATA[ +write.data <- function( data, fname, type ){ + if( type == 'dada2_uniques'){ + write.table(data, file = fname, quote = F, sep = "\t", row.names = T, col.names = F) + }else if( type== 'dada2_sequencetable'){ + write.table(t(data), file=fname, quote=F, sep="\t", row.names = T, col.names = NA) + }else{ + saveRDS(data, file=fname) + } +} + ]]></token> + <!-- for filterAndTrim --> <xml name="trimmers"> <section name="trim" title="Trimming parameters"> @@ -63,9 +84,9 @@ <xml name="errorEstimationFunction"> <param name="errfoo" argument="errorEstimationFunction" type="select" label="Error function"> - <option value="loessErrfun">loess</option> - <option value="noqualErrfun">noqual</option> - <option value="PacBioErrfun">PacBio</option> + <option value="loessErrfun">loess: Use a loess fit to estimate error rates from transition counts</option> + <option value="noqualErrfun">noqual: Estimate error rates for each type of transition while ignoring quality scores.</option> + <option value="PacBioErrfun">PacBio: Estimate error rates from transition counts in PacBio CCS data.</option> </param> </xml> <token name="@HELP_OVERVIEW@"><