diff dada2_makeSequenceTable.xml @ 2:d2e7c5f8a9f7 draft

planemo upload for repository https://github.com/bernt-matthias/mb-galaxy-tools/tree/topic/dada2/tools/dada2 commit 5b1603bbcd3f139cad5c876be83fcb39697b5613-dirty
author matthias
date Tue, 09 Apr 2019 07:10:43 -0400
parents 98e24c66eeb2
children c3834c230b0a
line wrap: on
line diff
--- a/dada2_makeSequenceTable.xml	Fri Mar 08 08:43:09 2019 -0500
+++ b/dada2_makeSequenceTable.xml	Tue Apr 09 07:10:43 2019 -0400
@@ -1,4 +1,4 @@
-<tool id="dada2_makeSequenceTable" name="dada2: makeSequenceTable" version="@DADA2_VERSION@">
+<tool id="dada2_makeSequenceTable" name="dada2: makeSequenceTable" version="@DADA2_VERSION@+galaxy@WRAPPER_VERSION@">
     <description>construct a sequence table (analogous to OTU table)</description>
     <macros>
         <import>macros.xml</import>
@@ -13,7 +13,7 @@
 @READ_FOO@
 
 library(dada2, quietly=T)
-#if $filter.plot == "yes"
+#if $plot == "yes"
 library(ggplot2, quietly=T)
 #end if
 
@@ -28,21 +28,24 @@
 ## make sequence table
 seqtab <- makeSequenceTable(samples, orderBy = "$orderby")
 
-## get and plot length distribution
-seqlen <- data.frame(length = nchar(getSequences(seqtab)))
-seqlenq <- quantile(seqlen\$length, probs=c( $filter.plower, $filter.pupper  ))
+
+reads.per.seqlen <- tapply(colSums(seqtab), factor(nchar(getSequences(seqtab))), sum)
+df <- data.frame(length=as.numeric(names(reads.per.seqlen)), count=reads.per.seqlen)
 
-#if $filter.plot == "yes"
-pdf( '$plot' )
-ggplot(seqlen) + 
-    geom_histogram( aes(x=length), binwidth=1 ) + 
-    geom_vline(xintercept=c(seqlenq[1]-0.5, seqlenq[2]+0.5))
+#if $plot == "yes"
+pdf( '$plot_output' )
+ggplot(data=df, aes(x=length, y=count)) + 
+    geom_col() + 
+#if $filter_cond.filter_select != "no"
+    geom_vline( xintercept=c($filter_cond.min-0.5, $filter_cond.max+0.5) ) + 
+#end if
+    theme_bw()
 bequiet <- dev.off()
 #end if
 
 ## filter by seqlengths
-#if $filter.filter == "yes"
-seqtab <- seqtab[,nchar(colnames(seqtab)) %in% seqlenq]
+#if $filter_cond.filter_select != "no"
+    seqtab <- seqtab[, nchar(colnames(seqtab)) %in% seq($filter_cond.min, $filter_cond.max)]
 #end if
 
 write.table(seqtab, "$stable", quote=F, sep="\t", row.names = T, col.names = NA)
@@ -54,22 +57,57 @@
             <option value="abundance">abundance</option>
             <option value="nsamples">nsamples</option>
         </param>
-        <section name="filter" title="Plot and filter sequence lengths">
-            <param name="plower" type="float" min="0" max="1" value="0.01" label="lower quantile" />
-            <param name="pupper" type="float" min="0" max="1" value="0.99" label="upper quantile" />
-            <param name="plot" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="plot sequence length distribution" />
-            <param name="filter" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="filter sequence length using quantiles" />
-	</section>
+        <conditional name="filter_cond">
+            <param name="filter_select" type="select" label="Filter method">
+                <option value="no">No filter</option>
+                <option value="minmax">Specify minimum and maximum sequence lengths</option>
+            </param>
+            <when value="no"/>
+            <when value="minmax">
+                <param name="min" type="integer" value="" label="Minimum sequence length"/>
+                <param name="max" type="integer" value="" label="Maximum sequence length"/>
+            </when>
+        </conditional>
+        <param name="plot" type="boolean" truevalue="yes" falsevalue="no" checked="true" label="plot sequence length distribution" />
     </inputs>
     <outputs>
         <data name="stable" format="dada2_sequencetable" label="${tool.name} on ${on_string}"/>
-        <data name="plot" format="pdf" label="${tool.name} on ${on_string}: sequence length distribution">
-            <filter>filter['plot']</filter>
-	</data>
+        <data name="plot_output" format="pdf" label="${tool.name} on ${on_string}: sequence length distribution">
+            <filter>plot</filter>
+        </data>
     </outputs>
 
     <help><![CDATA[
-        TODO: Fill in help.
+This function constructs a sequence table (analogous to an OTU table) from the provided list of
+samples.
+
+Custom Reference data sets
+--------------------------
+
+For **taxonomy assignment** the following is needed:
+
+- a reference fasta database
+- a comma separated list of taxonomic ranks present in the reference database
+
+The reference fasta database for taxonomic assignment (fasta or compressed fasta) needs to encode the taxonomy corresponding to each sequence in the fasta header lines in the following fashion (note that the second sequence is not assigned down to level 6):
+
+::
+
+    >Level1;Level2;Level3;Level4;Level5;Level6;
+    ACCTAGAAAGTCGTAGATCGAAGTTGAAGCATCGCCCGATGATCGTCTGAAGCTGTAGCATGAGTCGATTTTCACATTCAGGGATACCATAGGATAC
+    >Level1;Level2;Level3;Level4;Level5;
+    CGCTAGAAAGTCGTAGAAGGCTCGGAGGTTTGAAGCATCGCCCGATGGGATCTCGTTGCTGTAGCATGAGTACGGACATTCAGGGATCATAGGATAC
+
+The list of required taxonomic ranks could be, for instance: "Kingdom,Phylum,Class,Order,Family,Genus"
+
+The reference database for **species assignment** is a fasta file (or compressed fasta file), with the id line formatted as follows:
+
+::
+
+    >ID Genus species
+    ACCTAGAAAGTCGTAGATCGAAGTTGAAGCATCGCCCGATGATCGTCTGAAGCTGTAGCATGAGTCGATTTTCACATTCAGGGATACCATAGGATAC
+    >ID Genus species
+    CGCTAGAAAGTCGTAGAAGGCTCGGAGGTTTGAAGCATCGCCCGATGGGATCTCGTTGCTGTAGCATGAGTACGGACATTCAGGGATCATAGGATAC
     ]]></help>
     <expand macro="citations"/>
 </tool>