Mercurial > repos > mvdbeek > wtdbg2

diff wtdbg2.xml @ 0:7bb3cf8b9a5d draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/wtdbg2 commit e2f82dabda7848017302214b99404c8466351b08
author: mvdbeek
date: Mon, 26 Nov 2018 07:51:32 -0500
children: fe60b4299555
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wtdbg2.xml	Mon Nov 26 07:51:32 2018 -0500
@@ -0,0 +1,98 @@
+<tool id="wtdbg2" name="wtdbg" version="2.2">
+    <description>De novo assembler for long noisy sequences</description>
+    <requirements>
+        <requirement type="package" version="2.2">wtdbg</requirement>
+        <!-- NOTE(review): the command section also calls minimap2 and samtools, but their
+             requirements are disabled inside this comment. The original note follows;
+             confirm whether the packages can be enabled now.
+             Update this once 2.2 is released, need to create env manually for now
+        <requirement type="package" version="1.9">samtools</requirement>
+        <requirement type="package" version="2.14">minimap2</requirement>
+        -->
+    </requirements>
+    <version_command>wtdbg2 -V</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+## Step 1: assemble the reads, using either a preset (-x) or the manually entered options.
+wtdbg2 -t \${GALAXY_SLOTS:-1}
+-i '$inputs'
+-fo prefix
+#if str($preset.select_preset) != 'manual'
+    -x '$preset.select_preset'
+#else
+    -k $preset.kmer_length
+    -p $preset.hpc_kmer_length
+    -K $preset.high_freq_kmer
+    -E $preset.min_kmer_freq
+    -S $preset.sampling_rate
+    -l $preset.min_length_alignment
+    -m $preset.min_length_kmer_matching
+    ## The boolean parameter itself renders as "-A" or as an empty string.
+    $preset.keep_contained_reads_during_alignment
+    -s $preset.min_similarity
+    -L $preset.longest_subread
+    -e $preset.minimum_coverage_valid_edge
+    -g '$preset.approximate_genome_size'
+    -X $preset.best_layout_depth
+#end if
+## Step 2: derive the consensus sequences from the contig layout.
+&& wtpoa-cns -t \${GALAXY_SLOTS:-1} -i prefix.ctg.lay -fo prefix.ctg.lay.fa
+#if $polish_consensus
+    ## Step 3 (optional): map the reads back onto the consensus, sort the alignments and polish.
+    ## The mapping is only consumed by the polishing step, so it is skipped otherwise.
+    && minimap2 -t \${GALAXY_SLOTS:-1} -x map-pb -a prefix.ctg.lay.fa '$inputs' | samtools view -Sb - > prefix.ctg.lay.map.bam
+    && samtools sort -o prefix.ctg.lay.map.srt.bam prefix.ctg.lay.map.bam
+    && samtools view prefix.ctg.lay.map.srt.bam | wtpoa-cns -t \${GALAXY_SLOTS:-1} -d prefix.ctg.lay.fa -i - -fo prefix.ctg.lay.2nd.fa
+#end if
+    ]]></command>
+    <inputs>
+        <!-- Reads to assemble. -->
+        <param name="inputs" type="data" format="fasta,fastq,fasta.gz,fastq.gz" label="Input reads" help="Select input reads"/>
+        <!-- Either pick one of the wtdbg2 presets or choose "manual" to expose the individual options. -->
+        <conditional name="preset">
+            <param name="select_preset" type="select" label="Preset">
+                <option value="rsII/rs">PacBio RSII (-p 21 -S 4 -s 0.05 -L 5000)</option>
+                <option value="sequel/sq">Pacbio Sequel (-p 0 -k 15 -AS 2 -s 0.05 -L 5000)</option>
+                <option value="nanopore/ont">Nanopore (-p 19 -AS 2 -s 0.05 -L 5000)</option>
+                <option value="corrected/ccs">Corrected Reads (-p 0 -k 19 -AS 4 -s 0.5 -L 5000)</option>
+                <option value="manual">Specify Options</option>
+            </param>
+            <!-- The presets expose no further parameters. -->
+            <when value="rsII/rs"/>
+            <when value="sequel/sq"/>
+            <when value="nanopore/ont"/>
+            <when value="corrected/ccs"/>
+            <!-- Each parameter below maps to the wtdbg2 option named in its "argument" attribute. -->
+            <when value="manual">
+                <param name="kmer_length" argument="-k" type="integer" value="0" min="0" max="25" label="kmer fsize"/>
+                <param name="hpc_kmer_length" argument="-p" type="integer" value="0" min="0" max="25" label="kmer psize"/>
+                <param name="high_freq_kmer" argument="-K" type="float" value="1000.05" label="Filter high frequency kmers, maybe repetitive" help=">= 1000 and indexing >= (1 - 0.05) * total_kmers_count"/>
+                <param name="min_kmer_freq" argument="-E" type="integer" value="2" label="Min kmer frequency"/>
+                <param name="sampling_rate" argument="-S" type="float" value="4.0" label="Subsampling kmers, 1/(S) kmers are indexed" help="-S is very useful in saving memory and speeding up. Please note that subsampling kmers will have less matched length"/>
+                <param name="min_length_alignment" argument="-l" type="integer" value="2048" label="Min length of alignment"/>
+                <param name="min_length_kmer_matching" argument="-m" type="integer" value="200" label="Min matched length by kmer matching"/>
+                <!-- Renders as the bare flag "-A" when checked and as an empty string otherwise. -->
+                <param name="keep_contained_reads_during_alignment" argument="-A" type="boolean" truevalue="-A" falsevalue="" label="Keep contained reads during alignment?"/>
+                <param name="min_similarity" argument="-s" type="float" label="Min similarity, calculated by kmer matched length" value="0.05" min="0" max="1"/>
+                <param name="longest_subread" argument="-L" type="integer" value="0" label="Choose the longest subread and drop reads shorter than this value"/>
+                <param name="minimum_coverage_valid_edge" argument="-e" type="integer" value="3" label="Min read depth of a valid edge"/>
+                <!-- NOTE(review): the label says Megabases but the value is passed to -g without a unit suffix; confirm the unit wtdbg2 expects. -->
+                <param name="approximate_genome_size" argument="-g" type="integer" value="0" label="Approximate genome size in Megabases"/>
+                <param name="best_layout_depth" argument="-X" type="integer" value="50" min="1" label="Select best reads for this amount of coverage"/>
+            </when>
+        </conditional>
+        <!-- Controls whether the optional polishing step of the command is run. -->
+        <param name="polish_consensus" type="boolean" label="Polish consensus" help="Select No if you plan to use a different tool for polishing the contig consensus sequences"/>
+    </inputs>
+    <outputs>
+        <!-- Contig layout written by wtdbg2 and the consensus built from it by wtpoa-cns. -->
+        <data name="contigs" format="txt" from_work_dir="prefix.ctg.lay"/>
+        <data name="consensus" format="fasta" from_work_dir="prefix.ctg.lay.fa"/>
+        <!-- The polished consensus file only exists when polishing was requested. -->
+        <data name="polished_consensus" format="fasta" from_work_dir="prefix.ctg.lay.2nd.fa">
+            <filter>polish_consensus</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Preset run with polishing enabled, so that all three outputs are produced and compared. -->
+        <test>
+            <param name="inputs" value="ecoli-reads.fasta" ftype="fasta"/>
+            <param name="preset|select_preset" value="rsII/rs"/>
+            <param name="polish_consensus" value="true"/>
+            <output name="contigs" value="contigs.txt"/>
+            <output name="consensus" value="consensus.fa"/>
+            <output name="polished_consensus" value="polished_consensus.fa"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+Wtdbg2 is a de novo sequence assembler for long noisy reads produced by PacBio or Oxford Nanopore Technologies (ONT). It assembles raw reads without error correction and then builds the consensus from intermediate assembly output. Wtdbg2 is able to assemble the human and even the 32Gb Axolotl genome at a speed tens of times faster than CANU and FALCON while producing contigs of comparable base accuracy.
+
+During assembly, wtdbg2 chops reads into 1024bp segments, merges similar segments into a vertex and connects vertices based on the segment adjacency on reads. The resulting graph is called fuzzy Bruijn graph (FBG). It is akin to De Bruijn graph but permits mismatches/gaps and keeps read paths when collapsing k-mers. The use of FBG distinguishes wtdbg2 from the majority of long-read assemblers.
+
+Wtdbg2 combines normal k-mers and homopolymer-compressed (HPC) k-mers to find read overlaps. Option -k specifies the length of normal k-mers, while -p specifies the length of HPC k-mers. By default, wtdbg2 samples a fourth of all k-mers by their hashcodes. For data of relatively low coverage, you may increase this sampling rate by reducing -S. This will greatly increase the peak memory as a cost, though. Option -e, which defaults to 3, specifies the minimum read coverage of an edge in the assembly graph. You may adjust this option according to the overall sequencing depth, too. Option -A also helps relatively low coverage data at the cost of performance. For PacBio data, -L5000 often leads to better assemblies empirically, so it is recommended.
+    ]]></help>
+    <citations>
+        <!-- Ruan and Li, "Fast and accurate long-read assembly with wtdbg2", Nature Methods. -->
+        <citation type="doi">10.1038/s41592-019-0669-3</citation>
+    </citations>
+</tool>
author	mvdbeek
date	Mon, 26 Nov 2018 07:51:32 -0500
parents
children	fe60b4299555