Mercurial > repos > jdv > gottcha

diff gottcha.xml @ 0:2569a83977f5 draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/gottcha commit 5d24210279e26623ae6c98f7551e3565fdc9bc48
author: jdv
date: Mon, 30 Jan 2017 19:07:21 -0500
children: 87efdde6105f
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gottcha.xml	Mon Jan 30 19:07:21 2017 -0500
@@ -0,0 +1,267 @@
+<tool id="gottcha" name="GOTTCHA" version="0.0.1">
+
+    <description>Read-based metagenome characterization</description>
+
+    <!-- ***************************************************************** -->
+
+    <requirements>
+        <requirement type="package" version="1.0b-564cf3b">gottcha</requirement>
+    </requirements>
+
+    <!-- ***************************************************************** -->
+
+    <version_command>gottcha.pl -h | perl -wnE'print "$1\n" for /VERSION: (\S+)/g'</version_command>
+
+    <!-- ***************************************************************** -->
+
+    <command detect_errors="aggressive">
+    <![CDATA[
+
+    gottcha.pl
+
+        --input    '${fn_in}'
+        --database '${db.fields.path}'
+        --threads  '\${GALAXY_SLOTS:-1}'
+        --outdir   './'
+        --prefix   results
+
+    ##--General Options------------------------------
+
+        --relAbu   ${general.rel_abund}
+        --mode ${general.output_full}
+        ${general.filt_plasmid}
+
+    ##--Split-trim Options---------------------------
+
+        --minQ     ${split.min_qual}
+        --fixL     ${split.fixed_len}
+        --ascii    ${split.qual_offset}
+
+    ##--Filtering Options----------------------------
+
+        --minCov   ${filter.min_cov}
+        --minMLHL  ${filter.min_mlhl}
+        --cCov     ${filter.c_cov}
+        --minLen   ${filter.min_len}
+        --minHits  ${filter.min_hits}
+
+    ]]>
+    </command>
+
+    <!-- ***************************************************************** -->
+
+    <inputs>
+
+        <param name="fn_in" type="data" format="fastq" label="Input reads" help="--input" />
+        <param name="db" type="select" label="Select a reference database" help="--database">
+            <options from_data_table="gottcha_indices">
+                <filter type="sort_by" column="2"/>
+                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
+            </options>
+        </param>
+
+        <section name="general" title="General Options" expanded="True">
+            <param name="rel_abund" type="select" label="Abundance field" help="--relAbu">
+                <option value="LINEAR_DOC" selected="true">Linear DOC</option>
+                <option value="LINEAR_LENGTH">Linear length</option>
+                <option value="TOTAL_BP_MAPPED">Total bp mapped</option>
+                <option value="HIT_COUNT">Hit count</option>
+            </param>
+            <param name="output_full" type="boolean" truevalue="full" falsevalue="summary" checked="no" label="Output full report" help="--mode full" />
+            <param name="filt_plasmid" type="boolean" truevalue="--noPlasmidHit" falsevalue="" checked="no" label="Filter plasmid hits" help="If true, ignore alignments to plasmids" />
+        </section>
+
+        <section name="split" title="Split-trim Options" expanded="False">
+            <param name="min_qual" size="4" type="integer" value="20" min="0" max="41" label="Minimum quality" help="Minimum quality for a read to be considered valid (0-41)" />
+            <param name="fixed_len" size="4" type="integer" value="30" min="1" label="Trim length" help="Fixed length to which each read will be trimmed" />
+            <param name="qual_offset" type="select" label="Quality offset" help="Base call quality offset for ASCII encoding">
+                <option value="33" selected="true">33</option>
+                <option value="64">64</option>
+            </param>
+        </section>
+
+        <section name="filter" title="Filtering Options" expanded="False">
+            <param name="min_cov" size="5" type="float" value="0.005" min="0" label="Minimum coverage" help="Minimum linear coverage to be considered valid in abundance calculation" />
+            <param name="min_mlhl" size="4" type="integer" value="5" min="0" label="Minimum MLHL" help="Minimum mean-linear-hit-length to be considered valid in abundance calculations"  />
+            <param name="c_cov" size="5" type="float" value="0.006" min="0" label="Critical coverage for MLHL" help="Critical coverage below which Minimum MLHL will cause an organism to fail" />
+            <param name="min_len" size="4" type="integer" value="100" min="0" label="Minimum length" help="Minimum unique length to be considered valid in abundance calculation" />
+            <param name="min_hits" size="4" type="integer" value="10" min="0" label="Minimum hits" help="Minimum number of hits to be considered valid in abundance calculation" />
+        </section>
+
+    </inputs>
+
+    <!-- ***************************************************************** -->
+
+    <outputs>
+
+        <data name="out_log" format="txt" label="GOTTCHA on ${on_string}: Log" from_work_dir="results.gottcha.log" />
+        <data name="out_tsv" format="txt" label="GOTTCHA on ${on_string}: Summary" from_work_dir="results.gottcha.tsv" />
+        <data name="out_full" format="txt" label="GOTTCHA on ${on_string}: Full Report" from_work_dir="results.gottcha_full.tsv">
+            <filter>output_full</filter>
+        </data>
+
+    </outputs>
+
+    <!-- ***************************************************************** -->
+
+    <tests>
+        <test>
+            <param name="db" value="test_db" />
+            <param name="fn_in" ftype="fastq" value="test.fq" />
+            <param name="output_full" value="no" />
+            <param name="min_hits" value="1" />
+            <output name="out_tsv" file="test_02.tsv" />
+            <output name="out_log" file="test_02.log" compare="sim_size" delta="2000"/>
+        </test>
+    </tests>
+
+    <!-- ***************************************************************** -->
+
+    <help>
+    <![CDATA[
+
+.. class:: infomark
+
+Description
+--------------------
+
+Genomic Origin Through Taxonomic CHAllenge (GOTTCHA) is an
+annotation-independent and signature-based metagenomic taxonomic profiling
+tool that has significantly smaller FDR than other profiling tools. This Perl
+script is a wrapper to run the GOTTCHA profiling tool with pre-computed
+signature databases. The procedure includes 3 major steps: split-trimming the
+input data, mapping reads to a GOTTCHA database using BWA, profiling/filtering
+the result.
+
+Options
+--------------------
+::
+
+    --relAbu|r   <STRING>  The field will be used to calculate relative
+                           abundance. You can specify one of the following
+                           fields: "LINEAR_LENGTH", "TOTAL_BP_MAPPED",
+                           "HIT_COUNT", "LINEAR_DOC".
+                           [default: LINEAR_DOC]
+    --mode|m     <STRING>  You can specify one of the output mode:
+                           "summary" : this mode will report a summary of
+                                       profiling result to *.gottcha.tsv file.
+                           "full"    : other than a summary, this mode will
+                                       report unfiltered result to
+                                       *.gottcha_full.tsv with more detail.
+                           "all"     : other than two tables, this mode will 
+                                       keep all output files that were 
+                                       generated by each profiling step.
+                           [default: summary]
+    --noPlasmidHit|n       Ignore alignments that hit to plasmids
+                           [default: null]
+
+  *** OPTIONS FOR SPLIT-TRIMMING READS ***
+
+    --minQ       <INT>     Minimum quality for a read to be considered valid
+                           (0-41) [default: 20]
+    --fixL       <INT>     Fixed length to which each trimmed read will be cut 
+                           down to [default: 30]
+    --ascii      <INT>     ASCII encoding of quality score (33 or 64) [default: 
+                           33] 
+
+  *** OPTIONS FOR FILTERING PROFILING RESULT ***
+
+    --minCov     <FLOAT>   Minimum linear coverage to be considered valid in 
+                           abundance calculation [default: 0.005]
+    --minMLHL    <INT>     Minimum Mean-Linear-Hit-Length to be considered valid
+                           in abundance calculation [default: 5]
+    --cCov       <FLOAT>   Critical coverage below which --minMLHL will cause an
+                           organism to fail [default: 0.006]
+    --minLen     <INT>     Minimum unique length to be considered valid in
+                           abundance calculation [default: 100]
+    --minHits    <INT>     Minimum number of hits to be considered valid in 
+                           abundance calculation [10]
+
+Interpreting Results
+--------------------
+
+GOTTCHA reports profiling results in a neat summary table
+by default. The tsv file will list the organism(s) at all taxonomic
+levels from STRAIN to PHYLUM, their linear length, total bases mapped,
+linear depth of coverage, and the normalized linear depth of coverage. The
+linear depth of coverage (LINEAR_DOC) is used to calculate relative
+abundance of each organism or taxonomic name in the sample.
+
+Summary table:
+
+=================  ==============================
+Column             Description
+=================  ==============================
+LEVEL              taxonomic rank
+NAME               taxonomic name
+REL_ABUNDANCE      relative abundance (equivalent to NORM_COV by default)
+LINEAR_LENGTH      number of non-overlapping bases covering the signatures
+TOTAL_BP_MAPPED    sum total of all hit lengths recruited to signatures
+HIT_COUNT          number of hits recruited to signatures
+HIT_COUNT_PLASMID  number of hits recruited to signatures
+READ_COUNT         number of reads recruited to signatures
+LINEAR_DOC         linear depth-of-coverage (TOTAL_BP_MAPPED / LINEAR_LENGTH)
+NORM_COV           normalized linear depth-of-coverage (LINEAR_DOC / SUM(LINEAR_DOC in certain level))
+=================  ==============================
+
+Other than a summary table, "full" report mode will report a table with more
+detail information from unfiltered results.  The explanation of each column in
+the full report is as follows:
+
+==================================  ==========================
+Column                              Description
+==================================  ==========================
+RANKNAME                            (REPLICON)  = replicon name (source + plasmid/chr)<br>(STRAIN)    = strain name<br>(SPECIES)   = species name<br>(GENUS)     = genus name<br>...
+NUM_SUBRANKS                        no. of distinct subranks for the current rank<br>(E.g.  the no. of SPECIES under the current GENUS)
+GPROJ_ENTRIES                       no. of genome projects (i.e. STRAINS) under this RANK NAME
+LINEAR_LENGTH                       N/O_LENGTH<br>= non-overlapping length <br>= no. of non-overlapping bases covering the unique DB
+UNIQUE_DB_LENGTH                    no. of unique bases for this organism
+FULL_REFDB_LENGTH                   no. of bases in full reference
+LINEAR_COV                          LINEAR_LENGTH / UNIQUE_DB_LENGTH
+HIT_COUNT                           no. of hits recruited to genome
+HIT_COUNT_PLASMID                   no. of hits recruited to plasmid
+READ_COUNT                          no. of reads recruited to genome
+FULL_HIT_COUNT                      no. of full-length read hits recruited to genome
+TOTAL_BP_MAPPED                     sum total of all hit lengths recruited to genome<br>= hit1.length + hit2.length + ... hitX.length<br>[formerly FOLD_COV_UNIQUE_SAMPLE]
+LINEAR_DOC                          linear depth-of-coverage<br>= fold coverage of sample's LINEAR_LENGTH <br>= TOTAL_BP_MAPPED / LINEAR_LENGTH<br>[formerly FOLD_COV_UNIQUE_REFDB]
+UREF_DOC                            unique reference's depth-of-coverage<br>= fold coverage of reference's UNIQUE_DB_LENGTH<br>= TOTAL_BP_MAPPED / UNIQUE_DB_LENGTH
+UREF_CMAX                           MAX COVERAGE OF REFDB POSSIBLE, GIVEN SAMPLE INPUT BASES<br>= Cmax = L0/l0 <br>= TOTAL_INPUT_BASES / UNIQUE_DB_LENGTH
+FRAC_HITS_POSSIBLE                  HIT_COUNT / TOTAL_INPUT_READS
+FRAC_BASES_POSSIBLE                 TOTAL_BP_MAPPED / TOTAL_INPUT_BASES
+MEAN_HIT_LENGTH                     TOTAL_BP_MAPPED / HIT_COUNT
+MEAN_LINEAR_HIT_LENGTH              LINEAR_LENGTH / HIT_COUNT
+best_SUBRANK                        name of the best subrank (determined by the highest LINEAR_COV)
+best_NUM_SUBRANKS                   no. of subranks supporting current "SUBRANK"<br>{SS} = no. of GI entries supporting this strain<br>{S}  = no. of strains supporting this species<br>{G}  = no. of species supporting this genus<br>{F}  = no. of genera supporting this family<br>{O}  = no. of families supporting this order <br>{C}  = no. of orders supporting this class<br>{P}  = no. of classes supporting this phylum
+best_GPROJ_ENTRIES                  no. of genome projects (i.e. STRAINS) under this best_SUBRANK<br>{SS} = no. of genome projects supporting this strain = 1<br>{S}  = no. of genome projects supporting this species<br>{G}  = no. of genome projects supporting this genus<br>{F}  = no. of genome projects supporting this family<br>{O}  = no. of genome projects supporting this order<br>{C}  = no. of genome projects supporting this class<br>{P}  = no. of genome projects supporting this phylum
+best_LINEAR_LENGTH                 
+best_UNIQUE_DB_LENGTH              
+best_FULL_REFDB_LENGTH             
+best_LINEAR_COV                    
+best_HIT_COUNT                     
+best_FULL_HIT_COUNT                
+best_TOTAL_BP_MAPPED               
+best_LINEAR_DOC (a.k.a. Abundance) 
+best_UREF_DOC                      
+best_UREF_CMAX                     
+best_FRAC_HITS_POSSIBLE            
+best_FRAC_BASES_POSSIBLE           
+best_MEAN_HIT_LENGTH               
+best_MEAN_LINEAR_HIT_LENGTH        
+CONTIG_COUNT                        no. of contiguous fragments<br> (after mapping & generating non-overlapping fragments)
+CONTIG_MEAN_LEN                     mean length of contigs (bp)
+CONTIG_STDEV_LEN                    standard deviation of contig lengths (bp)
+CONTIG_MINLEN                       length of smallest contig
+CONTIG_MAXLEN                       length of largest contig
+CONTIG_HISTOGRAM(LEN:FREQ)          contig Length Histogram<br> (in the format contigLength:frequency)
+==================================  ==========================
+
+    ]]>
+    </help>
+
+    <!-- ***************************************************************** -->
+    
+    <citations>
+        <citation type="doi">10.1093/nar/gkv180</citation>
+    </citations>
+
+</tool>
author	jdv
date	Mon, 30 Jan 2017 19:07:21 -0500
parents
children	87efdde6105f