Mercurial > repos > jdv > gottcha

<tool id="gottcha" name="GOTTCHA" version="0.001">

    <description>Read-based metagenome characterization</description>

    <!-- ***************************************************************** -->

    <!-- External packages Galaxy must resolve before the job runs:
         BWA (used for the read-mapping step) and the pinned GOTTCHA build. -->
    <requirements>
        <requirement version="0.7.7" type="package">bwa</requirement>
        <requirement version="1.0b-564cf3b" type="package">gottcha</requirement>
    </requirements>

    <!-- ***************************************************************** -->

    <!-- Version probe: pipes the output of `gottcha.pl -h` through perl and
         prints the non-whitespace token that follows "VERSION: ". -->
    <version_command>gottcha.pl -h | perl -wnE'print "$1\n" for /VERSION: (\S+)/g'</version_command>

    <!-- ***************************************************************** -->

    <!-- Cheetah template for the gottcha.pl command line. Results are written
         to the job working directory with the prefix "results"; the outputs
         pick them up from there via their from_work_dir attributes. -->
    <command detect_errors="aggressive">
    <![CDATA[

    gottcha.pl

        --input    '${fn_in}'
        --database '${db.fields.path}'
        ## The slot count must be expanded by the shell at run time, so it is
        ## wrapped in double quotes: single quotes would hand the literal,
        ## unexpanded text to gottcha.pl. The backslash keeps Cheetah from
        ## treating it as a template variable.
        --threads  "\${GALAXY_SLOTS:-1}"
        --outdir   './'
        --prefix   results

    ##--General Options------------------------------

        --relAbu   ${general.rel_abund}
        --mode ${general.output_full}
        ${general.filt_plasmid}

    ##--Split-trim Options---------------------------

        --minQ     ${split.min_qual}
        --fixL     ${split.fixed_len}
        --ascii    ${split.qual_offset}

    ##--Filtering Options----------------------------

        --minCov   ${filter.min_cov}
        --minMLHL  ${filter.min_mlhl}
        --cCov     ${filter.c_cov}
        --minLen   ${filter.min_len}
        --minHits  ${filter.min_hits}

    ]]>
    </command>

    <!-- ***************************************************************** -->

    <inputs>

        <!-- Reads to profile -->
        <param name="fn_in"
               type="data"
               format="fastq"
               label="Input reads"
               help="--input" />

        <!-- Reference database, chosen from the gottcha_indices data table -->
        <param name="db"
               type="select"
               label="Select a reference database"
               help="--database">
            <options from_data_table="gottcha_indices">
                <filter column="2" type="sort_by"/>
                <validator type="no_options"
                           message="No indexes are available for the selected input dataset"/>
            </options>
        </param>

        <!-- Abundance field, report mode and plasmid handling -->
        <section name="general" title="General Options" expanded="True">
            <param name="rel_abund"
                   type="select"
                   label="Abundance field"
                   help="--relAbu">
                <option selected="true" value="LINEAR_DOC">Linear DOC</option>
                <option value="LINEAR_LENGTH">Linear length</option>
                <option value="TOTAL_BP_MAPPED">Total bp mapped</option>
                <option value="HIT_COUNT">Hit count</option>
            </param>
            <!-- The true/false values are substituted directly after the mode flag -->
            <param name="output_full"
                   type="boolean"
                   label="Output full report"
                   help="--mode full"
                   checked="no"
                   truevalue="full"
                   falsevalue="summary" />
            <!-- Emits the flag itself when checked, nothing otherwise -->
            <param name="filt_plasmid"
                   type="boolean"
                   label="Filter plasmid hits"
                   help="If true, ignore alignments to plasmids"
                   checked="no"
                   truevalue="--noPlasmidHit"
                   falsevalue="" />
        </section>

        <!-- Read split-trimming parameters -->
        <section name="split" title="Split-trim Options" expanded="False">
            <param name="min_qual"
                   type="integer"
                   size="4"
                   label="Minimum quality"
                   help="Minimum quality for a read to be considered valid (0-41)"
                   value="20"
                   min="0"
                   max="41" />
            <param name="fixed_len"
                   type="integer"
                   size="4"
                   label="Trim length"
                   help="Fixed length to which each read will be trimmed"
                   value="30"
                   min="1" />
            <param name="qual_offset"
                   type="select"
                   label="Quality offset"
                   help="Base call quality offset for ASCII encoding">
                <option selected="true" value="33">33</option>
                <option value="64">64</option>
            </param>
        </section>

        <!-- Thresholds applied when filtering the profiling result -->
        <section name="filter" title="Filtering Options" expanded="False">
            <param name="min_cov"
                   type="float"
                   size="5"
                   label="Minimum coverage"
                   help="Minimum linear coverage to be considered valid in abundance calculation"
                   value="0.005"
                   min="0" />
            <param name="min_mlhl"
                   type="integer"
                   size="4"
                   label="Minimum MLHL"
                   help="Minimum mean-linear-hit-length to be considered valid in abundance calculations"
                   value="5"
                   min="0" />
            <param name="c_cov"
                   type="float"
                   size="5"
                   label="Critical coverage for MLHL"
                   help="Critical coverage below which Minimum MLHL will cause an organism to fail"
                   value="0.006"
                   min="0" />
            <param name="min_len"
                   type="integer"
                   size="4"
                   label="Minimum length"
                   help="Minimum unique length to be considered valid in abundance calculation"
                   value="100"
                   min="0" />
            <param name="min_hits"
                   type="integer"
                   size="4"
                   label="Minimum hits"
                   help="Minimum number of hits to be considered valid in abundance calculation"
                   value="10"
                   min="0" />
        </section>

    </inputs>

    <!-- ***************************************************************** -->

    <outputs>

        <!-- The log and the summary table carry no filter, so they are always collected. -->
        <data name="out_log" format="txt" label="GOTTCHA on ${on_string}: Log" from_work_dir="results.gottcha.log" />
        <data name="out_tsv" format="txt" label="GOTTCHA on ${on_string}: Summary" from_work_dir="results.gottcha.tsv" />

        <!-- The full report is only wanted when "Output full report" is checked.
             output_full is declared inside the "general" section, so the filter
             expression has to reach it through that section. A bare name fails
             to evaluate, and a failing filter leaves the output in place. -->
        <data name="out_full" format="txt" label="GOTTCHA on ${on_string}: Full Report" from_work_dir="results.gottcha_full.tsv">
            <filter>general['output_full']</filter>
        </data>

    </outputs>

    <!-- ***************************************************************** -->

    <tests>
        <!-- Summary-mode run (full report off) against the "test_db" entry,
             with the minimum hit count lowered to 1. -->
        <test>
            <param value="test_db" name="db" />
            <param value="test.fq" name="fn_in" ftype="fastq" />
            <param value="no" name="output_full" />
            <param value="1" name="min_hits" />
            <!-- The summary table must match the reference file. -->
            <output file="test_02.tsv" name="out_tsv" />
            <!-- The log is compared by size only, with a tolerance of 2000. -->
            <output file="test_02.log" name="out_log" compare="sim_size" delta="2000"/>
        </test>
    </tests>

    <!-- ***************************************************************** -->

    <help>
    <![CDATA[

**Description**

Genomic Origin Through Taxonomic CHAllenge (GOTTCHA) is an
annotation-independent and signature-based metagenomic taxonomic profiling
tool that has significantly smaller FDR than other profiling tools. This Perl
script is a wrapper to run the GOTTCHA profiling tool with pre-computed
signature databases. The procedure includes 3 major steps: split-trimming the
input data, mapping reads to a GOTTCHA database using BWA, profiling/filtering
the result.

**Options**
::

    --relAbu|r   <STRING>  The field will be used to calculate relative
                           abundance. You can specify one of the following
                           fields: "LINEAR_LENGTH", "TOTAL_BP_MAPPED",
                           "HIT_COUNT", "LINEAR_DOC".
                           [default: LINEAR_DOC]
    --mode|m     <STRING>  You can specify one of the output mode:
                           "summary" : this mode will report a summary of
                                       profiling result to *.gottcha.tsv file.
                           "full"    : other than a summary, this mode will
                                       report unfiltered result to
                                       *.gottcha_full.tsv with more detail.
                           "all"     : other than two tables, this mode will
                                       keep all output files that were
                                       generated by each profiling step.
                           [default: summary]
    --noPlasmidHit|n       Ignore alignments that hit to plasmids
                           [default: null]

  *** OPTIONS FOR SPLIT-TRIMMING READS ***

    --minQ       <INT>     Minimum quality for a read to be considered valid
                           (0-41) [default: 20]
    --fixL       <INT>     Fixed length to which each trimmed read will be cut
                           down to [default: 30]
    --ascii      <INT>     ASCII encoding of quality score (33 or 64) [default:
                           33]

  *** OPTIONS FOR FILTERING PROFILING RESULT ***

    --minCov     <FLOAT>   Minimum linear coverage to be considered valid in
                           abundance calculation [default: 0.005]
    --minMLHL    <INT>     Minimum Mean-Linear-Hit-Length to be considered valid
                           in abundance calculation [default: 5]
    --cCov       <FLOAT>   Critical coverage below which --minMLHL will cause an
                           organism to fail [default: 0.006]
    --minLen     <INT>     Minimum unique length to be considered valid in
                           abundance calculation [default: 100]
    --minHits    <INT>     Minimum number of hits to be considered valid in
                           abundance calculation [10]

**Interpreting Results**

GOTTCHA reports profiling results in a neat summary table
by default. The tsv file will list the organism(s) at all taxonomic
levels from STRAIN to PHYLUM, their linear length, total bases mapped,
linear depth of coverage, and the normalized linear depth of coverage. The
linear depth of coverage (LINEAR_DOC) is used to calculate relative
abundance of each organism or taxonomic name in the sample.

Summary table:

=================  ==============================
Column             Description
=================  ==============================
LEVEL              taxonomic rank
NAME               taxonomic name
REL_ABUNDANCE      relative abundance (equivalent to NORM_COV by default)
LINEAR_LENGTH      number of non-overlapping bases covering the signatures
TOTAL_BP_MAPPED    sum total of all hit lengths recruited to signatures
HIT_COUNT          number of hits recruited to signatures
HIT_COUNT_PLASMID  number of hits recruited to plasmid signatures
READ_COUNT         number of reads recruited to signatures
LINEAR_DOC         linear depth-of-coverage (TOTAL_BP_MAPPED / LINEAR_LENGTH)
NORM_COV           normalized linear depth-of-coverage (LINEAR_DOC / SUM(LINEAR_DOC in certain level))
=================  ==============================

In addition to the summary table, the "full" report mode produces a table with
more detailed information from the unfiltered results.  The explanation of each
column in the full report is as follows:

==================================  ==========================
Column                              Description
==================================  ==========================
RANKNAME                            (REPLICON)  = replicon name (source + plasmid/chr)<br>(STRAIN)    = strain name<br>(SPECIES)   = species name<br>(GENUS)     = genus name<br>...
NUM_SUBRANKS                        no. of distinct subranks for the current rank<br>(E.g.  the no. of SPECIES under the current GENUS)
GPROJ_ENTRIES                       no. of genome projects (i.e. STRAINS) under this RANK NAME
LINEAR_LENGTH                       N/O_LENGTH<br>= non-overlapping length <br>= no. of non-overlapping bases covering the unique DB
UNIQUE_DB_LENGTH                    no. of unique bases for this organism
FULL_REFDB_LENGTH                   no. of bases in full reference
LINEAR_COV                          LINEAR_LENGTH / UNIQUE_DB_LENGTH
HIT_COUNT                           no. of hits recruited to genome
HIT_COUNT_PLASMID                   no. of hits recruited to plasmid
READ_COUNT                          no. of reads recruited to genome
FULL_HIT_COUNT                      no. of full-length read hits recruited to genome
TOTAL_BP_MAPPED                     sum total of all hit lengths recruited to genome<br>= hit1.length + hit2.length + ... hitX.length<br>[formerly FOLD_COV_UNIQUE_SAMPLE]
LINEAR_DOC                          linear depth-of-coverage<br>= fold coverage of sample's LINEAR_LENGTH <br>= TOTAL_BP_MAPPED / LINEAR_LENGTH<br>[formerly FOLD_COV_UNIQUE_REFDB]
UREF_DOC                            unique reference's depth-of-coverage<br>= fold coverage of reference's UNIQUE_DB_LENGTH<br>= TOTAL_BP_MAPPED / UNIQUE_DB_LENGTH
UREF_CMAX                           MAX COVERAGE OF REFDB POSSIBLE, GIVEN SAMPLE INPUT BASES<br>= Cmax = L0/l0 <br>= TOTAL_INPUT_BASES / UNIQUE_DB_LENGTH
FRAC_HITS_POSSIBLE                  HIT_COUNT / TOTAL_INPUT_READS
FRAC_BASES_POSSIBLE                 TOTAL_BP_MAPPED / TOTAL_INPUT_BASES
MEAN_HIT_LENGTH                     TOTAL_BP_MAPPED / HIT_COUNT
MEAN_LINEAR_HIT_LENGTH              LINEAR_LENGTH / HIT_COUNT
best_SUBRANK                        name of the best subrank (determined by the highest LINEAR_COV)
best_NUM_SUBRANKS                   no. of subranks supporting current "SUBRANK"<br>{SS} = no. of GI entries supporting this strain<br>{S}  = no. of strains supporting this species<br>{G}  = no. of species supporting this genus<br>{F}  = no. of genera supporting this family<br>{O}  = no. of families supporting this order <br>{C}  = no. of orders supporting this class<br>{P}  = no. of classes supporting this phylum
best_GPROJ_ENTRIES                  no. of genome projects (i.e. STRAINS) under this best_SUBRANK<br>{SS} = no. of genome projects supporting this strain = 1<br>{S}  = no. of genome projects supporting this species<br>{G}  = no. of genome projects supporting this genus<br>{F}  = no. of genome projects supporting this family<br>{O}  = no. of genome projects supporting this order<br>{C}  = no. of genome projects supporting this class<br>{P}  = no. of genome projects supporting this phylum
best_LINEAR_LENGTH
best_UNIQUE_DB_LENGTH
best_FULL_REFDB_LENGTH
best_LINEAR_COV
best_HIT_COUNT
best_FULL_HIT_COUNT
best_TOTAL_BP_MAPPED
best_LINEAR_DOC (a.k.a. Abundance)
best_UREF_DOC
best_UREF_CMAX
best_FRAC_HITS_POSSIBLE
best_FRAC_BASES_POSSIBLE
best_MEAN_HIT_LENGTH
best_MEAN_LINEAR_HIT_LENGTH
CONTIG_COUNT                        no. of contiguous fragments<br> (after mapping & generating non-overlapping fragments)
CONTIG_MEAN_LEN                     mean length of contigs (bp)
CONTIG_STDEV_LEN                    standard deviation of contig lengths (bp)
CONTIG_MINLEN                       length of smallest contig
CONTIG_MAXLEN                       length of largest contig
CONTIG_HISTOGRAM(LEN:FREQ)          contig Length Histogram<br> (in the format contigLength:frequency)
==================================  ==========================

    ]]>
    </help>

    <!-- ***************************************************************** -->

    <!-- Reference to cite when this tool is used, given as a DOI. -->
    <citations>
        <citation type="doi">10.1093/nar/gkv180</citation>
    </citations>

</tool>
author	jdv
date	Mon, 30 Jan 2017 22:00:11 -0500
parents	faff3483da31
children	1b81bee6080f