Mercurial > repos > jdv > gottcha
view gottcha.xml @ 4:957d8ea06b1b draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/gottcha commit 5d24210279e26623ae6c98f7551e3565fdc9bc48-dirty
author | jdv |
---|---|
date | Mon, 30 Jan 2017 22:00:11 -0500 |
parents | faff3483da31 |
children | 1b81bee6080f |
line wrap: on
line source
<tool id="gottcha" name="GOTTCHA" version="0.001">

    <!--
        Galaxy wrapper around gottcha.pl.

        Inputs : one FASTQ dataset plus a reference database chosen from the
                 "gottcha_indices" data table.
        Outputs: the run log, the summary table and (only when requested)
                 the full report, all collected from the job working
                 directory under the "results" prefix.
    -->

    <description>Read-based metagenome characterization</description>

    <!-- ***************************************************************** -->

    <requirements>
        <requirement type="package" version="0.7.7">bwa</requirement>
        <requirement type="package" version="1.0b-564cf3b">gottcha</requirement>
    </requirements>

    <!-- ***************************************************************** -->

    <!-- Extracts the token following "VERSION:" from the gottcha.pl usage text. -->
    <version_command>gottcha.pl -h | perl -wnE'print "$1\n" for /VERSION: (\S+)/g'</version_command>

    <!-- ***************************************************************** -->

    <!--
        Command template (Cheetah). Parameters that live inside a section are
        addressed as section.parameter. Each "##" template comment must stay on
        a line of its own, otherwise it would swallow the arguments that follow
        it on the same line.
    -->
    <command detect_errors="aggressive">
    <![CDATA[

    gottcha.pl
        --input '${fn_in}'
        --database '${db.fields.path}'
        ## GALAXY_SLOTS is a shell variable set at job run time, so it is
        ## escaped for Cheetah and wrapped in DOUBLE quotes: single quotes
        ## would stop the shell from expanding it and gottcha.pl would be
        ## handed the literal text instead of a thread count.
        --threads "\${GALAXY_SLOTS:-1}"
        --outdir './'
        --prefix results

        ##--General Options------------------------------
        --relAbu ${general.rel_abund}
        --mode ${general.output_full}
        ${general.filt_plasmid}

        ##--Split-trim Options---------------------------
        --minQ ${split.min_qual}
        --fixL ${split.fixed_len}
        --ascii ${split.qual_offset}

        ##--Filtering Options----------------------------
        --minCov ${filter.min_cov}
        --minMLHL ${filter.min_mlhl}
        --cCov ${filter.c_cov}
        --minLen ${filter.min_len}
        --minHits ${filter.min_hits}

    ]]>
    </command>

    <!-- ***************************************************************** -->

    <inputs>

        <param name="fn_in" type="data" format="fastq" label="Input reads" help="--input" />

        <!-- Reference databases come from the gottcha_indices data table;
             the "path" field of the selected row is passed to gottcha.pl. -->
        <param name="db" type="select" label="Select a reference database" help="--database">
            <options from_data_table="gottcha_indices">
                <filter type="sort_by" column="2"/>
                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
            </options>
        </param>

        <section name="general" title="General Options" expanded="true">
            <param name="rel_abund" type="select" label="Abundance field" help="--relAbu">
                <option value="LINEAR_DOC" selected="true">Linear DOC</option>
                <option value="LINEAR_LENGTH">Linear length</option>
                <option value="TOTAL_BP_MAPPED">Total bp mapped</option>
                <option value="HIT_COUNT">Hit count</option>
            </param>
            <!-- The true/false values are substituted verbatim after the mode
                 option in the command; the same parameter also gates the
                 "out_full" output below. -->
            <param name="output_full"
                   type="boolean"
                   truevalue="full"
                   falsevalue="summary"
                   checked="false"
                   label="Output full report"
                   help="--mode full" />
            <!-- Emits the complete flag when checked and nothing otherwise. -->
            <param name="filt_plasmid"
                   type="boolean"
                   truevalue="--noPlasmidHit"
                   falsevalue=""
                   checked="false"
                   label="Filter plasmid hits"
                   help="If true, ignore alignments to plasmids" />
        </section>

        <section name="split" title="Split-trim Options" expanded="false">
            <param name="min_qual"
                   size="4"
                   type="integer"
                   value="20"
                   min="0"
                   max="41"
                   label="Minimum quality"
                   help="Minimum quality for a read to be considered valid (0-41)" />
            <param name="fixed_len"
                   size="4"
                   type="integer"
                   value="30"
                   min="1"
                   label="Trim length"
                   help="Fixed length to which each read will be trimmed" />
            <param name="qual_offset" type="select" label="Quality offset" help="Base call quality offset for ASCII encoding">
                <option value="33" selected="true">33</option>
                <option value="64">64</option>
            </param>
        </section>

        <section name="filter" title="Filtering Options" expanded="false">
            <param name="min_cov"
                   size="5"
                   type="float"
                   value="0.005"
                   min="0"
                   label="Minimum coverage"
                   help="Minimum linear coverage to be considered valid in abundance calculation" />
            <param name="min_mlhl"
                   size="4"
                   type="integer"
                   value="5"
                   min="0"
                   label="Minimum MLHL"
                   help="Minimum mean-linear-hit-length to be considered valid in abundance calculations" />
            <param name="c_cov"
                   size="5"
                   type="float"
                   value="0.006"
                   min="0"
                   label="Critical coverage for MLHL"
                   help="Critical coverage below which Minimum MLHL will cause an organism to fail" />
            <param name="min_len"
                   size="4"
                   type="integer"
                   value="100"
                   min="0"
                   label="Minimum length"
                   help="Minimum unique length to be considered valid in abundance calculation" />
            <param name="min_hits"
                   size="4"
                   type="integer"
                   value="10"
                   min="0"
                   label="Minimum hits"
                   help="Minimum number of hits to be considered valid in abundance calculation" />
        </section>

    </inputs>

    <!-- ***************************************************************** -->

    <outputs>
        <data name="out_log" format="txt" label="GOTTCHA on ${on_string}: Log" from_work_dir="results.gottcha.log" />
        <data name="out_tsv" format="txt" label="GOTTCHA on ${on_string}: Summary" from_work_dir="results.gottcha.tsv" />
        <data name="out_full" format="txt" label="GOTTCHA on ${on_string}: Full Report" from_work_dir="results.gottcha_full.tsv">
            <!-- Only declare this dataset when the full report was requested.
                 The parameter lives inside the "general" section, so the
                 filter expression has to look it up through that section;
                 a bare name is undefined there, and a filter that fails to
                 evaluate does not suppress the output. -->
            <filter>general['output_full']</filter>
        </data>
    </outputs>

    <!-- ***************************************************************** -->

    <tests>
        <!-- Summary-mode run against the bundled test database; the log is
             compared by size only. -->
        <test>
            <param name="db" value="test_db" />
            <param name="fn_in" ftype="fastq" value="test.fq" />
            <param name="output_full" value="no" />
            <param name="min_hits" value="1" />
            <output name="out_tsv" file="test_02.tsv" />
            <output name="out_log" file="test_02.log" compare="sim_size" delta="2000"/>
        </test>
    </tests>

    <!-- ***************************************************************** -->

    <!-- Help is reStructuredText, so it is kept flush left: indentation and
         column alignment inside the CDATA section are significant. -->
    <help>
    <![CDATA[

**Description**

Genomic Origin Through Taxonomic CHAllenge (GOTTCHA) is an
annotation-independent and signature-based metagenomic taxonomic profiling
tool that has significantly smaller FDR than other profiling tools. This Perl
script is a wrapper to run the GOTTCHA profiling tool with pre-computed
signature databases. The procedure includes 3 major steps: split-trimming the
input data, mapping reads to a GOTTCHA database using BWA, profiling/filtering
the result.

**Options**

::

    --relAbu|r  <STRING>  The field will be used to calculate relative abundance.
                          You can specify one of the following fields:
                          "LINEAR_LENGTH", "TOTAL_BP_MAPPED", "HIT_COUNT", "LINEAR_DOC".
                          [default: LINEAR_DOC]
    --mode|m    <STRING>  You can specify one of the output mode:
                          "summary" : this mode will report a summary of profiling result
                                      to *.gottcha.tsv file.
                          "full"    : other than a summary, this mode will report unfiltered
                                      result to *.gottcha_full.tsv with more detail.
                          "all"     : other than two tables, this mode will keep all output
                                      files that were generated by each profiling step.
                          [default: summary]
    --noPlasmidHit|n      Ignore alignments that hit to plasmids [default: null]

    *** OPTIONS FOR SPLIT-TRIMMING READS ***
    --minQ      <INT>     Minimum quality for a read to be considered valid (0-41)
                          [default: 20]
    --fixL      <INT>     Fixed length to which each trimmed read will be cut down to
                          [default: 30]
    --ascii     <INT>     ASCII encoding of quality score (33 or 64) [default: 33]

    *** OPTIONS FOR FILTERING PROFILING RESULT ***
    --minCov    <FLOAT>   Minimum linear coverage to be considered valid in abundance
                          calculation [default: 0.005]
    --minMLHL   <INT>     Minimum Mean-Linear-Hit-Length to be considered valid in
                          abundance calculation [default: 5]
    --cCov      <FLOAT>   Critical coverage below which --minMLHL will cause an organism
                          to fail [default: 0.006]
    --minLen    <INT>     Minimum unique length to be considered valid in abundance
                          calculation [default: 100]
    --minHits   <INT>     Minimum number of hits to be considered valid in abundance
                          calculation [10]

**Interpreting Results**

GOTTCHA reports profiling results in a neat summary table by default. The tsv
file will list the organism(s) at all taxonomic levels from STRAIN to PHYLUM,
their linear length, total bases mapped, linear depth of coverage, and the
normalized linear depth of coverage. The linear depth of coverage (LINEAR_DOC)
is used to calculate relative abundance of each organism or taxonomic name in
the sample.

Summary table:

================= ==============================
Column            Description
================= ==============================
LEVEL             taxonomic rank
NAME              taxonomic name
REL_ABUNDANCE     relative abundance (equivalent to NORM_COV by default)
LINEAR_LENGTH     number of non-overlapping bases covering the signatures
TOTAL_BP_MAPPED   sum total of all hit lengths recruited to signatures
HIT_COUNT         number of hits recruited to signatures
HIT_COUNT_PLASMID number of hits recruited to plasmids
READ_COUNT        number of reads recruited to signatures
LINEAR_DOC        linear depth-of-coverage (TOTAL_BP_MAPPED / LINEAR_LENGTH)
NORM_COV          normalized linear depth-of-coverage (LINEAR_DOC / SUM(LINEAR_DOC in certain level))
================= ==============================

Other than a summary table, "full" report mode will report a table with more
detail information from unfiltered results. The explanation of each column in
the full report is as follows:

================================== ==========================
Column                             Description
================================== ==========================
RANKNAME                           (REPLICON) = replicon name (source + plasmid/chr)<br>(STRAIN) = strain name<br>(SPECIES) = species name<br>(GENUS) = genus name<br>...
NUM_SUBRANKS                       no. of distinct subranks for the current rank<br>(E.g. the no. of SPECIES under the current GENUS)
GPROJ_ENTRIES                      no. of genome projects (i.e. STRAINS) under this RANK NAME
LINEAR_LENGTH                      N/O_LENGTH<br>= non-overlapping length <br>= no. of non-overlapping bases covering the unique DB
UNIQUE_DB_LENGTH                   no. of unique bases for this organism
FULL_REFDB_LENGTH                  no. of bases in full reference
LINEAR_COV                         LINEAR_LENGTH / UNIQUE_DB_LENGTH
HIT_COUNT                          no. of hits recruited to genome
HIT_COUNT_PLASMID                  no. of hits recruited to plasmid
READ_COUNT                         no. of reads recruited to genome
FULL_HIT_COUNT                     no. of full-length read hits recruited to genome
TOTAL_BP_MAPPED                    sum total of all hit lengths recruited to genome<br>= hit1.length + hit2.length + ... hitX.length<br>[formerly FOLD_COV_UNIQUE_SAMPLE]
LINEAR_DOC                         linear depth-of-coverage<br>= fold coverage of sample's LINEAR_LENGTH <br>= TOTAL_BP_MAPPED / LINEAR_LENGTH<br>[formerly FOLD_COV_UNIQUE_REFDB]
UREF_DOC                           unique reference's depth-of-coverage<br>= fold coverage of reference's UNIQUE_DB_LENGTH<br>= TOTAL_BP_MAPPED / UNIQUE_DB_LENGTH
UREF_CMAX                          MAX COVERAGE OF REFDB POSSIBLE, GIVEN SAMPLE INPUT BASES<br>= Cmax = L0/l0 <br>= TOTAL_INPUT_BASES / UNIQUE_DB_LENGTH
FRAC_HITS_POSSIBLE                 HIT_COUNT / TOTAL_INPUT_READS
FRAC_BASES_POSSIBLE                TOTAL_BP_MAPPED / TOTAL_INPUT_BASES
MEAN_HIT_LENGTH                    TOTAL_BP_MAPPED / HIT_COUNT
MEAN_LINEAR_HIT_LENGTH             LINEAR_LENGTH / HIT_COUNT
best_SUBRANK                       name of the best subrank (determined by the highest LINEAR_COV)
best_NUM_SUBRANKS                  no. of subranks supporting current "SUBRANK"<br>{SS} = no. of GI entries supporting this strain<br>{S} = no. of strains supporting this species<br>{G} = no. of species supporting this genus<br>{F} = no. of genera supporting this family<br>{O} = no. of families supporting this order <br>{C} = no. of orders supporting this class<br>{P} = no. of classes supporting this phylum
best_GPROJ_ENTRIES                 no. of genome projects (i.e. STRAINS) under this best_SUBRANK<br>{SS} = no. of genome projects supporting this strain = 1<br>{S} = no. of genome projects supporting this species<br>{G} = no. of genome projects supporting this genus<br>{F} = no. of genome projects supporting this family<br>{O} = no. of genome projects supporting this order<br>{C} = no. of genome projects supporting this class<br>{P} = no. of genome projects supporting this phylum
best_LINEAR_LENGTH
best_UNIQUE_DB_LENGTH
best_FULL_REFDB_LENGTH
best_LINEAR_COV
best_HIT_COUNT
best_FULL_HIT_COUNT
best_TOTAL_BP_MAPPED
best_LINEAR_DOC (a.k.a. Abundance)
best_UREF_DOC
best_UREF_CMAX
best_FRAC_HITS_POSSIBLE
best_FRAC_BASES_POSSIBLE
best_MEAN_HIT_LENGTH
best_MEAN_LINEAR_HIT_LENGTH
CONTIG_COUNT                       no. of contiguous fragments<br> (after mapping & generating non-overlapping fragments)
CONTIG_MEAN_LEN                    mean length of contigs (bp)
CONTIG_STDEV_LEN                   standard deviation of contig lengths (bp)
CONTIG_MINLEN                      length of smallest contig
CONTIG_MAXLEN                      length of largest contig
CONTIG_HISTOGRAM(LEN:FREQ)         contig Length Histogram<br> (in the format contigLength:frequency)
================================== ==========================

    ]]>
    </help>

    <!-- ***************************************************************** -->

    <citations>
        <citation type="doi">10.1093/nar/gkv180</citation>
    </citations>

</tool>