Mercurial > repos > jdv > gottcha
diff gottcha.xml @ 0:2569a83977f5 draft
planemo upload for repository https://github.com/jvolkening/galaxy-tools/tree/master/tools/gottcha commit 5d24210279e26623ae6c98f7551e3565fdc9bc48
author | jdv |
---|---|
date | Mon, 30 Jan 2017 19:07:21 -0500 |
parents | |
children | 87efdde6105f |
line wrap: on
line diff
<tool id="gottcha" name="GOTTCHA" version="0.0.1">

    <!--
        Galaxy wrapper around gottcha.pl. It runs the script on one FASTQ
        dataset against a database picked from the gottcha_indices data
        table, then collects the log, the summary table and (when requested)
        the full report from the job working directory.
    -->

    <description>Read-based metagenome characterization</description>

    <!-- ***************************************************************** -->
    <!-- Declared package dependency. -->

    <requirements>
        <requirement type="package" version="1.0b-564cf3b">gottcha</requirement>
    </requirements>

    <!-- ***************************************************************** -->
    <!-- Extracts the token following "VERSION:" from the gottcha.pl help text. -->

    <version_command>gottcha.pl -h | perl -wnE'print "$1\n" for /VERSION: (\S+)/g'</version_command>

    <!-- ***************************************************************** -->
    <!-- Command template (Cheetah). Lines starting with ## are template comments. -->

    <command detect_errors="aggressive">
    <![CDATA[

    gottcha.pl

        --input '${fn_in}'
        --database '${db.fields.path}'
        ## The backslash keeps the dollar sign away from the template engine so
        ## that the shell sees the GALAXY_SLOTS expansion (default 1). It must be
        ## double-quoted: inside single quotes the shell would not expand it and
        ## gottcha.pl would receive the literal text instead of a number.
        --threads "\${GALAXY_SLOTS:-1}"
        --outdir './'
        --prefix results

        ##--General Options------------------------------

        --relAbu ${general.rel_abund}
        --mode ${general.output_full}
        ${general.filt_plasmid}

        ##--Split-trim Options---------------------------

        --minQ ${split.min_qual}
        --fixL ${split.fixed_len}
        --ascii ${split.qual_offset}

        ##--Filtering Options----------------------------

        --minCov ${filter.min_cov}
        --minMLHL ${filter.min_mlhl}
        --cCov ${filter.c_cov}
        --minLen ${filter.min_len}
        --minHits ${filter.min_hits}

    ]]>
    </command>

    <!-- ***************************************************************** -->
    <!-- Inputs: reads, reference database, then three option sections whose
         names (general, split, filter) are the prefixes used in the command. -->

    <inputs>

        <param name="fn_in" type="data" format="fastq" label="Input reads" help="--input" />
        <param name="db" type="select" label="Select a reference database" help="--database">
            <options from_data_table="gottcha_indices">
                <filter type="sort_by" column="2"/>
                <validator type="no_options" message="No indexes are available for the selected input dataset"/>
            </options>
        </param>

        <section name="general" title="General Options" expanded="True">
            <param name="rel_abund" type="select" label="Abundance field" help="--relAbu">
                <option value="LINEAR_DOC" selected="true">Linear DOC</option>
                <option value="LINEAR_LENGTH">Linear length</option>
                <option value="TOTAL_BP_MAPPED">Total bp mapped</option>
                <option value="HIT_COUNT">Hit count</option>
            </param>
            <!-- The boolean maps directly onto the value passed to the mode option. -->
            <param name="output_full" type="boolean" truevalue="full" falsevalue="summary" checked="no" label="Output full report" help="--mode full" />
            <!-- Emits the flag itself when checked, nothing otherwise. -->
            <param name="filt_plasmid" type="boolean" truevalue="--noPlasmidHit" falsevalue="" checked="no" label="Filter plasmid hits" help="If true, ignore alignments to plasmids" />
        </section>

        <section name="split" title="Split-trim Options" expanded="False">
            <param name="min_qual" size="4" type="integer" value="20" min="0" max="41" label="Minimum quality" help="Minimum quality for a read to be considered valid (0-41)" />
            <param name="fixed_len" size="4" type="integer" value="30" min="1" label="Trim length" help="Fixed length to which each read will be trimmed" />
            <param name="qual_offset" type="select" label="Quality offset" help="Base call quality offset for ASCII encoding">
                <option value="33" selected="true">33</option>
                <option value="64">64</option>
            </param>
        </section>

        <section name="filter" title="Filtering Options" expanded="False">
            <param name="min_cov" size="5" type="float" value="0.005" min="0" label="Minimum coverage" help="Minimum linear coverage to be considered valid in abundance calculation" />
            <param name="min_mlhl" size="4" type="integer" value="5" min="0" label="Minimum MLHL" help="Minimum mean-linear-hit-length to be considered valid in abundance calculations" />
            <param name="c_cov" size="5" type="float" value="0.006" min="0" label="Critical coverage for MLHL" help="Critical coverage below which Minimum MLHL will cause an organism to fail" />
            <param name="min_len" size="4" type="integer" value="100" min="0" label="Minimum length" help="Minimum unique length to be considered valid in abundance calculation" />
            <param name="min_hits" size="4" type="integer" value="10" min="0" label="Minimum hits" help="Minimum number of hits to be considered valid in abundance calculation" />
        </section>

    </inputs>

    <!-- ***************************************************************** -->
    <!-- Outputs are picked up from the working directory; the file names
         start with the prefix given to gottcha.pl in the command above. -->

    <outputs>

        <data name="out_log" format="txt" label="GOTTCHA on ${on_string}: Log" from_work_dir="results.gottcha.log" />
        <data name="out_tsv" format="txt" label="GOTTCHA on ${on_string}: Summary" from_work_dir="results.gottcha.tsv" />
        <data name="out_full" format="txt" label="GOTTCHA on ${on_string}: Full Report" from_work_dir="results.gottcha_full.tsv">
            <!-- output_full is declared inside the "general" section, so the
                 filter expression has to reach it through that section; the
                 bare name is not defined at the top level of the parameters. -->
            <filter>general['output_full']</filter>
        </data>

    </outputs>

    <!-- ***************************************************************** -->
    <!-- Single test: summary mode (output_full = no) with min_hits lowered
         to 1; the log is compared by size only (sim_size, delta 2000). -->

    <tests>
        <test>
            <param name="db" value="test_db" />
            <param name="fn_in" ftype="fastq" value="test.fq" />
            <param name="output_full" value="no" />
            <param name="min_hits" value="1" />
            <output name="out_tsv" file="test_02.tsv" />
            <output name="out_log" file="test_02.log" compare="sim_size" delta="2000"/>
        </test>
    </tests>

    <!-- ***************************************************************** -->
    <!-- Help text is reStructuredText; column alignment in the literal block
         and in the tables below is significant. -->

    <help>
    <![CDATA[

.. class:: infomark

Description
--------------------

Genomic Origin Through Taxonomic CHAllenge (GOTTCHA) is an
annotation-independent and signature-based metagenomic taxonomic profiling
tool that has significantly smaller FDR than other profiling tools. This Perl
script is a wrapper to run the GOTTCHA profiling tool with pre-computed
signature databases. The procedure includes 3 major steps: split-trimming the
input data, mapping reads to a GOTTCHA database using BWA, profiling/filtering
the result.

Options
--------------------
::

    --relAbu|r <STRING> The field will be used to calculate relative
                        abundance. You can specify one of the following
                        fields: "LINEAR_LENGTH", "TOTAL_BP_MAPPED",
                        "HIT_COUNT", "LINEAR_DOC".
                        [default: LINEAR_DOC]
    --mode|m <STRING>   You can specify one of the output mode:
                        "summary" : this mode will report a summary of
                                    profiling result to *.gottcha.tsv file.
                        "full"    : other than a summary, this mode will
                                    report unfiltered result to
                                    *.gottcha_full.tsv with more detail.
                        "all"     : other than two tables, this mode will
                                    keep all output files that were
                                    generated by each profiling step.
                        [default: summary]
    --noPlasmidHit|n    Ignore alignments that hit to plasmids
                        [default: null]

    *** OPTIONS FOR SPLIT-TRIMMING READS ***

    --minQ <INT>        Minimum quality for a read to be considered valid
                        (0-41) [default: 20]
    --fixL <INT>        Fixed length to which each trimmed read will be cut
                        down to [default: 30]
    --ascii <INT>       ASCII encoding of quality score (33 or 64) [default:
                        33]

    *** OPTIONS FOR FILTERING PROFILING RESULT ***

    --minCov <FLOAT>    Minimum linear coverage to be considered valid in
                        abundance calculation [default: 0.005]
    --minMLHL <INT>     Minimum Mean-Linear-Hit-Length to be considered valid
                        in abundance calculation [default: 5]
    --cCov <FLOAT>      Critical coverage below which --minMLHL will cause an
                        organism to fail [default: 0.006]
    --minLen <INT>      Minimum unique length to be considered valid in
                        abundance calculation [default: 100]
    --minHits <INT>     Minimum number of hits to be considered valid in
                        abundance calculation [10]

Interpreting Results
--------------------

GOTTCHA reports profiling results in a neat summary table
by default. The tsv file will list the organism(s) at all taxonomic
levels from STRAIN to PHYLUM, their linear length, total bases mapped,
linear depth of coverage, and the normalized linear depth of coverage. The
linear depth of coverage (LINEAR_DOC) is used to calculate relative
abundance of each organism or taxonomic name in the sample.

Summary table:

================= ==============================
Column            Description
================= ==============================
LEVEL             taxonomic rank
NAME              taxonomic name
REL_ABUNDANCE     relative abundance (equivalent to NORM_COV by default)
LINEAR_LENGTH     number of non-overlapping bases covering the signatures
TOTAL_BP_MAPPED   sum total of all hit lengths recruited to signatures
HIT_COUNT         number of hits recruited to signatures
HIT_COUNT_PLASMID number of hits recruited to plasmid signatures
READ_COUNT        number of reads recruited to signatures
LINEAR_DOC        linear depth-of-coverage (TOTAL_BP_MAPPED / LINEAR_LENGTH)
NORM_COV          normalized linear depth-of-coverage (LINEAR_DOC / SUM(LINEAR_DOC in certain level))
================= ==============================

Other than a summary table, "full" report mode will report a table with more
detail information from unfiltered results. The explanation of each column in
the full report is as follows:

================================== ==========================
Column                             Description
================================== ==========================
RANKNAME                           (REPLICON) = replicon name (source + plasmid/chr)<br>(STRAIN) = strain name<br>(SPECIES) = species name<br>(GENUS) = genus name<br>...
NUM_SUBRANKS                       no. of distinct subranks for the current rank<br>(E.g. the no. of SPECIES under the current GENUS)
GPROJ_ENTRIES                      no. of genome projects (i.e. STRAINS) under this RANK NAME
LINEAR_LENGTH                      N/O_LENGTH<br>= non-overlapping length <br>= no. of non-overlapping bases covering the unique DB
UNIQUE_DB_LENGTH                   no. of unique bases for this organism
FULL_REFDB_LENGTH                  no. of bases in full reference
LINEAR_COV                         LINEAR_LENGTH / UNIQUE_DB_LENGTH
HIT_COUNT                          no. of hits recruited to genome
HIT_COUNT_PLASMID                  no. of hits recruited to plasmid
READ_COUNT                         no. of reads recruited to genome
FULL_HIT_COUNT                     no. of full-length read hits recruited to genome
TOTAL_BP_MAPPED                    sum total of all hit lengths recruited to genome<br>= hit1.length + hit2.length + ... hitX.length<br>[formerly FOLD_COV_UNIQUE_SAMPLE]
LINEAR_DOC                         linear depth-of-coverage<br>= fold coverage of sample's LINEAR_LENGTH <br>= TOTAL_BP_MAPPED / LINEAR_LENGTH<br>[formerly FOLD_COV_UNIQUE_REFDB]
UREF_DOC                           unique reference's depth-of-coverage<br>= fold coverage of reference's UNIQUE_DB_LENGTH<br>= TOTAL_BP_MAPPED / UNIQUE_DB_LENGTH
UREF_CMAX                          MAX COVERAGE OF REFDB POSSIBLE, GIVEN SAMPLE INPUT BASES<br>= Cmax = L0/l0 <br>= TOTAL_INPUT_BASES / UNIQUE_DB_LENGTH
FRAC_HITS_POSSIBLE                 HIT_COUNT / TOTAL_INPUT_READS
FRAC_BASES_POSSIBLE                TOTAL_BP_MAPPED / TOTAL_INPUT_BASES
MEAN_HIT_LENGTH                    TOTAL_BP_MAPPED / HIT_COUNT
MEAN_LINEAR_HIT_LENGTH             LINEAR_LENGTH / HIT_COUNT
best_SUBRANK                       name of the best subrank (determined by the highest LINEAR_COV)
best_NUM_SUBRANKS                  no. of subranks supporting current "SUBRANK"<br>{SS} = no. of GI entries supporting this strain<br>{S} = no. of strains supporting this species<br>{G} = no. of species supporting this genus<br>{F} = no. of genera supporting this family<br>{O} = no. of families supporting this order <br>{C} = no. of orders supporting this class<br>{P} = no. of classes supporting this phylum
best_GPROJ_ENTRIES                 no. of genome projects (i.e. STRAINS) under this best_SUBRANK<br>{SS} = no. of genome projects supporting this strain = 1<br>{S} = no. of genome projects supporting this species<br>{G} = no. of genome projects supporting this genus<br>{F} = no. of genome projects supporting this family<br>{O} = no. of genome projects supporting this order<br>{C} = no. of genome projects supporting this class<br>{P} = no. of genome projects supporting this phylum
best_LINEAR_LENGTH
best_UNIQUE_DB_LENGTH
best_FULL_REFDB_LENGTH
best_LINEAR_COV
best_HIT_COUNT
best_FULL_HIT_COUNT
best_TOTAL_BP_MAPPED
best_LINEAR_DOC                    (a.k.a. Abundance)
best_UREF_DOC
best_UREF_CMAX
best_FRAC_HITS_POSSIBLE
best_FRAC_BASES_POSSIBLE
best_MEAN_HIT_LENGTH
best_MEAN_LINEAR_HIT_LENGTH
CONTIG_COUNT                       no. of contiguous fragments<br> (after mapping & generating non-overlapping fragments)
CONTIG_MEAN_LEN                    mean length of contigs (bp)
CONTIG_STDEV_LEN                   standard deviation of contig lengths (bp)
CONTIG_MINLEN                      length of smallest contig
CONTIG_MAXLEN                      length of largest contig
CONTIG_HISTOGRAM(LEN:FREQ)         contig Length Histogram<br> (in the format contigLength:frequency)
================================== ==========================

    ]]>
    </help>

    <!-- ***************************************************************** -->

    <citations>
        <citation type="doi">10.1093/nar/gkv180</citation>
    </citations>

</tool>