drep_dereplicate: macros.xml comparison

comparison macros.xml @ 0:aba9d1e647b6 draft

"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 8fa5ff35b45c2b046c7f4800410cf39cb89a299a"

author	iuc
date	Tue, 05 May 2020 09:53:33 +0000
parents
children	e9621d0f4e6b

comparison

equal deleted inserted replaced

--1:000000000000
+:aba9d1e647b6
+<macros>
+<!-- Version string substituted wherever @VERSION@ appears; the "requirements" macro uses it to pin the drep package. -->
+<token name="@VERSION@">2.5.4</token>
+<!-- Declares the drep package dependency pinned to @VERSION@; callers can append further requirements through the yield. -->
+<xml name="requirements">
+    <requirements>
+        <requirement version="@VERSION@" type="package">drep</requirement>
+        <yield />
+    </requirements>
+</xml>
+<!-- Citation list carrying one DOI entry; additional citations can be injected through the yield. -->
+<xml name="citations">
+    <citations>
+        <citation type="doi">10.1038/ismej.2017.126</citation>
+        <yield/>
+    </citations>
+</xml>
+<!-- Input accepting several FASTA datasets at once, exposed as the genomes argument. -->
+<xml name="genomes">
+    <param argument="--genomes"
+           type="data"
+           format="fasta"
+           multiple="true"
+           label="genomes fasta files"/>
+</xml>
+<!--
+Command fragment that symlinks every selected genome into the working directory.
+The link name is the last "/"-separated component of the element identifier with
+every character that is not a word character, hyphen, underscore or dot replaced by "_".
+The names are collected in $genomefiles, which the @GENOMES@ token reads.
+Each emitted "ln -s" line ends with "&&", so another command must follow this fragment.
+The list append is wrapped in #silent so that its return value is never rendered
+into the command line.
+-->
+<token name="@PREPARE_GENOMES@"><![CDATA[
+#import re
+#set $genomefiles = []
+#for $genome in $genomes
+#set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1]))
+ln -s '${genome}' '${input_name}' &&
+#silent $genomefiles.append($input_name)
+#end for
+]]></token>
+<!--
+Command fragment emitting the -g flag followed by every entry of $genomefiles, each in single quotes.
+$genomefiles is not defined here; the @PREPARE_GENOMES@ token in this file builds it, so that token
+has to be expanded earlier in the same command.
+-->
+<token name="@GENOMES@"><![CDATA[
+-g
+#for $genomefile in $genomefiles
+'${genomefile}'
+#end for
+]]></token>
+<!-- Stand-alone optional selector for the checkM workflow; neither option is preselected. -->
+<xml name="checkm_method">
+    <param argument="--checkM_method" type="select" optional="true" label="checkm method">
+        <option value="taxonomy_wf">taxonomy_wf (faster)</option>
+        <option value="lineage_wf">lineage_wf (more accurate)</option>
+    </param>
+</xml>
+<!-- Command fragment that passes the checkM_method flag only when $checkM_method evaluates as true, i.e. when a value was chosen. -->
+<token name="@CHECKM_METHOD@"><![CDATA[
+#if $checkM_method:
+--checkM_method $checkM_method
+#end if
+]]></token>
+<!--
+Filtering section. Selecting "yes" exposes the length, completeness and contamination
+thresholds plus a nested "quality" conditional that decides where genome quality comes
+from: a checkM run, a user supplied genomeInfo CSV, or no quality filtering at all.
+The "source" selector is a wrapper-internal switch rather than a command line flag,
+so it is declared with name= instead of argument=.
+-->
+<xml name="filtering_options">
+    <conditional name="filter">
+        <param name="set_options" type="select" label="set filtering options">
+            <option value="yes">Yes</option>
+            <option value="no" selected="true">No (use --checkM_method taxonomy_wf)</option>
+        </param>
+        <when value="yes">
+            <param argument="--length" type="integer" value="50000" label="Minimum genome length"/>
+            <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/>
+            <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/>
+            <conditional name="quality">
+                <param name="source" type="select" label="genome quality">
+                    <help>
+                        --ignoreGenomeQuality is useful with
+                        bacteriophages or eukaryotes or things where checkM
+                        scoring does not work. Will only choose genomes based
+                        on length and N50.
+                    </help>
+                    <option value="checkm" selected="true">Run checkM</option>
+                    <option value="genomeInfo">User supplied genomeInfo csv file</option>
+                    <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option>
+                </param>
+                <when value="checkm">
+                    <param argument="--checkM_method" type="select" label="checkm method" optional="true">
+                        <help>
+                            Using the checkm method of lineage_wf can require more than 40Gb of RAM.
+                        </help>
+                        <option value="taxonomy_wf">taxonomy_wf (faster)</option>
+                        <option value="lineage_wf">lineage_wf (more accurate)</option>
+                    </param>
+                </when>
+                <when value="genomeInfo">
+                    <param argument="--genomeInfo" type="data" format="csv" label="genome information csv file">
+                        <help><![CDATA[
+                            A CSV dataset that must contain: [
+                            "genome"(history dataset name of .fasta dataset of that genome),
+                            "completeness"(0-100 value for completeness of the genome),
+                            "contamination"(0-100 value of the contamination of the genome)]
+                        ]]></help>
+                    </param>
+                </when>
+                <when value="ignoreGenomeQuality"/>
+            </conditional>
+        </when>
+        <when value="no"/>
+    </conditional>
+</xml>
+<!--
+Command fragment for the "filter" conditional.
+With set_options == 'yes' it passes the length, completeness and contamination thresholds
+and then one of: the chosen checkM method, the genomeInfo dataset, or the ignoreGenomeQuality switch.
+The checkM method selector is optional without a preselected option, so the flag is written
+only when a value was actually chosen; otherwise it would be rendered with the text "None".
+The genomeInfo dataset path is single-quoted like the other paths in this file.
+With set_options != 'yes' the checkM method is fixed to taxonomy_wf.
+-->
+<token name="@FILTER_OPTIONS@"><![CDATA[
+#if $filter.set_options == 'yes':
+--length $filter.length
+--completeness $filter.completeness
+--contamination $filter.contamination
+#if $filter.quality.source == 'checkm'
+#if $filter.quality.checkM_method
+--checkM_method $filter.quality.checkM_method
+#end if
+#elif $filter.quality.source == 'genomeInfo'
+--genomeInfo '$filter.quality.genomeInfo'
+#elif $filter.quality.source == 'ignoreGenomeQuality'
+--ignoreGenomeQuality
+#end if
+#else
+--checkM_method taxonomy_wf
+#end if
+]]></token>
+<!--
+Genome comparison section. Selecting "yes" exposes the MASH sketch size, the secondary
+clustering algorithm and the nucmer preset; "no" adds no parameters.
+-->
+<xml name="genome_comparison_options">
+    <conditional name="genome_comparison">
+        <param name="set_options" type="select" label="set genome comparison options">
+            <option value="yes">Yes</option>
+            <option value="no" selected="true">No</option>
+        </param>
+        <when value="yes">
+            <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/>
+            <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons">
+                <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option>
+                <option value="ANIn">ANIn  = Align whole genomes with nucmer; compare aligned regions</option>
+                <option value="gANI">gANI  = Identify and align ORFs; compare aligned ORFS</option>
+            </param>
+            <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer">
+                <option value="normal" selected="true">normal  = default ANIn parameters (default: normal)</option>
+                <option value="tight">tight   = only align highly conserved regions</option>
+            </param>
+        </when>
+        <when value="no"/>
+    </conditional>
+</xml>
+<!-- Command fragment that passes MASH_sketch, S_algorithm and n_PRESET only when the genome_comparison conditional is set to 'yes'. -->
+<token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[
+#if $genome_comparison.set_options == 'yes':
+--MASH_sketch $genome_comparison.MASH_sketch
+--S_algorithm $genome_comparison.S_algorithm
+-n_PRESET $genome_comparison.n_PRESET
+#end if
+]]></token>
+<!--
+Clustering section. Selecting "yes" exposes the primary and secondary ANI thresholds,
+switches to skip either clustering stage, the coverage threshold and method, and the
+linkage algorithm (only "average" is offered); "no" adds no parameters.
+-->
+<xml name="clustering_options">
+    <conditional name="clustering">
+        <param name="set_options" type="select" label="set clustering options">
+            <option value="yes">Yes</option>
+            <option value="no" selected="true">No</option>
+        </param>
+        <when value="yes">
+            <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/>
+            <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/>
+            <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/>
+            <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/>
+            <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/>
+            <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment">
+                <help>(for ANIn/ANImf only; gANI can only do larger method)</help>
+                <option value="larger" selected="true">larger  = max((aligned length / genome 1), (aligned_length / genome2))</option>
+                <option value="total">total   = 2*(aligned length) / (sum of total genome lengths)</option>
+            </param>
+            <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes">
+                <help>(passed to  scipy.cluster.hierarchy.linkage)</help>
+                <option value="average" selected="true">average</option>
+            </param>
+        </when>
+        <when value="no"/>
+    </conditional>
+</xml>
+<!--
+Command fragment for the clustering conditional, emitted only when set_options is 'yes'.
+$clustering.SkipMash and $clustering.SkipSecondary appear without a flag prefix because the
+boolean params in the clustering_options macro define the flag itself as their truevalue
+and an empty string as their falsevalue.
+-->
+<token name="@CLUSTERING_OPTIONS@"><![CDATA[
+#if $clustering.set_options == 'yes':
+--P_ani $clustering.P_ani
+--S_ani $clustering.S_ani
+$clustering.SkipMash
+$clustering.SkipSecondary
+--cov_thresh $clustering.cov_thresh
+--coverage_method $clustering.coverage_method
+--clusterAlg $clustering.clusterAlg
+#end if
+]]></token>
+<!--
+Scoring section. Selecting "yes" exposes the five weights (A to E) of the scoring formula
+quoted in the help of the first parameter; "no" adds no parameters.
+-->
+<xml name="scoring_options">
+<conditional name="scoring">
+<param name="set_options" type="select" label="set scoring options">
+<option value="yes">Yes</option>
+<option value="no" selected="true">No</option>
+</param>
+<when value="yes">
+<param argument="--completeness_weight" type="float" value="1" label="completeness weight">
+<help>
+Based off of the formula:
+A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
+A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight;
+</help>
+</param>
+<param argument="--contamination_weight" type="float" value="5" label="contamination weight"/>
+<param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/>
+<param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/>
+<param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/>
+</when>
+<when value="no"/>
+</conditional>
+</xml>
+<!-- Command fragment that passes the five scoring weights only when the scoring conditional is set to 'yes'. -->
+<token name="@SCORING_OPTIONS@"><![CDATA[
+#if $scoring.set_options == 'yes':
+--completeness_weight $scoring.completeness_weight
+--contamination_weight $scoring.contamination_weight
+--strain_heterogeneity_weight $scoring.strain_heterogeneity_weight
+--N50_weight $scoring.N50_weight
+--size_weight $scoring.size_weight
+#end if
+]]></token>
+<!--
+Taxonomy section. Selecting "yes" exposes the taxonomy method, the minimum percent used by
+the "percent" method and the centrifuge index input; "no" adds no parameters.
+The tax_method selector carries no help text: the sentence previously attached to it
+described the coverage method of the clustering section and did not apply here.
+NOTE(review): cent_index declares an empty format attribute; confirm which Galaxy datatype
+a centrifuge index should be restricted to.
+-->
+<xml name="taxonomy_options">
+    <conditional name="taxonomy">
+        <param name="set_options" type="select" label="generate taxonomy information">
+            <option value="yes">Yes</option>
+            <option value="no" selected="true">No</option>
+        </param>
+        <when value="yes">
+            <param argument="--tax_method" type="select" label="Method of determining taxonomy">
+                <option value="percent" selected="true">percent = The most descriptive taxonomic level with at least (per) hits</option>
+                <option value="max">max = The centrifuge taxonomic level with the most overall hits</option>
+            </param>
+            <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/>
+            <param argument="--cent_index" type="data" format="" label="centrifuge index"/>
+        </when>
+        <when value="no"/>
+    </conditional>
+</xml>
+<!-- Command fragment that switches on run_tax and passes tax_method, percent and cent_index only when the taxonomy conditional is set to 'yes'. -->
+<token name="@TAXONOMY_OPTIONS@"><![CDATA[
+#if $taxonomy.set_options == 'yes':
+--run_tax
+--tax_method $taxonomy.tax_method
+--percent $taxonomy.percent
+--cent_index $taxonomy.cent_index
+#end if
+]]></token>
+<!-- Warning section. Selecting "yes" exposes three thresholds, each limited to the range 0 to 1; "no" adds no parameters. -->
+<xml name="warning_options">
+    <conditional name="warning">
+        <param name="set_options" type="select" label="set warning options">
+            <option value="yes">Yes</option>
+            <option selected="true" value="no">No</option>
+        </param>
+        <when value="yes">
+            <param argument="--warn_dist"
+                   type="float" value="0.25" min="0" max="1"
+                   label="How far from the threshold to throw cluster warnings"/>
+            <param argument="--warn_sim"
+                   type="float" value="0.98" min="0" max="1"
+                   label="Similarity threshold for warnings between dereplicated genomes"/>
+            <param argument="--warn_aln"
+                   type="float" value="0.25" min="0" max="1"
+                   label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/>
+        </when>
+        <when value="no"/>
+    </conditional>
+</xml>
+<!-- Command fragment that passes warn_dist, warn_sim and warn_aln only when the warning conditional is set to 'yes'. -->
+<token name="@WARNING_OPTIONS@"><![CDATA[
+#if $warning.set_options == 'yes':
+--warn_dist $warning.warn_dist
+--warn_sim $warning.warn_sim
+--warn_aln $warning.warn_aln
+#end if
+]]></token>
+<!--
+Mandatory multi-select listing the outputs to keep. Log, warnings, the primary dendrogram
+and the scatterplots are preselected; further options can be appended through the yield.
+-->
+<xml name="select_outputs">
+    <param name="select_outputs"
+           type="select"
+           multiple="true"
+           optional="false"
+           label="Select outputs">
+        <option selected="true" value="log">log</option>
+        <option selected="true" value="warnings">Warnings</option>
+        <option selected="true" value="Primary_clustering_dendrogram">Primary_clustering_dendrogram.pdf</option>
+        <option value="Secondary_clustering_dendrograms">Secondary_clustering_dendrograms.pdf</option>
+        <option value="Secondary_clustering_MDS">Secondary_clustering_MDS.pdf</option>
+        <option selected="true" value="Clustering_scatterplots">Clustering_scatterplots.pdf</option>
+        <yield />
+    </param>
+</xml>
+<!-- Expands "select_outputs" and adds four further output choices, none of them preselected. -->
+<xml name="select_drep_outputs">
+    <expand macro="select_outputs">
+        <option value="Cluster_scoring">Cluster_scoring.pdf</option>
+        <option value="Winning_genomes">Winning_genomes.pdf</option>
+        <option value="Widb">Widb.csv</option>
+        <option value="Chdb">Chdb.tsv</option>
+    </expand>
+</xml>
+<!--
+Output datasets collected from fixed paths below outdir/. Each dataset is produced only
+when its name is part of select_outputs; the log is additionally produced when
+select_outputs is empty.
+-->
+<xml name="common_outputs">
+    <data name="log"
+          format="txt"
+          from_work_dir="outdir/log/logger.log"
+          label="${tool.name} on ${on_string}: Log">
+        <filter>'log' in select_outputs or not select_outputs</filter>
+    </data>
+    <data name="warnings"
+          format="txt"
+          from_work_dir="outdir/log/warnings.txt"
+          label="${tool.name} on ${on_string}: Warnings">
+        <filter>'warnings' in select_outputs</filter>
+    </data>
+    <data name="Primary_clustering_dendrogram"
+          format="pdf"
+          from_work_dir="outdir/figures/Primary_clustering_dendrogram.pdf"
+          label="${tool.name} on ${on_string}: Primary_clustering_dendrogram.pdf">
+        <filter>'Primary_clustering_dendrogram' in select_outputs</filter>
+    </data>
+    <data name="Secondary_clustering_dendrograms"
+          format="pdf"
+          from_work_dir="outdir/figures/Secondary_clustering_dendrograms.pdf"
+          label="${tool.name} on ${on_string}: Secondary_clustering_dendrograms.pdf">
+        <filter>'Secondary_clustering_dendrograms' in select_outputs</filter>
+    </data>
+    <data name="Secondary_clustering_MDS"
+          format="pdf"
+          from_work_dir="outdir/figures/Secondary_clustering_MDS.pdf"
+          label="${tool.name} on ${on_string}: Secondary_clustering_MDS.pdf">
+        <filter>'Secondary_clustering_MDS' in select_outputs</filter>
+    </data>
+    <data name="Clustering_scatterplots"
+          format="pdf"
+          from_work_dir="outdir/figures/Clustering_scatterplots.pdf"
+          label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf">
+        <filter>'Clustering_scatterplots' in select_outputs</filter>
+    </data>
+</xml>
+<!--
+Expands "common_outputs" and adds four further datasets (two figures, Widb.csv and Chdb.tsv),
+each produced only when its name is part of select_outputs.
+-->
+<xml name="drep_outputs">
+    <expand macro="common_outputs"/>
+    <data name="Cluster_scoring"
+          format="pdf"
+          from_work_dir="outdir/figures/Cluster_scoring.pdf"
+          label="${tool.name} on ${on_string}: Cluster_scoring.pdf">
+        <filter>'Cluster_scoring' in select_outputs</filter>
+    </data>
+    <data name="Winning_genomes"
+          format="pdf"
+          from_work_dir="outdir/figures/Winning_genomes.pdf"
+          label="${tool.name} on ${on_string}: Winning_genomes.pdf">
+        <filter>'Winning_genomes' in select_outputs</filter>
+    </data>
+    <data name="Widb"
+          format="csv"
+          from_work_dir="outdir/data_tables/Widb.csv"
+          label="${tool.name} on ${on_string}: Widb.csv">
+        <filter>'Widb' in select_outputs</filter>
+    </data>
+    <data name="Chdb"
+          format="tabular"
+          from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv"
+          label="${tool.name} on ${on_string}: Chdb.tsv">
+        <filter>'Chdb' in select_outputs</filter>
+    </data>
+</xml>
+<!-- Test skeleton: supplies three Enterococcus FASTA inputs and checks the log output with the assertions injected through the yield. -->
+<xml name="test_defaults_log">
+    <test>
+        <param name="genomes"
+               value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"
+               ftype="fasta"/>
+        <output name="log">
+            <assert_contents>
+                <yield />
+            </assert_contents>
+        </output>
+    </test>
+</xml>
+<!-- Help text describing the genomes input flag in command-line usage form. -->
+<token name="@GENOMES_HELP@"><![CDATA[
+I/O PARAMETERS:
+-g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]]
+genomes to cluster in .fasta format
+(default: None)
+]]></token>
+<!-- Help text describing the filtering flags and their defaults in command-line usage form. -->
+<token name="@FILTERING_HELP@"><![CDATA[
+FILTERING OPTIONS:
+-l LENGTH, --length LENGTH
+Minimum genome length
+(default: 50000)
+-comp COMPLETENESS, --completeness COMPLETENESS
+Minimum genome completeness
+(default: 75)
+-con CONTAMINATION, --contamination CONTAMINATION
+Maximum genome contamination
+(default: 25)
+--ignoreGenomeQuality
+Don't run checkM or do any quality filtering. NOT
+RECOMMENDED! This is useful for use with
+bacteriophages or eukaryotes or things where checkM
+scoring does not work. Will only choose genomes based
+on length and N50 (default: False)
+]]></token>
+<!--
+Help text describing the genome comparison flags and their defaults in command-line usage form.
+NOTE(review): the S_algorithm choice list mentions goANI, which the genome_comparison_options
+macro in this file does not offer.
+-->
+<token name="@GENOME_COMPARISON_HELP@"><![CDATA[
+GENOME COMPARISON PARAMETERS:
+-ms MASH_SKETCH, --MASH_sketch MASH_SKETCH
+MASH sketch size (default: 1000)
+--S_algorithm {goANI,ANIn,ANImf,gANI}
+Algorithm for secondary clustering comparisons:
+ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions
+ANIn  = Align whole genomes with nucmer; compare aligned regions
+gANI  = Identify and align ORFs; compare aligned ORFS
+(default: ANImf)
+-n_PRESET {normal,tight}
+Presets to pass to nucmer
+tight   = only align highly conserved regions
+normal  = default ANIn parameters (default: normal)
+]]></token>
+<!-- Help text describing the clustering flags and their defaults in command-line usage form. -->
+<token name="@CLUSTERING_HELP@"><![CDATA[
+CLUSTERING PARAMETERS:
+-pa P_ANI, --P_ani P_ANI
+ANI threshold to form primary (MASH) clusters
+(default: 0.9)
+-sa S_ANI, --S_ani S_ANI
+ANI threshold to form secondary clusters
+(default: 0.99)
+--SkipMash            Skip MASH clustering, just do secondary clustering on
+all genomes (default: False)
+--SkipSecondary       Skip secondary clustering, just perform MASH clustering
+(default: False)
+-nc COV_THRESH, --cov_thresh COV_THRESH
+Minimum level of overlap between genomes when doing
+secondary comparisons (default: 0.1)
+-cm {total,larger}, --coverage_method {total,larger}
+Method to calculate coverage of an alignment
+(for ANIn/ANImf only; gANI can only do larger method)
+total   = 2*(aligned length) / (sum of total genome lengths)
+larger  = max((aligned length / genome 1), (aligned_length / genome2))
+(default: larger)
+--clusterAlg CLUSTERALG
+Algorithm used to cluster genomes (passed to
+scipy.cluster.hierarchy.linkage) (default: average)
+]]></token>
+<!-- Help text giving the scoring formula and describing the five weight flags and their defaults in command-line usage form. -->
+<token name="@SCORING_HELP@"><![CDATA[
+SCORING CRITERIA
+Based off of the formula:
+A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size)
+A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight:
+-comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT
+completeness weight (default: 1)
+-conW CONTAMINATION_WEIGHT, --contamination_weight CONTAMINATION_WEIGHT
+contamination weight (default: 5)
+-strW STRAIN_HETEROGENEITY_WEIGHT, --strain_heterogeneity_weight STRAIN_HETEROGENEITY_WEIGHT
+strain heterogeneity weight (default: 1)
+-N50W N50_WEIGHT, --N50_weight N50_WEIGHT
+weight of log(genome N50) (default: 0.5)
+-sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT
+weight of log(genome size) (default: 0)
+]]></token>
+<!-- Help text describing the taxonomy flags and their defaults in command-line usage form. -->
+<token name="@TAXONOMY_HELP@"><![CDATA[
+TAXONOMY:
+--run_tax             generate taxonomy information (Tdb)
+(default: False)
+--tax_method {percent,max}
+Method of determining taxonomy
+percent = The most descriptive taxonomic level with at least (per) hits
+max     = The centrifuge taxonomic level with the most overall hits
+(default: percent)
+-per PERCENT, --percent PERCENT
+minimum percent for percent method
+(default: 50)
+--cent_index CENT_INDEX
+path to centrifuge index (for example,
+/home/mattolm/download/centrifuge/indices/b+h+v)
+(default: None)
+]]></token>
+<!-- Help text describing the three warning threshold flags and their defaults in command-line usage form. -->
+<token name="@WARNINGS_HELP@"><![CDATA[
+WARNINGS:
+--warn_dist WARN_DIST
+How far from the threshold to throw cluster warnings
+(default: 0.25)
+--warn_sim WARN_SIM   Similarity threshold for warnings between dereplicated
+genomes (default: 0.98)
+--warn_aln WARN_ALN   Minimum aligned fraction for warnings between
+dereplicated genomes (ANIn) (default: 0.25)
+]]></token>
+</macros>

Mercurial > repos > iuc > drep_dereplicate

comparison macros.xml @ 0:aba9d1e647b6 draft