Mercurial > repos > iuc > drep_compare
comparison macros.xml @ 2:7de8436f7f97 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/drep commit 5e6e589002d554be180e575080e9ad66cc78ed74"
| author | iuc |
|---|---|
| date | Sat, 12 Feb 2022 17:38:03 +0000 |
| parents | a5054beafded |
| children | 50de1050e717 |
comparison
equal
deleted
inserted
replaced
| 1:a5054beafded | 2:7de8436f7f97 |
|---|---|
| 1 <?xml version="1.0"?> | |
| 1 <macros> | 2 <macros> |
| 2 <token name="@VERSION@">2.5.4</token> | 3 <token name="@TOOL_VERSION@">3.2.2</token> |
| 4 <token name="@VERSION_SUFFIX@">0</token> | |
| 5 <token name="@PROFILE@">20.01</token> | |
| 6 <xml name="biotools"> | |
| 7 <xrefs> | |
| 8 <xref type="bio.tools">drep</xref> | |
| 9 </xrefs> | |
| 10 </xml> | |
| 3 <xml name="requirements"> | 11 <xml name="requirements"> |
| 4 <requirements> | 12 <requirements> |
| 5 <requirement type="package" version="@VERSION@">drep</requirement> | 13 <requirement type="package" version="@TOOL_VERSION@">drep</requirement> |
| 6 <yield/> | 14 <yield/> |
| 7 </requirements> | 15 </requirements> |
| 8 </xml> | 16 </xml> |
| 9 <xml name="citations"> | 17 <xml name="citations"> |
| 10 <citations> | 18 <citations> |
| 11 <citation type="doi">10.1038/ismej.2017.126</citation> | 19 <citation type="doi">10.1038/ismej.2017.126</citation> |
| 12 <yield /> | 20 <yield /> |
| 13 </citations> | 21 </citations> |
| 14 </xml> | 22 </xml> |
| 15 <xml name="bio_tools"> | |
| 16 <xrefs> | |
| 17 <xref type="bio.tools">drep</xref> | |
| 18 </xrefs> | |
| 19 </xml> | |
| 20 | 23 |
| 21 <xml name="genomes"> | 24 <xml name="genomes"> |
| 22 <param argument="--genomes" type="data" format="fasta" label="genomes fasta files" multiple="true"/> | 25 <param argument="--genomes" type="data" format="fasta" multiple="true" label="Genomes to filter"/> |
| 23 </xml> | 26 </xml> |
| 27 | |
| 28 <!-- Append ".fasta" to the symlink names so that purely numeric names are not read as integers. | |
| 29 Works around a bug in dRep that will probably be fixed in the next version. --> | |
| 24 <token name="@PREPARE_GENOMES@"><![CDATA[ | 30 <token name="@PREPARE_GENOMES@"><![CDATA[ |
| 25 #import re | 31 #import re |
| 26 #set $genomefiles = [] | 32 #set $genomefiles = [] |
| 27 #for $genome in $genomes | 33 #for $genome in $genomes |
| 28 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1])) | 34 #set $input_name = $re.sub('[^\w\-_.]', '_',str($genome.element_identifier.split('/')[-1])) |
| 29 ln -s '${genome}' '${input_name}' && | 35 ln -s '${genome}' '${input_name}.fasta' && |
| 30 $genomefiles.append($input_name) | 36 $genomefiles.append($input_name) |
| 31 #end for | 37 #end for |
| 32 ]]></token> | 38 ]]></token> |
| 33 <token name="@GENOMES@"><![CDATA[ | 39 <token name="@GENOMES@"><![CDATA[ |
| 34 -g | 40 -g |
| 35 #for $genomefile in $genomefiles | 41 #for $genomefile in $genomefiles |
| 36 '${genomefile}' | 42 '${genomefile}.fasta' |
| 37 #end for | 43 #end for |
| 38 ]]></token> | 44 ]]></token> |
| 39 | |
| 40 | |
| 41 <xml name="checkm_method"> | |
| 42 <param argument="--checkM_method" type="select" label="checkm method" optional="true"> | |
| 43 <option value="taxonomy_wf">taxonomy_wf (faster)</option> | |
| 44 <option value="lineage_wf">lineage_wf (more accurate)</option> | |
| 45 </param> | |
| 46 </xml> | |
| 47 <token name="@CHECKM_METHOD@"><![CDATA[ | |
| 48 #if $checkM_method: | |
| 49 --checkM_method $checkM_method | |
| 50 #end if | |
| 51 ]]></token> | |
| 52 | 45 |
| 53 <xml name="filtering_options"> | 46 <xml name="filtering_options"> |
| 54 <conditional name="filter"> | 47 <section name="filter" title="Genome filtering" expanded="true"> |
| 55 <param name="set_options" type="select" label="set filtering options"> | 48 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/> |
| 56 <option value="yes">Yes</option> | 49 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/> |
| 57 <option value="no" selected="true">No (use --checkM_method taxonomy_wf)</option> | 50 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/> |
| 51 </section> | |
| 52 </xml> | |
| 53 <xml name="test_default_filtering_options"> | |
| 54 <section name="filter"> | |
| 55 <param name="length" value="50000"/> | |
| 56 <param name="completeness" value="75"/> | |
| 57 <param name="contamination" value="100"/> | |
| 58 </section> | |
| 59 </xml> | |
| 60 <token name="@FILTER_OPTIONS@"><![CDATA[ | |
| 61 --length $filter.length | |
| 62 --completeness $filter.completeness | |
| 63 --contamination $filter.contamination | |
| 64 ]]></token> | |
| 65 | |
| 66 <xml name="quality_assessment_options"> | |
| 67 <conditional name="quality"> | |
| 68 <param name="source" type="select" label="Genome quality filtering" help="Skipping checkM and quality filtering is not recommended, except with bacteriophages or eukaryotes or things where checkM scoring does not work. Will only choose genomes based on length and N50."> | |
| 69 <option value="checkm" selected="true">Run checkM</option> | |
| 70 <option value="genomeInfo">Provide quality information on the genome (CSV file)</option> | |
| 71 <option value="ignoreGenomeQuality">Don't run checkM or do any quality filtering (--ignoreGenomeQuality) - NOT RECOMMENDED!</option> | |
| 58 </param> | 72 </param> |
| 59 <when value="yes"> | 73 <when value="checkm"> |
| 60 <param argument="--length" type="integer" value="50000" label="Minimum genome length"/> | 74 <param argument="--checkM_method" type="select" label="CheckM method"> |
| 61 <param argument="--completeness" type="integer" value="75" min="0" max="100" label="Minimum genome completeness percent"/> | 75 <option value="lineage_wf" selected="true">lineage_wf: Lineage-specific Workflow - quality estimates with lineage-specific markers (more accurate)</option> |
| 62 <param argument="--contamination" type="integer" value="25" min="0" max="100" label="Maximum genome contamination percent"/> | 76 <option value="taxonomy_wf">taxonomy_wf: Taxonomic-specific Workflow - quality estimates with taxonomic-specific markers (faster)</option> |
| 63 | 77 </param> |
| 64 <conditional name="quality"> | 78 <param argument="--set_recursion" type="integer" optional="true" label="Increases the python recursion limit" help="NOT RECOMMENDED unless checkM is crashing due to recursion issues. Recommended to set to 2000 if needed, but setting this could crash Python"/> |
| 65 <param argument="source" type="select" label="genome quality"> | 79 <param argument="--checkm_group_size" type="integer" value="2000" min="1" label="Number of genomes passed to checkM at a time" help="Increasing this increases RAM but makes checkM faster"/> |
| 66 <help> | |
| 67 --ignoreGenomeQuality is useful with | |
| 68 bacteriophages or eukaryotes or things where checkM | |
| 69 scoring does not work. Will only choose genomes based | |
| 70 on length and N50. | |
| 71 </help> | |
| 72 <option value="checkm" selected="true">Run checkM</option> | |
| 73 <option value="genomeInfo">User supplied genomeInfo csv file</option> | |
| 74 <option value="ignoreGenomeQuality">--ignoreGenomeQuality (NOT RECOMMENDED!)</option> | |
| 75 </param> | |
| 76 <when value="checkm"> | |
| 77 <param argument="--checkM_method" type="select" label="checkm method" optional="true"> | |
| 78 <help> | |
| 79 Using the checkm method of lineage_wf can require more than 40Gb of RAM. | |
| 80 </help> | |
| 81 <option value="taxonomy_wf">taxonomy_wf (faster)</option> | |
| 82 <option value="lineage_wf">lineage_wf (more accurate)</option> | |
| 83 </param> | |
| 84 </when> | |
| 85 <when value="genomeInfo"> | |
| 86 <param argument="--genomeInfo" type="data" format="csv" label="genomes fasta files"> | |
| 87 <help><![CDATA[ | |
| 88 A CSV dataset that must contain: [ | |
| 89 "genome"(history dataset name of .fasta dataset of that genome), | |
| 90 "completeness"(0-100 value for completeness of the genome), | |
| 91 "contamination"(0-100 value of the contamination of the genome)] | |
| 92 ]]></help> | |
| 93 </param> | |
| 94 </when> | |
| 95 <when value="ignoreGenomeQuality"/> | |
| 96 </conditional> | |
| 97 </when> | 80 </when> |
| 98 <when value="no"/> | 81 <when value="genomeInfo"> |
| 99 </conditional> | 82 <param argument="--genomeInfo" type="data" format="csv" label="Quality information on the genomes"> |
| 100 </xml> | 83 <help><![CDATA[ |
| 101 <token name="@FILTER_OPTIONS@"><![CDATA[ | 84 A CSV dataset that must contain: [ |
| 102 #if $filter.set_options == 'yes': | 85 "genome"(history dataset name of .fasta dataset of that genome), |
| 103 --length $filter.length | 86 "completeness"(0-100 value for completeness of the genome), |
| 104 --completeness $filter.completeness | 87 "contamination"(0-100 value of the contamination of the genome)] |
| 105 --contamination $filter.contamination | 88 ]]></help> |
| 106 #if $filter.quality.source == 'checkm' | |
| 107 --checkM_method $filter.quality.checkM_method | |
| 108 #elif $filter.quality.source == 'genomeInfo' | |
| 109 --genomeInfo $filter.quality.genomeInfo | |
| 110 #elif $filter.quality.source == 'ignoreGenomeQuality' | |
| 111 --ignoreGenomeQuality | |
| 112 #end if | |
| 113 #else | |
| 114 --checkM_method taxonomy_wf | |
| 115 #end if | |
| 116 ]]></token> | |
| 117 | |
| 118 <xml name="genome_comparison_options"> | |
| 119 <conditional name="genome_comparison"> | |
| 120 <param name="set_options" type="select" label="set genome comparison options"> | |
| 121 <option value="yes">Yes</option> | |
| 122 <option value="no" selected="true">No</option> | |
| 123 </param> | |
| 124 <when value="yes"> | |
| 125 <param argument="--MASH_sketch" type="integer" value="1000" label="MASH sketch size"/> | |
| 126 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comaprisons"> | |
| 127 <option value="ANImf" selected="true">ANImf = (RECOMMENDED) Align whole genomes with nucmer; filter alignment; compare aligned regions</option> | |
| 128 <option value="ANIn">ANIn = Align whole genomes with nucmer; compare aligned regions</option> | |
| 129 <option value="gANI">gANI = Identify and align ORFs; compare aligned ORFS</option> | |
| 130 </param> | |
| 131 <param argument="-n_PRESET" type="select" label="Presets to pass to nucmer"> | |
| 132 <option value="normal" selected="true">normal = default ANIn parameters (default: normal)</option> | |
| 133 <option value="tight">tight = only align highly conserved regions</option> | |
| 134 </param> | 89 </param> |
| 135 </when> | 90 </when> |
| 136 <when value="no"/> | 91 <when value="ignoreGenomeQuality"/> |
| 137 </conditional> | 92 </conditional> |
| 138 </xml> | 93 </xml> |
| 139 <token name="@GENOME_COMPARISON_OPTIONS@"><![CDATA[ | 94 <xml name="test_default_quality_assessment_options"> |
| 140 #if $genome_comparison.set_options == 'yes': | 95 <conditional name="quality"> |
| 141 --MASH_sketch $genome_comparison.MASH_sketch | 96 <param name="source" value="checkm"/> |
| 142 --S_algorithm $genome_comparison.S_algorithm | 97 <param name="checkM_method" value="taxonomy_wf"/> |
| 143 -n_PRESET $genome_comparison.n_PRESET | 98 <param name="checkm_group_size" value="2000"/> |
| 144 #end if | 99 </conditional> |
| 145 ]]></token> | 100 </xml> |
| 146 | 101 <token name="@QUALITY_ASSESSMENT_OPTIONS@"><![CDATA[ |
| 147 <xml name="clustering_options"> | 102 #if $quality.source == 'checkm' |
| 103 --checkM_method '$quality.checkM_method' | |
| 104 #if str($quality.set_recursion) != '' | |
| 105 --set_recursion $quality.set_recursion | |
| 106 #end if | |
| 107 --checkm_group_size $quality.checkm_group_size | |
| 108 #else if $quality.source == 'genomeInfo' | |
| 109 --genomeInfo '$quality.genomeInfo' | |
| 110 #else if $quality.source == 'ignoreGenomeQuality' | |
| 111 --ignoreGenomeQuality | |
| 112 #end if | |
| 113 ]]></token> | |
| 114 | |
| 115 <xml name="mash"> | |
| 116 <param argument="--MASH_sketch" type="integer" value="1000" min="0" label="MASH sketch size"/> | |
| 117 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary clusters"/> | |
| 118 <param argument="--multiround_primary_clustering" type='boolean' checked="false" truevalue='--multiround_primary_clustering' falsevalue='' label="Cluster each primary chunk separately and merge at the end with single linkage?" help="Decreases RAM usage and increases speed, at the cost of a minor loss in precision and the inability to plot primary_clustering_dendrograms. Especially helpful when clustering 5000+ genomes. Will be done with single linkage clustering"/> | |
| 119 <param argument="--primary_chunksize" type="integer" value="5000" min="1" label="Impacts multiround_primary_clusterings" help=" If you have more than this many genomes, process them in chunks of this size"/> | |
| 120 </xml> | |
| 121 <xml name="test_default_mash"> | |
| 122 <param name="MASH_sketch" value="1000"/> | |
| 123 <param name="P_ani" value="0.9"/> | |
| 124 <param name="multiround_primary_clustering" value=''/> | |
| 125 <param name="primary_chunksize" value="5000"/> | |
| 126 </xml> | |
| 127 <token name="@MASH@"><![CDATA[ | |
| 128 --MASH_sketch '$comp_clust.steps.MASH_sketch' | |
| 129 --P_ani $comp_clust.steps.P_ani | |
| 130 $comp_clust.steps.multiround_primary_clustering | |
| 131 --primary_chunksize $comp_clust.steps.primary_chunksize | |
| 132 ]]></token> | |
| 133 | |
| 134 <xml name="nucmer"> | |
| 135 <param argument="--n_PRESET" type="select" label="Presets to pass to nucmer"> | |
| 136 <option value="normal" selected="true">normal: default ANIn parameters</option> | |
| 137 <option value="tight">tight: only align highly conserved regions</option> | |
| 138 </param> | |
| 139 </xml> | |
| 140 <xml name="test_default_nucmer"> | |
| 141 <param name="n_PRESET" value="normal"/> | |
| 142 </xml> | |
| 143 <token name="@NUCMER@"><![CDATA[ | |
| 144 --n_PRESET '$comp_clust.steps.clustering.n_PRESET' | |
| 145 ]]></token> | |
| 146 | |
| 147 <xml name="coverage_method"> | |
| 148 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment"> | |
| 149 <option value="larger" selected="true">Larger = max((aligned length / genome 1), (aligned_length / genome2))</option> | |
| 150 <option value="total">Total = 2*(aligned length) / (sum of total genome lengths)</option> | |
| 151 </param> | |
| 152 </xml> | |
| 153 <xml name="test_default_coverage_method"> | |
| 154 <param name="coverage_method" value="larger"/> | |
| 155 </xml> | |
| 156 <token name="@COVERAGE_METHOD@"><![CDATA[ | |
| 157 --coverage_method '$comp_clust.steps.clustering.coverage_method' | |
| 158 ]]></token> | |
| 159 | |
| 160 <xml name="secondary_clustering"> | |
| 148 <conditional name="clustering"> | 161 <conditional name="clustering"> |
| 149 <param name="set_options" type="select" label="set clustering options"> | 162 <param argument="--S_algorithm" type="select" label="Algorithm for secondary clustering comparisons"> |
| 150 <option value="yes">Yes</option> | 163 <option value="fastANI">fastANI: Kmer-based approach - very fast</option> |
| 151 <option value="no" selected="true">No</option> | 164 <option value="ANImf" selected="true">ANImf: Align whole genomes with nucmer; filter alignment; compare aligned regions - RECOMMENDED</option> |
| 165 <option value="ANIn">ANIn: Align whole genomes with nucmer; compare aligned regions</option> | |
| 166 <option value="gANI">gANI: Identify and align ORFs; compare aligned ORFS</option> | |
| 167 <option value="goANI">goANI: Open source version of gANI; requires nsimscan</option> | |
| 152 </param> | 168 </param> |
| 153 <when value="yes"> | 169 <when value="fastANI"> |
| 154 <param argument="--P_ani" type="float" value="0.9" min="0." max="1." label="ANI threshold to form primary (MASH) clusters"/> | 170 <param argument="--greedy_secondary_clustering" type='boolean' checked="false" truevalue='--greedy_secondary_clustering' falsevalue='' label="Use a heuristic to avoid pair-wise comparisons when doing secondary clustering?" help="Will be done with single linkage clustering"/> |
| 155 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/> | 171 </when> |
| 156 | 172 <when value="ANImf"> |
| 157 <param argument="--SkipMash" type="boolean" truevalue="--SkipMash" falsevalue="" checked="false" label="Skip MASH clustering, just do secondary clustering on all genomes"/> | 173 <expand macro="nucmer"/> |
| 158 <param argument="--SkipSecondary" type="boolean" truevalue="--SkipSecondary" falsevalue="" checked="false" label="Skip secondary clustering, just perform MASH clustering"/> | 174 <expand macro="coverage_method"/> |
| 159 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minmum level of overlap between genomes when doing secondary comparisons"/> | 175 </when> |
| 160 <param argument="--coverage_method" type="select" label="Method to calculate coverage of an alignment"> | 176 <when value="ANIn"> |
| 161 <help>(for ANIn/ANImf only; gANI can only do larger method)</help> | 177 <expand macro="nucmer"/> |
| 162 <option value="larger" selected="true">arger = max((aligned length / genome 1), (aligned_length / genome2))</option> | 178 <expand macro="coverage_method"/> |
| 163 <option value="total">total = 2*(aligned length) / (sum of total genome lengths)</option> | 179 </when> |
| 180 <when value="gANI"/> | |
| 181 <when value="goANI"/> | |
| 182 </conditional> | |
| 183 <param argument="--S_ani" type="float" value="0.99" min="0." max="1." label="ANI threshold to form secondary clusters"/> | |
| 184 <param argument="--cov_thresh" type="float" value="0.1" min="0." max="1." label="Minimum level of overlap between genomes when doing secondary comparisons"/> | |
| 185 </xml> | |
| 186 <xml name="test_default_secondary_clustering"> | |
| 187 <conditional name="clustering"> | |
| 188 <param name="S_algorithm" value="ANImf"/> | |
| 189 <expand macro="test_default_nucmer"/> | |
| 190 <expand macro="test_default_coverage_method"/> | |
| 191 </conditional> | |
| 192 <param name="S_ani" value="0.99"/> | |
| 193 <param name="cov_thresh" value="0.1"/> | |
| 194 </xml> | |
| 195 <token name="@SECONDARY_CLUSTERING@"><![CDATA[ | |
| 196 --S_algorithm '$comp_clust.steps.clustering.S_algorithm' | |
| 197 #if $comp_clust.steps.clustering.S_algorithm == 'fastANI' | |
| 198 $comp_clust.steps.clustering.greedy_secondary_clustering | |
| 199 #else if $comp_clust.steps.clustering.S_algorithm == 'ANImf' | |
| 200 @NUCMER@ | |
| 201 @COVERAGE_METHOD@ | |
| 202 #else if $comp_clust.steps.clustering.S_algorithm == 'ANIn' | |
| 203 @NUCMER@ | |
| 204 @COVERAGE_METHOD@ | |
| 205 #end if | |
| 206 --S_ani $comp_clust.steps.S_ani | |
| 207 --cov_thresh $comp_clust.steps.cov_thresh | |
| 208 ]]></token> | |
| 209 | |
| 210 <xml name="comparison_clustering_options"> | |
| 211 <section name="comp_clust" title="Genome comparison and clustering" expanded="false"> | |
| 212 <conditional name="steps"> | |
| 213 <param name="select" type="select" label="Steps in genome comparison"> | |
| 214 <option value="default" selected="true">Default: Run MASH clustering and a secondary clustering</option> | |
| 215 <option value="SkipMash">Skip MASH clustering, just do secondary clustering on all genomes</option> | |
| 216 <option value="SkipSecondary">Skip secondary clustering, just perform MASH clustering</option> | |
| 164 </param> | 217 </param> |
| 165 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes"> | 218 <when value="default"> |
| 166 <help>(passed to scipy.cluster.hierarchy.linkage)</help> | 219 <expand macro="mash"/> |
| 167 <option value="average" selected="true">average</option> | 220 <expand macro="secondary_clustering"/> |
| 168 </param> | 221 </when> |
| 169 </when> | 222 <when value="SkipMash"> |
| 170 <when value="no"/> | 223 <expand macro="secondary_clustering"/> |
| 171 </conditional> | 224 </when> |
| 172 </xml> | 225 <when value="SkipSecondary"> |
| 173 <token name="@CLUSTERING_OPTIONS@"><![CDATA[ | 226 <expand macro="mash"/> |
| 174 #if $clustering.set_options == 'yes': | 227 </when> |
| 175 --P_ani $clustering.P_ani | 228 </conditional> |
| 176 --S_ani $clustering.S_ani | 229 <param argument="--clusterAlg" type="select" label="Algorithm used to cluster genomes" help="Passed to scipy.cluster.hierarchy.linkage"> |
| 177 $clustering.SkipMash | 230 <option value="average" selected="true">average</option> |
| 178 $clustering.SkipSecondary | 231 <option value="ward">ward</option> |
| 179 --cov_thresh $clustering.cov_thresh | 232 <option value="single">single</option> |
| 180 --coverage_method $clustering.coverage_method | 233 <option value="median">median</option> |
| 181 --clusterAlg $clustering.clusterAlg | 234 <option value="centroid">centroid</option> |
| 182 #end if | 235 <option value="weighted">weighted</option> |
| 236 </param> | |
| 237 <param argument="--run_tertiary_clustering" type='boolean' checked="false" truevalue='--run_tertiary_clustering' falsevalue='' label="Run an additional round of clustering on the final genome set?" help="This is especially useful when greedy clustering is performed and/or to handle cases where similar genomes end up in different primary clusters."/> | |
| 238 </section> | |
| 239 </xml> | |
| 240 <xml name="test_default_comparison_clustering_options"> | |
| 241 <section name="comp_clust"> | |
| 242 <conditional name="steps"> | |
| 243 <param name="select" value="default" /> | |
| 244 <expand macro="test_default_mash"/> | |
| 245 <expand macro="test_default_secondary_clustering"/> | |
| 246 </conditional> | |
| 247 <param name="clusterAlg" value="average"/> | |
| 248 <param name="run_tertiary_clustering" value=''/> | |
| 249 </section> | |
| 250 </xml> | |
| 251 <token name="@COMPARISON_CLUSTERING_OPTIONS@"><![CDATA[ | |
| 252 #if $comp_clust.steps.select == 'default' | |
| 253 @MASH@ | |
| 254 @SECONDARY_CLUSTERING@ | |
| 255 #else if $comp_clust.steps.select == 'SkipMash' | |
| 256 --SkipMash | |
| 257 @SECONDARY_CLUSTERING@ | |
| 258 #else | |
| 259 @MASH@ | |
| 260 --SkipSecondary | |
| 261 #end if | |
| 262 --clusterAlg '$comp_clust.clusterAlg' | |
| 263 $comp_clust.run_tertiary_clustering | |
| 183 ]]></token> | 264 ]]></token> |
| 184 | 265 |
| 185 <xml name="scoring_options"> | 266 <xml name="scoring_options"> |
| 186 <conditional name="scoring"> | 267 <section name="scoring" title="Scoring criteria" expanded="false" help="Based off of the formula: A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) + F*(centrality - S_ani). With A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; F = centrality_weight"> |
| 187 <param name="set_options" type="select" label="set scoring options"> | 268 <param argument="--completeness_weight" type="float" value="1" label="Completeness weight"/> |
| 188 <option value="yes">Yes</option> | 269 <param argument="--contamination_weight" type="float" value="5" label="Contamination weight"/> |
| 189 <option value="no" selected="true">No</option> | 270 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="Strain heterogeneity weight"/> |
| 190 </param> | 271 <param argument="--N50_weight" type="float" value=".5" label="Weight of log(genome N50)"/> |
| 191 <when value="yes"> | 272 <param argument="--size_weight" type="float" value="0" label="Weight of log(genome size)"/> |
| 192 <param argument="--completeness_weight" type="float" value="1" label="completeness weight"> | 273 <param argument="--centrality_weight" type="float" value="1" label="Weight of (centrality - S_ani)"/> |
| 193 <help> | 274 </section> |
| 194 Based off of the formula: | 275 </xml> |
| 195 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) | 276 <xml name="test_default_scoring_options"> |
| 196 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight; | 277 <section name="scoring"> |
| 197 </help> | 278 <param name="completeness_weight" value="1"/> |
| 198 </param> | 279 <param name="contamination_weight" value="5"/> |
| 199 <param argument="--contamination_weight" type="float" value="5" label="contamination weight"/> | 280 <param name="strain_heterogeneity_weight" value="1"/> |
| 200 <param argument="--strain_heterogeneity_weight" type="float" value="1" min="0." max="1." label="strain heterogeneity weight"/> | 281 <param name="N50_weight" value=".5" /> |
| 201 <param argument="--N50_weight" type="float" value=".5" label="weight of log(genome N50)"/> | 282 <param name="size_weight" value="0"/> |
| 202 <param argument="--size_weight" type="float" value="0" label="weight of log(genome size)"/> | 283 <param name="centrality_weight" value="1"/> |
| 203 </when> | 284 </section> |
| 204 <when value="no"/> | |
| 205 </conditional> | |
| 206 </xml> | 285 </xml> |
| 207 <token name="@SCORING_OPTIONS@"><![CDATA[ | 286 <token name="@SCORING_OPTIONS@"><![CDATA[ |
| 208 #if $scoring.set_options == 'yes': | 287 --completeness_weight $scoring.completeness_weight |
| 209 --completeness_weight $scoring.completeness_weight | 288 --contamination_weight $scoring.contamination_weight |
| 210 --contamination_weight $scoring.contamination_weight | 289 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight |
| 211 --strain_heterogeneity_weight $scoring.strain_heterogeneity_weight | 290 --N50_weight $scoring.N50_weight |
| 212 --N50_weight $scoring.N50_weight | 291 --size_weight $scoring.size_weight |
| 213 --size_weight $scoring.size_weight | 292 --centrality_weight $scoring.centrality_weight |
| 214 #end if | 293 ]]></token> |
| 215 ]]></token> | 294 |
| 216 | 295 <xml name="warning_options"> |
| 217 <xml name="taxonomy_options"> | 296 <section name="warning" title="Warnings" expanded="false"> |
| 218 <conditional name="taxonomy"> | 297 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/> |
| 219 <param name="set_options" type="select" label="generate taxonomy information"> | 298 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/> |
| 220 <option value="yes">Yes</option> | 299 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/> |
| 221 <option value="no" selected="true">No</option> | 300 </section> |
| 222 </param> | 301 </xml> |
| 223 <when value="yes"> | 302 <xml name="test_default_warning_options"> |
| 224 <param argument="--tax_method" type="select" label="Method of determining taxonomy"> | 303 <section name="warning"> |
| 225 <help>(for ANIn/ANImf only; gANI can only do larger method)</help> | 304 <param name="warn_dist" value="0.25"/> |
| 226 <option value="percent" selected="true">percent = The most descriptive taxonimic level with at least (per) hits</option> | 305 <param name="warn_sim" value="0.98"/> |
| 227 <option value="max">max = The centrifuge taxonomic level with the most overall hits</option> | 306 <param name="warn_aln" value="0.25"/> |
| 228 </param> | 307 </section> |
| 229 <param argument="--percent" type="float" value="50" min="0" max="100" label="minimum percent for percent method"/> | |
| 230 <param argument="--cent_index" type="data" format="" label="centrifuge index"/> | |
| 231 </when> | |
| 232 <when value="no"/> | |
| 233 </conditional> | |
| 234 </xml> | |
| 235 <token name="@TAXONOMY_OPTIONS@"><![CDATA[ | |
| 236 #if $taxonomy.set_options == 'yes': | |
| 237 --run_tax | |
| 238 --tax_method $taxonomy.tax_method | |
| 239 --percent $taxonomy.percent | |
| 240 --cent_index $taxonomy.cent_index | |
| 241 #end if | |
| 242 ]]></token> | |
| 243 | |
| 244 <xml name="warning_options"> | |
| 245 <conditional name="warning"> | |
| 246 <param name="set_options" type="select" label="set warning options"> | |
| 247 <option value="yes">Yes</option> | |
| 248 <option value="no" selected="true">No</option> | |
| 249 </param> | |
| 250 <when value="yes"> | |
| 251 <param argument="--warn_dist" type="float" value="0.25" min="0" max="1" label="How far from the threshold to throw cluster warnings"/> | |
| 252 <param argument="--warn_sim" type="float" value="0.98" min="0" max="1" label="Similarity threshold for warnings between dereplicated genomes"/> | |
| 253 <param argument="--warn_aln" type="float" value="0.25" min="0" max="1" label="Minimum aligned fraction for warnings between dereplicated genomes (ANIn)"/> | |
| 254 </when> | |
| 255 <when value="no"/> | |
| 256 </conditional> | |
| 257 </xml> | 308 </xml> |
| 258 <token name="@WARNING_OPTIONS@"><![CDATA[ | 309 <token name="@WARNING_OPTIONS@"><![CDATA[ |
| 259 #if $warning.set_options == 'yes': | 310 --warn_dist $warning.warn_dist |
| 260 --warn_dist $warning.warn_dist | 311 --warn_sim $warning.warn_sim |
| 261 --warn_sim $warning.warn_sim | 312 --warn_aln $warning.warn_aln |
| 262 --warn_aln $warning.warn_aln | |
| 263 #end if | |
| 264 ]]></token> | 313 ]]></token> |
| 265 | 314 |
| 266 <xml name="select_outputs"> | 315 <xml name="select_outputs"> |
| 267 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs"> | 316 <param name="select_outputs" type="select" multiple="true" optional="false" label="Select outputs"> |
| 268 <option value="log" selected="true">log</option> | 317 <option value="log" selected="true">log</option> |
| 280 <option value="Winning_genomes">Winning_genomes.pdf</option> | 329 <option value="Winning_genomes">Winning_genomes.pdf</option> |
| 281 <option value="Widb">Widb.csv</option> | 330 <option value="Widb">Widb.csv</option> |
| 282 <option value="Chdb">Chdb.tsv</option> | 331 <option value="Chdb">Chdb.tsv</option> |
| 283 </expand> | 332 </expand> |
| 284 </xml> | 333 </xml> |
| 285 | 334 <xml name="test_default_select_drep_outputs"> |
| 286 <xml name="common_outputs"> | 335 <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots,Cluster_scoring,Winning_genomes,Widb" /> |
| 336 </xml> | |
| 337 <xml name="test_default_select_outputs"> | |
| 338 <param name="select_outputs" value="log,warnings,Primary_clustering_dendrogram,Clustering_scatterplots" /> | |
| 339 </xml> | |
| 340 | |
| 341 <xml name="common_outputs"> | |
| 287 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log"> | 342 <data name="log" format="txt" label="${tool.name} on ${on_string}: Log" from_work_dir="outdir/log/logger.log"> |
| 288 <filter>'log' in select_outputs or not select_outputs</filter> | 343 <filter>'log' in select_outputs or not select_outputs</filter> |
| 289 </data> | 344 </data> |
| 290 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt"> | 345 <data name="warnings" format="txt" label="${tool.name} on ${on_string}: Warnings" from_work_dir="outdir/log/warnings.txt"> |
| 291 <filter>'warnings' in select_outputs</filter> | 346 <filter>'warnings' in select_outputs</filter> |
| 301 </data> | 356 </data> |
| 302 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf"> | 357 <data name="Clustering_scatterplots" format="pdf" label="${tool.name} on ${on_string}: Clustering_scatterplots.pdf" from_work_dir="outdir/figures/Clustering_scatterplots.pdf"> |
| 303 <filter>'Clustering_scatterplots' in select_outputs</filter> | 358 <filter>'Clustering_scatterplots' in select_outputs</filter> |
| 304 </data> | 359 </data> |
| 305 </xml> | 360 </xml> |
| 306 | |
| 307 | |
| 308 <xml name="drep_outputs"> | 361 <xml name="drep_outputs"> |
| 309 <expand macro="common_outputs"/> | 362 <expand macro="common_outputs"/> |
| 310 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf"> | 363 <data name="Cluster_scoring" format="pdf" label="${tool.name} on ${on_string}: Cluster_scoring.pdf" from_work_dir="outdir/figures/Cluster_scoring.pdf"> |
| 311 <filter>'Cluster_scoring' in select_outputs</filter> | 364 <filter>'Cluster_scoring' in select_outputs</filter> |
| 312 </data> | 365 </data> |
| 318 </data> | 371 </data> |
| 319 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv"> | 372 <data name="Chdb" format="tabular" label="${tool.name} on ${on_string}: Chdb.tsv" from_work_dir="outdir/data/checkM/checkM_outdir/Chdb.tsv"> |
| 320 <filter>'Chdb' in select_outputs</filter> | 373 <filter>'Chdb' in select_outputs</filter> |
| 321 </data> | 374 </data> |
| 322 </xml> | 375 </xml> |
| 323 | 376 <xml name="test_string_inputs"> |
| 324 | 377 <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/> |
| 325 <xml name="test_defaults_log"> | 378 </xml> |
| 326 <test> | 379 <xml name="test_integer_inputs"> |
| 327 <param name="genomes" ftype="fasta" value="Enterococcus_casseliflavus_EC20.fasta,Enterococcus_faecalis_T2.fna,Enterococcus_faecalis_TX0104.fa"/> | 380 <param name="genomes" ftype="fasta" value="001,002,003"/> |
| 328 <output name="log"> | 381 </xml> |
| 329 <assert_contents> | 382 <xml name="test_log_output"> |
| 330 <yield/> | 383 <output name="log"> |
| 331 </assert_contents> | 384 <assert_contents> |
| 332 </output> | 385 <yield/> |
| 333 </test> | 386 </assert_contents> |
| 334 </xml> | 387 </output> |
| 335 | 388 </xml> |
| 336 <token name="@GENOMES_HELP@"><![CDATA[ | 389 <token name="@GENOMES_HELP@"><![CDATA[ |
| 337 I/O PARAMETERS: | 390 I/O PARAMETERS: |
| 338 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]] | 391 -g [GENOMES [GENOMES ...]], --genomes [GENOMES [GENOMES ...]] |
| 339 genomes to cluster in .fasta format | 392 genomes to cluster in .fasta format |
| 340 (default: None) | 393 (default: None) |
| 341 | 394 |
| 342 | 395 |
| 343 ]]></token> | 396 ]]></token> |
| 344 | |
| 345 <token name="@FILTERING_HELP@"><![CDATA[ | 397 <token name="@FILTERING_HELP@"><![CDATA[ |
| 346 FILTERING OPTIONS: | 398 FILTERING OPTIONS: |
| 347 -l LENGTH, --length LENGTH | 399 -l LENGTH, --length LENGTH |
| 348 Minimum genome length | 400 Minimum genome length |
| 349 (default: 50000) | 401 (default: 50000) |
| 366 scoring does not work. Will only choose genomes based | 418 scoring does not work. Will only choose genomes based |
| 367 on length and N50 (default: False) | 419 on length and N50 (default: False) |
| 368 | 420 |
| 369 | 421 |
| 370 ]]></token> | 422 ]]></token> |
| 371 | |
| 372 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[ | 423 <token name="@GENOME_COMPARISON_HELP@"><![CDATA[ |
| 373 GENOME COMPARISON PARAMETERS: | 424 GENOME COMPARISON PARAMETERS: |
| 374 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH | 425 -ms MASH_SKETCH, --MASH_sketch MASH_SKETCH |
| 375 MASH sketch size (default: 1000) | 426 MASH sketch size (default: 1000) |
| 376 | 427 |
| 385 Presets to pass to nucmer | 436 Presets to pass to nucmer |
| 386 tight = only align highly conserved regions | 437 tight = only align highly conserved regions |
| 387 normal = default ANIn parameters (default: normal) | 438 normal = default ANIn parameters (default: normal) |
| 388 | 439 |
| 389 ]]></token> | 440 ]]></token> |
| 390 | |
| 391 <token name="@CLUSTERING_HELP@"><![CDATA[ | 441 <token name="@CLUSTERING_HELP@"><![CDATA[ |
| 392 CLUSTERING PARAMETERS: | 442 CLUSTERING PARAMETERS: |
| 393 -pa P_ANI, --P_ani P_ANI | 443 -pa P_ANI, --P_ani P_ANI |
| 394 ANI threshold to form primary (MASH) clusters | 444 ANI threshold to form primary (MASH) clusters |
| 395 (default: 0.9) | 445 (default: 0.9) |
| 415 --clusterAlg CLUSTERALG | 465 --clusterAlg CLUSTERALG |
| 416 Algorithm used to cluster genomes (passed to | 466 Algorithm used to cluster genomes (passed to |
| 417 scipy.cluster.hierarchy.linkage (default: average) | 467 scipy.cluster.hierarchy.linkage (default: average) |
| 418 | 468 |
| 419 ]]></token> | 469 ]]></token> |
| 420 | |
| 421 <token name="@SCORING_HELP@"><![CDATA[ | 470 <token name="@SCORING_HELP@"><![CDATA[ |
| 422 SCORING CRITERIA | 471 SCORING CRITERIA |
| 423 Based off of the formula: | 472 Based off of the formula: |
| 424 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) | 473 A*Completeness - B*Contamination + C*(Contamination * (strain_heterogeneity/100)) + D*log(N50) + E*log(size) |
| 425 | 474 |
| 426 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight: | 475 A = completeness_weight; B = contamination_weight; C = strain_heterogeneity_weight; D = N50_weight; E = size_weight: |
| 427 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT | 476 -comW COMPLETENESS_WEIGHT, --completeness_weight COMPLETENESS_WEIGHT |
| 428 completeness weight (default: 1) | 477 completeness weight (default: 1) |
| 435 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT | 484 -sizeW SIZE_WEIGHT, --size_weight SIZE_WEIGHT |
| 436 weight of log(genome size) (default: 0) | 485 weight of log(genome size) (default: 0) |
| 437 | 486 |
| 438 | 487 |
| 439 ]]></token> | 488 ]]></token> |
| 440 | |
| 441 <token name="@TAXONOMY_HELP@"><![CDATA[ | 489 <token name="@TAXONOMY_HELP@"><![CDATA[ |
| 442 TAXONOMY: | 490 TAXONOMY: |
| 443 --run_tax generate taxonomy information (Tdb) | 491 --run_tax generate taxonomy information (Tdb) |
| 444 (default: False) | 492 (default: False) |
| 445 | 493 |
| 459 path to centrifuge index (for example, | 507 path to centrifuge index (for example, |
| 460 /home/mattolm/download/centrifuge/indices/b+h+v | 508 /home/mattolm/download/centrifuge/indices/b+h+v |
| 461 (default: None) | 509 (default: None) |
| 462 | 510 |
| 463 ]]></token> | 511 ]]></token> |
| 464 | |
| 465 <token name="@WARNINGS_HELP@"><![CDATA[ | 512 <token name="@WARNINGS_HELP@"><![CDATA[ |
| 466 WARNINGS: | 513 WARNINGS: |
| 467 --warn_dist WARN_DIST | 514 --warn_dist WARN_DIST |
| 468 How far from the threshold to throw cluster warnings | 515 How far from the threshold to throw cluster warnings |
| 469 (default: 0.25) | 516 (default: 0.25) |
| 471 genomes (default: 0.98) | 518 genomes (default: 0.98) |
| 472 --warn_aln WARN_ALN Minimum aligned fraction for warnings between | 519 --warn_aln WARN_ALN Minimum aligned fraction for warnings between |
| 473 dereplicated genomes (ANIn) (default: 0.25) | 520 dereplicated genomes (ANIn) (default: 0.25) |
| 474 | 521 |
| 475 ]]></token> | 522 ]]></token> |
| 476 | |
| 477 | |
| 478 </macros> | 523 </macros> |
