Mercurial > repos > galaxy-australia > gtdbtk_classify_wf
changeset 1:767e50ca160e draft default tip
Deleted all files
author | galaxy-australia |
---|---|
date | Mon, 19 Dec 2022 23:13:50 +0000 |
parents | aaa924525210 |
children | |
files | gtdbtk_classify_wf.xml macros.xml test-data/genome_1.fna.gz test-data/gtdbtk_database.loc tool-data/gtdbtk_database.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 7 files changed, 0 insertions(+), 236 deletions(-) [+] |
line wrap: on
line diff
--- a/gtdbtk_classify_wf.xml Mon Dec 19 07:03:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,156 +0,0 @@ -<tool id="gtdbtk_classify_wf" name="GTDB-Tk Classify genomes" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> - <description>by placement in GTDB reference tree</description> - <macros> - <import>macros.xml</import> - </macros> - <expand macro="requirements"/> - <command detect_errors="exit_code"><![CDATA[ -#import re - -mkdir input_dir && -mkdir output_dir && -mkdir output_tsv_dir && -mkdir output_newick_dir && -mkdir output_fasta_dir && -#for $i in $input: - ## gtdbtk uses the file extension to determine the input format. - #set ext = "." + $i.ext - #set input_identifier = re.sub('[^\s\w\-]', '_', str($i.element_identifier)) + $ext - ln -s '${i}' input_dir/'${input_identifier}' && -#end for -export GTDBTK_DATA_PATH=$gtdbtk_db.fields.path && -gtdbtk classify_wf ---genome_dir input_dir ---extension '$ext' ---out_dir output_dir ---cpus \${GALAXY_SLOTS:-4} ---min_perc_aa $advanced.min_perc_aa -$advanced.force ---min_af $advanced.min_af -#if str($advanced.output_process_log) == 'yes': - && cat output_dir/gtdbtk.warnings.log output_dir/gtdbtk.log > '$process_log' -#end if - ]]></command> - <inputs> - <param name="input" type="data" format="fasta,fasta.gz" multiple="true" label="Fasta (Genome) files"/> - <param name="gtdbtk_db" type="select" label="GTDB-Tk database"> - <options from_data_table="gtdbtk_database"> - <validator type="no_options" message="No locally cached GTDB-Tk database is available"/> - </options> - </param> - <section name="advanced" title="Advanced options"> - <param argument="--min_perc_aa" type="integer" min="0" max="100" value="10" label="Exclude genomes that do not have at least this percentage of AA in the MSA" help="Inclusive bound"/> - <param argument="--force" type="boolean" truevalue="--force" falsevalue="" checked="false" label="Continue processing if an error occurs on a single genome?"/> - <param 
argument="--min_af" type="float" min="0" max="1" value="0.65" label="Minimum alignment fraction to consider closest genome"/> - <param name="output_process_log" type="boolean" truevalue="yes" falsevalue="no" checked="false" label="Output process log file?"/> - </section> - </inputs> - <outputs> - <data name="process_log" format="txt" label="${tool.name} on ${on_string} (process log)"> - <filter>advanced['output_process_log'] == 'yes'</filter> - </data> - <collection name="output_tsv" type="list" format="tsv" label="${tool.name} on ${on_string} (tsv)"> - <discover_datasets pattern="(?P<designation>.+)\.tsv" ext="tsv" directory="output_dir"/> - </collection> - <collection name="output_newick" type="list" format="newick" label="${tool.name} on ${on_string} (newick)"> - <discover_datasets pattern="(?P<designation>.+)\.tree" ext="newick" directory="output_dir"/> - </collection> - <collection name="output_fasta" type="list" format="fasta" label="${tool.name} on ${on_string} (fasta)"> - <discover_datasets pattern="(?P<designation>.+)\.fasta" ext="fasta" directory="output_dir"/> - </collection> - </outputs> - <tests> - <!-- The commented test here is valid if we could store the GTDB-Tk database --> - <!-- - <test expect_num_outputs="3"> - <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> - <param name="gtdbtk_db" value="gtdbtk202"/> - <output_collection name="output_tsv" type="list" count="6"> - <element name="gtdbtk.ar122.filtered" ftype="tsv"> - <assert_contents> - <has_size value="0"/> - </assert_contents> - </element> - <element name="gtdbtk.ar122.markers_summary" ftype="tsv"> - <assert_contents> - <has_text text="number_unique_genes"/> - </assert_contents> - </element> - <element name="gtdbtk.ar122.summary" ftype="tsv"> - <assert_contents> - <has_text text="genome_1_fna_gz"/> - </assert_contents> - </element> - <element name="gtdbtk.bac120.markers_summary" ftype="tsv"> - <assert_contents> - <has_text text="genome_1_fna_gz"/> - </assert_contents> - 
</element> - <element name="gtdbtk.failed_genomes" ftype="tsv"> - <assert_contents> - <has_size value="0"/> - </assert_contents> - </element> - <element name="gtdbtk.translation_table_summary" ftype="tsv"> - <assert_contents> - <has_text text="genome_1_fna_gz"/> - </assert_contents> - </element> - </output_collection> - <output_collection name="output_newick" type="list" count="1"> - <element name="gtdbtk.ar122.classify" ftype="newick"> - <assert_contents> - <has_text text="GB_GCA_"/> - </assert_contents> - </element> - </output_collection> - <output_collection name="output_fasta" type="list" count="2"> - <element name="gtdbtk.ar122.msa" ftype="fasta"> - <assert_contents> - <has_text text="GB_GCA_000008085"/> - </assert_contents> - </element> - <element name="gtdbtk.ar122.user_msa" ftype="fasta"> - <assert_contents> - <has_text text="genome_1_fna_gz"/> - </assert_contents> - </element> - </output_collection> - </test> - --> - <!-- GTDB-Tk databases are far too large to test currently --> - <test expect_failure="true"> - <param name="input" value="genome_1.fna.gz" ftype="fasta.gz"/> - <param name="gtdbtk_db" value="gtdbtk202"/> - <assert_stderr> - <has_text text="Fatal error: Exit code 1"/> - </assert_stderr> - </test> - </tests> - <help><![CDATA[ -**What it does** - -GTDB-Tk is a software toolkit for assigning objective taxonomic classifications to bacterial and archaeal genomes -based on the Genome Database Taxonomy GTDB. It is designed to work with recent advances that allow hundreds or -thousands of metagenome-assembled genomes (MAGs) to be obtained directly from environmental samples. It can also -be applied to isolate and single-cell genomes. - -This tool accepts one or more fasta (genome) files and determines taxonomic classification of genomes by -maximum-likelihood (ML) placement. The classification workflow consists of three steps: identify, align, and -classify. 
- -The identify step calls genes using Prodigal, and uses HMM models and the HMMER package to identify the 120 bacterial -and 122 archaeal marker genes used for phylogenetic inference. Multiple sequence alignments (MSA) are obtained by -aligning marker genes to their respective HMM model. - -The align step concatenates the aligned marker genes and filters the concatenated MSA to approximately 5,000 -amino acids. - -Finally, the classify step uses pplacer to find the maximum-likelihood placement of each genome in the GTDB-Tk -reference tree. GTDB-Tk classifies each genome based on its placement in the reference tree, its relative evolutionary -divergence, and/or average nucleotide identity (ANI) to reference genomes. - -Results can be impacted by a lack of marker genes or contamination. - ]]></help> - <expand macro="citations"/> -</tool>
--- a/macros.xml Mon Dec 19 07:03:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,15 +0,0 @@ -<macros> - <token name="@TOOL_VERSION@">2.1.1</token> - <token name="@VERSION_SUFFIX@">0</token> - <token name="@PROFILE@">20.09</token> - <xml name="requirements"> - <requirements> - <requirement type="package" version="@TOOL_VERSION@">gtdbtk</requirement> - </requirements> - </xml> - <xml name="citations"> - <citations> - <citation type="doi">10.1093/bioinformatics/btz848</citation> - </citations> - </xml> -</macros>
--- a/test-data/gtdbtk_database.loc Mon Dec 19 07:03:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,26 +0,0 @@ -# This is a sample file distributed with Galaxy that enables tools -# to use a directory of GTDB-Tk databases. The gtdbtk_database.loc -# file has this format (longer white space characters are TAB characters): -# -# <unique_build_id> <display_name> <directory_path> -# -# So, for example, if you have the gtdbtk 202 stored in -# /depot/data2/galaxy/gtdbtk/release202/, -# then the gtdbtk_database.loc entry would look like this: -# -# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202 -# -# and your /depot/data2/galaxy/gtdbtk/release202 directory -# would contain GTDB-Tk database files for release 202, something like this: -# -#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/ -#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv -#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/ -#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/ -gtdbtk202 GTDB-Tk database v202 ${__HERE__}/gtdbtk202
--- a/tool-data/gtdbtk_database.loc.sample Mon Dec 19 07:03:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -# This is a sample file distributed with Galaxy that enables tools -# to use a directory of GTDB-Tk databases. The gtdbtk_database.loc -# file has this format (longer white space characters are TAB characters): -# -# <unique_build_id> <display_name> <directory_path> -# -# So, for example, if you have the gtdbtk 202 stored in -# /depot/data2/galaxy/gtdbtk/release202/, -# then the gtdbtk_database.loc entry would look like this: -# -# release202 gtdbtk database release 202 /depot/data2/galaxy/gtdbtk/release202 -# -# and your /depot/data2/galaxy/gtdbtk/release202 directory -# would contain GTDB-Tk database files for release 202, something like this: -# -#drwxr-sr-x 3 gvk G-824019 4096 Apr 20 2021 fastani/ -#-rw-r--r-- 1 gvk G-824019 4810764 Apr 22 2021 manifest.tsv -#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 markers/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 masks/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 metadata/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 21 2021 mrca_red/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 msa/ -#drwxr-sr-x 4 gvk G-824019 4096 Apr 21 2021 pplacer/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 radii/ -#drwxr-sr-x 2 gvk G-824019 4096 Apr 20 2021 taxonomy/
--- a/tool_data_table_conf.xml.sample Mon Dec 19 07:03:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -<tables> - <!-- Locations of GTDB-Tk database versions 202 and higher --> - <table name="gtdbtk_database" comment_char="#"> - <columns>value, name, path</columns> - <file path="tool-data/gtdbtk_database.loc" /> - </table> -</tables>
--- a/tool_data_table_conf.xml.test Mon Dec 19 07:03:08 2022 +0000 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,7 +0,0 @@ -<tables> - <!-- Location of databases for gtdbtk version 202 and higher --> - <table name="gtdbtk_database" comment_char="#"> - <columns>value, name, path</columns> - <file path="${__HERE__}/test-data/gtdbtk_database.loc" /> - </table> -</tables>