Mercurial > repos > pimarin > bakta
diff bakta.xml @ 3:eea334d9988b draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/blob/master/tools/bakta commit 73af464cc860250c3fa3dd433602283ab5a44f53-dirty
author | pimarin |
---|---|
date | Thu, 22 Dec 2022 15:01:43 +0000 |
parents | ca9e2125c5de |
children | 591cae6ef29d |
line wrap: on
line diff
--- a/bakta.xml Wed Aug 17 10:29:37 2022 +0000 +++ b/bakta.xml Thu Dec 22 15:01:43 2022 +0000 @@ -1,8 +1,6 @@ -<?xml version="1.0" encoding="UTF-8"?> - -<tool id="bakta" name="Bakta genome annotation" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> +<tool id="bakta" name="Bakta" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> <description> - Bakta: rapid and standardized annotation of bacterial genomes via alignment-free sequence identification + genome annotation via alignment-free sequence identification </description> <macros> <import>macro.xml</import> @@ -13,6 +11,10 @@ <expand macro="version_command"/> <command detect_errors="aggressive"><![CDATA[ + mkdir ./database_path && + ln -s '$(input_option.bakta_db_select.fields.path)/'* database_path && + ln -s '$(input_option.amrfinder_db_select.fields.path)' database_path && + bakta #*====================================== CPU option @@ -21,7 +23,7 @@ #*====================================== Bakta database ======================================*# - --db $input_option.db_select.fields.path + --db ./database_path #if $input_option.min_contig_length --min-contig-length $input_option.min_contig_length #else if $annotation.compliant @@ -72,16 +74,9 @@ Workflow OPTIONS skip some step of the bakta analysis ======================================*# - $workflow.skip_trna - $workflow.skip_tmrna - $workflow.skip_rrna - $workflow.skip_ncrna - $workflow.skip_ncrna_region - $workflow.skip_crispr - $workflow.skip_cds - $workflow.skip_sorf - $workflow.skip_gap - $workflow.skip_ori + + #echo " ".join($workflow.skip_analysis) + #*====================================== Genome file ======================================*# @@ -89,29 +84,41 @@ #*====================================== LOG file ======================================*# - &> '$logfile' + | tee '$logfile' ]]></command> <inputs> <!-- DB and file INPUT --> <section name="input_option" title="Input/Output options" expanded="true"> - 
<param name="db_select" type="select" label="The bakta database"> + <param name="bakta_db_select" type="select" label="The bakta database"> <options from_data_table="bakta_database"> - <validator message="No bakta database is available" type="no_options"/> + <filter type="static_value" value="@BAKTA_VERSION@" column="bakta_version"/> + <column name="dbkey" index="2"/> + <validator message="No bakta database is available" type="no_options"/> </options> </param> + <param name="amrfinder_db_select" type="select" label="The amrfinderplus database"> + <options from_data_table="amrfinderplus_database"> + <validator message="No amrfinderplus database is available" type="no_options"/> + </options> + </param> + <param name="input_file" type="data" format="fasta,fasta.gz" label="Select genome in fasta format"/> <param name="min_contig_length" type="integer" optional="true" min="0" label="Minimum contig size" help="Minimum contig size (default = 1; 200 in compliant mode) (--min-contig-length)"/> </section> <!-- Organism INFORMATION OPTIONS --> <section name="organism" title="Optional organism options" expanded="false"> <param argument="--genus" type="text" optional="true" label="Specify genus name" help="ex. Escherichia"> - <validator type="regex">^[A-Z]</validator> + <validator type="regex">^[a-zA-Z]+$</validator> </param> - <param argument="--species" type="text" optional="true" label="Specify species name" help="ex. 'coli O157:H7'"/> + <param argument="--species" type="text" optional="true" label="Specify species name" help="ex. 'coli O157:H7'"> + <validator type="regex">^[a-zA-Z0-9\s(:\-/)]+$</validator> + </param> <param argument="--strain" type="text" optional="true" label="Specify strain name" help="ex. Sakai"> - <validator type="regex">^[A-Z]</validator> + <validator type="regex">^[a-zA-Z]+$</validator> </param> - <param argument="--plasmid" type="text" optional="true" label="Specify plasmid name" help="ex. 
pOSAK1"/> + <param argument="--plasmid" type="text" optional="true" label="Specify plasmid name" help="ex. pOSAK1"> + <validator type="regex">^[a-zA-Z0-9\s(:\-/)]+$</validator> + </param> </section> <!-- ANNOTATION --> <section name="annotation" title="Optional annotation"> @@ -127,357 +134,298 @@ <option value="?" selected="true">Unknown</option> </param> <param name="keep_contig_headers" type="boolean" truevalue="--keep-contig-headers" falsevalue="" label="Keep original contig header (--keep-contig-headers)"/> - <param argument="--replicons" type="data" format="tsv, csv" optional="true" label="Replicon information table (tsv/csv)" help=""/> + <param argument="--replicons" type="data" format="tsv,csv" optional="true" label="Replicon information table (tsv/csv)" help=""/> <param argument="--compliant" type="boolean" truevalue="--compliant" falsevalue="" label="Force Genbank/ENA/DDJB compliance"/> <param argument="--proteins" type="data" format="fasta" optional="true" label="Protein fasta file" help="Fasta file of trusted protein sequences for CDS annotation"/> </section> <!-- PARAMETER FOR WORKFLOW ANALYSIS --> <section name="workflow" title="Workflow option to skip steps"> - <param name="skip_trna" type="boolean" truevalue="--skip-trna" falsevalue="" label="Skip tRNA detection and annotation" help="(--skip-trna)"/> - <param name="skip_tmrna" type="boolean" truevalue="--skip-tmrna" falsevalue="" label="Skip tmRNA detection and annotation" help="(--skip-tmrna)"/> - <param name="skip_rrna" type="boolean" truevalue="--skip-rrna" falsevalue="" label=" Skip rRNA detection and annotation" help="(--skip-rrna)"/> - <param name="skip_ncrna" type="boolean" truevalue="--skip-ncrna" falsevalue="" label=" Skip ncRNA detection and annotation" help="(--skip-ncrna)"/> - <param name="skip_ncrna_region" type="boolean" truevalue="--skip-ncrna-region" falsevalue="" label="Skip ncRNA region detection and annotation" help="(--skip-ncrna-region)"/> - <param name="skip_crispr" 
type="boolean" truevalue="--skip-crispr" falsevalue="" label="Skip CRISPR array detection and annotation" help="(--skip-crispr)"/> - <param name="skip_cds" type="boolean" truevalue="--skip-cds" falsevalue="" label="Skip CDS detection and annotation" help="(--skip-cds)"/> - <param name="skip_sorf" type="boolean" truevalue="--skip-sorf" falsevalue="" label="Skip sORF detection and annotation" help="(--skip-sorf)"/> - <param name="skip_gap" type="boolean" truevalue="--skip-gap" falsevalue="" label="Skip gap detection and annotation" help="(--skip-gap)"/> - <param name="skip_ori" type="boolean" truevalue="--skip-ori" falsevalue="" label="Skip oriC/oriT detection and annotation" help="(--skip_ori)"/> + <param name="skip_analysis" type="select" display="checkboxes" multiple="true" label="Select steps to skip"> + <option value="--skip-trna"> Skip tRNA detection and annotation </option> + <option value="--skip-tmrna"> Skip tmRNA detection and annotation </option> + <option value="--skip-rrna"> Skip rRNA detection and annotation </option> + <option value="--skip-ncrna"> Skip ncRNA detection and annotation </option> + <option value="--skip-ncrna-region"> Skip ncRNA region detection and annotation </option> + <option value="--skip-crispr"> Skip CRISPR array detection and annotation </option> + <option value="--skip-cds"> Skip CDS detection and annotation </option> + <option value="--skip-pseudo"> Skip pseudogene detection and annotation </option> + <option value="--skip-sorf"> Skip sORF detection and annotation </option> + <option value="--skip-gap"> Skip gap detection and annotation </option> + <option value="--skip-ori"> Skip oriC/oriT detection and annotation </option> + </param> </section> + <section name="output_files" title="Selection of the output files"> + <param name="output_selection" type="select" display="checkboxes" multiple="true" label="Output files selection"> + <option value="file_tsv" selected="true"> Annotation file in TSV </option> + <option 
value="file_gff3" selected="true"> Annotation and sequence in GFF3 </option> + <option value="file_gbff" selected="false"> Annotations and sequences in GenBank format </option> + <option value="file_embl" selected="false"> Annotations and sequences in EMBL format </option> + <option value="file_fna" selected="false"> Replicon/contig DNA sequences as FASTA </option> + <option value="file_ffn" selected="true"> Feature nucleotide sequences as FASTA </option> + <option value="file_faa" selected="false"> CDS/sORF amino acid sequences as FASTA </option> + <option value="hypo_tsv" selected="false"> Hypothetical protein CDS in TSV</option> + <option value="hypo_fa" selected="false"> Hypothetical protein CDS amino sequences as FASTA</option> + <option value="sum_txt" selected="false"> Summary as TXT</option> + <option value="file_json" selected="false"> Information on each annotated feature as JSON </option> + <option value="file_plot" selected="true"> Plot of the annotation result as SVG </option> + <option value="log_txt" selected="false"> Log file as TXT </option> + </param> + </section> + </inputs> <outputs> - <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file"/> - <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output.tsv" label="${tool.name} on ${on_string}: bakta_output.tsv"/> - <data name="annotation_gff3" format="tabular" from_work_dir="bakta_output.gff3" label="${tool.name} on ${on_string}: bakta_output.gff3"/> - <data name="annotation_gbff" format="tabular" from_work_dir="bakta_output.gbff" label="${tool.name} on ${on_string}: bakta_output.gbff"/> - <data name="annotation_embl" format="tabular" from_work_dir="bakta_output.embl" label="${tool.name} on ${on_string}: bakta_output.embl"/> - <data name="annotation_fna" format="fasta" from_work_dir="bakta_output.fna" label="${tool.name} on ${on_string}: bakta_output.fna"/> - <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output.ffn" label="${tool.name} on 
${on_string}: bakta_output.ffn"/> - <data name="annotation_faa" format="fasta" from_work_dir="bakta_output.faa" label="${tool.name} on ${on_string}: bakta_output.faa"/> - <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: bakta_output.hypotheticals.tsv"> - <filter>workflow['skip_cds'] == False</filter> + <data name="annotation_tsv" format="tabular" from_work_dir="bakta_output.tsv" label="${tool.name} on ${on_string}: annotation_summary"> + <filter> output_files['output_selection'] and "file_tsv" in output_files['output_selection'] </filter> + </data> + <data name="annotation_gff3" format="gff3" from_work_dir="bakta_output.gff3" label="${tool.name} on ${on_string}: Annotation_and_sequences"> + <filter> output_files['output_selection'] and "file_gff3" in output_files['output_selection'] </filter> + </data> + <data name="annotation_gbff" format="tabular" from_work_dir="bakta_output.gbff" label="${tool.name} on ${on_string}: bakta_output.gbff"> + <filter> output_files['output_selection'] and "file_gbff" in output_files['output_selection'] </filter> + </data> + <data name="annotation_embl" format="tabular" from_work_dir="bakta_output.embl" label="${tool.name} on ${on_string}: bakta_output.embl"> + <filter> output_files['output_selection'] and "file_embl" in output_files['output_selection'] </filter> + </data> + <data name="annotation_fna" format="fasta" from_work_dir="bakta_output.fna" label="${tool.name} on ${on_string}: Contig_sequences"> + <filter> output_files['output_selection'] and "file_fna" in output_files['output_selection'] </filter> + </data> + <data name="annotation_ffn" format="fasta" from_work_dir="bakta_output.ffn" label="${tool.name} on ${on_string}: Nucleotide_sequences"> + <filter> output_files['output_selection'] and "file_ffn" in output_files['output_selection'] </filter> </data> - <data name="hypotheticals_faa" format="fasta" 
from_work_dir="bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: bakta_output.hypotheticals.faa"> - <filter>workflow['skip_cds'] == False</filter> + <data name="annotation_faa" format="fasta" from_work_dir="bakta_output.faa" label="${tool.name} on ${on_string}: Amino_acid_sequences"> + <filter> output_files['output_selection'] and "file_faa" in output_files['output_selection'] </filter> + </data> + <data name="hypotheticals_tsv" format="tabular" from_work_dir="bakta_output.hypotheticals.tsv" label="${tool.name} on ${on_string}: hypothetical_annotation_summary"> + <filter> output_files['output_selection'] and "hypo_tsv" in output_files['output_selection'] </filter> + </data> + <data name="hypotheticals_faa" format="fasta" from_work_dir="bakta_output.hypotheticals.faa" label="${tool.name} on ${on_string}: hypothetical_amino_acid_sequences"> + <filter> output_files['output_selection'] and "hypo_fa" in output_files['output_selection'] </filter> </data> - <data name="summary_txt" format="txt" from_work_dir="bakta_output.txt" label="${tool.name} on ${on_string}: bakta_output.txt"/> - <data name="annotation_json" format="json" from_work_dir="bakta_output.json" label="${tool.name} on ${on_string}: bakta_output.json"/> + <data name="summary_txt" format="txt" from_work_dir="bakta_output.txt" label="${tool.name} on ${on_string}: Analysis_summary"> + <filter> output_files['output_selection'] and "sum_txt" in output_files['output_selection'] </filter> + </data> + <data name="annotation_json" format="json" from_work_dir="bakta_output.json" label="${tool.name} on ${on_string}: annotation_machine_readable"> + <filter> output_files['output_selection'] and "file_json" in output_files['output_selection'] </filter> + </data> + <data name="annotation_plot" format="svg" from_work_dir="bakta_output.svg" label="${tool.name} on ${on_string}: Plot of the annotation"> + <filter> output_files['output_selection'] and "file_plot" in output_files['output_selection'] </filter> 
+ </data> + <data name="logfile" format="txt" label="${tool.name} on ${on_string}: log file"> + <filter> output_files['output_selection'] and "log_txt" in output_files['output_selection'] </filter> + </data> </outputs> - <tests> - <test expect_num_outputs="12"> <!-- TEST_1 database + input --> - <section name="input_option" > - <param name="db_select" value="test-db-bakta"/> - <param name="input_file" value="NC_002127.1.fna"/> - </section> - <output name="logfile" value="TEST_1/TEST_1.log" lines_diff="4"> - <assert_contents> - <has_text_matching n="1" expression="Genome size: 3,306 bp"/> - <has_n_lines n="90" delta="1"/> - </assert_contents> - </output> - <output name="annotation_tsv" value="TEST_1/TEST_1.tsv" lines_diff="2"> - <assert_contents> - <has_text_matching n="3" expression="contig_1"/> - <has_n_lines n="6" delta="1"/> - </assert_contents> - </output> - <output name="annotation_gff3" value="TEST_1/TEST_1.gff3" lines_diff="2"> - <assert_contents> - <has_text_matching expression="AGCTATTCCTGGTTTCATATGAAACAAACCATGCCTGTTCTCATGCCAGTAAGTGTAGCA"/> - <has_n_lines n="70" delta="1"/> - </assert_contents> - </output> - <output name="annotation_gbff" value="TEST_1/TEST_1.gbff" lines_diff="4"> - <assert_contents> - <has_text_matching expression="SSASSCSFSHMVACSSASSASSFSSSVRLWLFMNPAMLSAVCCCL"/> - <has_n_lines n="133" delta="1"/> - </assert_contents> - </output> - <output name="annotation_embl" value="TEST_1/TEST_1.embl" lines_diff="2"> - <assert_contents> - <has_text_matching expression="FIFLFSPFCLSSASCDYIAHHFSTVLPPVFCRRTFQSDNTVTAKKQQCFVGNSNLQTGQ"/> - <has_n_lines n="137" delta="2"/> - </assert_contents> - </output> - <output name="annotation_fna" value="TEST_1/TEST_1.fna"> - <assert_contents> - <has_text_matching expression="TTCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGC"/> - <has_n_lines n="57"/> - </assert_contents> - </output> - <output name="annotation_ffn" value="TEST_1/TEST_1.ffn"> - <assert_contents> - <has_text_matching 
expression="TCTTCTGCGAGTTCGTGCAGCTTCTCACACATGGTGGCCTGCTCGTCAGCATCGAGTGCGTCCAGTTTTTCGAGC"/> - <has_n_lines n="6"/> - </assert_contents> - </output> - <output name="annotation_faa" value="TEST_1/TEST_1.faa"> - <assert_contents> - <has_text_matching expression="MKKDKKYQIEAIKNKDKTLFIVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSFNGEKFSSYTLNKVTKTDEYNDLSELSASFFKKNFDKINVNLLSKATSFALKKGIPI"/> - <has_n_lines n="6"/> - </assert_contents> - </output> - <output name="hypotheticals_tsv" value="TEST_1/TEST_1.hypotheticals.tsv"> - <assert_contents> - <has_text_matching expression="DOGAIA_00010"/> - <has_n_lines n="6"/> - </assert_contents> - </output> - <output name="hypotheticals_faa" value="TEST_1/TEST_1.hypotheticals.faa"> - <assert_contents> - <has_text_matching expression="SSASSCSFSHMVACSSASSASSFSSSVRLWLFMNPAMLSAVCCCLFIFLFSPFCLSSASCDYIAHHFSTVLPPVFCRRTF"/> - <has_n_lines n="6"/> - </assert_contents> - </output> - <output name="summary_txt" value="TEST_1/TEST_1.txt"> - <assert_contents> - <has_text_matching expression="N50: 3306"/> - <has_n_lines n="29"/> - </assert_contents> + <test expect_num_outputs="13"> <!-- TEST_1 database + input --> + <section name="input_option" > + <param name="bakta_db_select" value="V0.1_2022-08-29"/> + <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> + <param name="input_file" value="NC_002127.1.fna"/> + <param name="min_contig_length" value="250"/> + </section> + <section name="output_files"> + <param name="output_selection" value="file_tsv,file_gff3,file_gbff,file_embl,file_fna,file_ffn,file_faa,hypo_tsv,hypo_fa,sum_txt,file_json,file_plot,log_txt"/> + </section> + <output name="annotation_tsv" value="TEST_1/TEST_1.tsv" lines_diff="2"/> + <output name="annotation_gff3" value="TEST_1/TEST_1.gff3" lines_diff="2"/> + <output name="annotation_gbff" value="TEST_1/TEST_1.gbff" lines_diff="8"/> + <output name="annotation_embl" value="TEST_1/TEST_1.embl" lines_diff="6"/> + <output name="annotation_fna" value="TEST_1/TEST_1.fna"/> + <output 
name="annotation_ffn" value="TEST_1/TEST_1.ffn"/> + <output name="annotation_faa" value="TEST_1/TEST_1.faa"/> + <output name="hypotheticals_tsv" value="TEST_1/TEST_1.hypotheticals.tsv" lines_diff="4"/> + <output name="hypotheticals_faa" value="TEST_1/TEST_1.hypotheticals.faa"/> + <output name="summary_txt" value="TEST_1/TEST_1.txt" lines_diff="4"/> + <output name="annotation_plot"> + <assert_contents> + <has_size value="418991" delta="1000"/> + </assert_contents> </output> - <output name="annotation_json" value="TEST_1/TEST_1.json" lines_diff="4"> - <assert_contents> - <has_text_matching expression="0.6524500907441017"/> - <has_n_lines n="112" delta="1"/> - </assert_contents> - </output> - </test> - <test expect_num_outputs="12"> <!-- TEST_2 another input, add organism info some annotations and skip 2 steps --> - <section name="input_option" > - <param name="db_select" value="test-db-bakta"/> - <param name="input_file" value="NC_002127.1.fna"/> - <param name="min_contig_length" value="250"/> - </section> - <section name="organism"> - <param name="genus" value="Escherichia"/> - <param name="species" value="coli O157:H7"/> - <param name="strain" value="Sakai"/> - <param name="plasmid" value="pOSAK1"/> - </section> - <section name="annotation"> - <param name="--gram" value="-"/> - <param name="keep_contig_headers" value="true"/> - </section> - <section name="workflow"> - <param name="skip_crispr" value="true"/> - <param name="skip_gap" value="true"/> - </section> - <output name="logfile" value="TEST_2/TEST_2.log" lines_diff="4"> - <assert_contents> - <has_text_matching expression="Genome size: 3,306 bp"/> - </assert_contents> - </output> - <output name="annotation_tsv" value="TEST_2/TEST_2.tsv" lines_diff="2"> - <assert_contents> - <has_text_matching expression="DOGAIA_00005"/> - </assert_contents> - </output> - <output name="annotation_gff3" value="TEST_2/TEST_2.gff3" lines_diff="2"> - <assert_contents> - <has_text_matching 
expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/> - </assert_contents> - </output> - <output name="annotation_gbff" value="TEST_2/TEST_2.gbff" lines_diff="5"> - <assert_contents> - <has_text_matching expression="SSASSCSFSHMVACSSASSASSFSSSV"/> - </assert_contents> - </output> - <output name="annotation_embl" value="TEST_2/TEST_2.embl" lines_diff="4"> - <assert_contents> - <has_text_matching expression="MKKDKKYQIEAIKNKDKTLFIVYATDIYSPSEFFSKIESDLKKKK"/> - </assert_contents> - </output> - <output name="annotation_fna" value="TEST_2/TEST_2.fna"/> - <output name="annotation_ffn" value="TEST_2/TEST_2.ffn"/> - <output name="annotation_faa" value="TEST_2/TEST_2.faa"/> - <output name="hypotheticals_tsv" value="TEST_2/TEST_2.hypotheticals.tsv"/> - <output name="hypotheticals_faa" value="TEST_2/TEST_2.hypotheticals.faa"/> - <output name="summary_txt" value="TEST_2/TEST_2.txt"> - <assert_contents> - <has_text_matching expression="N50: 3306"/> - </assert_contents> - </output> - <output name="annotation_json" value="TEST_2/TEST_2.json" lines_diff="4"> - <assert_contents> - <has_text_matching expression="0.6524500907441017"/> - </assert_contents> - </output> - </test> - <test expect_num_outputs="10"> <!-- TEST_3 test all skip steps --> - <section name="input_option" > - <param name="db_select" value="test-db-bakta"/> - <param name="input_file" value="NC_002127.1.fna"/> - <param name="min_contig_length" value="250"/> - </section> - <section name="workflow"> - <param name="skip_trna" value="true"/> - <param name="skip_tmrna" value="true"/> - <param name="skip_rrna" value="true"/> - <param name="skip_ncrna" value="true"/> - <param name="skip_ncrna_region" value="true"/> - <param name="skip_crispr" value="true"/> - <param name="skip_cds" value="true"/> - <param name="skip_sorf" value="true"/> - <param name="skip_gap" value="true"/> - <param name="skip_ori" value="true"/> - </section> - <output name="logfile" value="TEST_3/TEST_3.log" lines_diff="4"> - <assert_contents> - 
<has_text_matching expression="Genome size: 3,306 bp"/> - </assert_contents> - </output> - <output name="annotation_tsv" value="TEST_3/TEST_3.tsv" lines_diff="1"> - <assert_contents> - <has_n_lines n="3" delta="1"/> - </assert_contents> - </output> - <output name="annotation_gff3" value="TEST_3/TEST_3.gff3" lines_diff="2"> - <assert_contents> - <has_n_lines n="67" delta="1"/> - </assert_contents> - </output> - <output name="annotation_gbff" value="TEST_3/TEST_3.gbff" lines_diff="10"/> - <output name="annotation_embl" value="TEST_3/TEST_3.embl" lines_diff="4"/> - <output name="annotation_fna" value="TEST_3/TEST_3.fna"/> - <output name="annotation_ffn" value="TEST_3/TEST_3.ffn"/> - <output name="annotation_faa" value="TEST_3/TEST_3.faa"/> - <output name="summary_txt" value="TEST_3/TEST_3.txt"> - <assert_contents> - <has_text_matching expression="GC: 43.4"/> - </assert_contents> - </output> - <output name="annotation_json" value="TEST_3/TEST_3.json" lines_diff="4"/> + + <output name="annotation_json" value="TEST_1/TEST_1.json" lines_diff="6"/> + <output name="logfile" value="TEST_1/TEST_1.log" lines_diff="6"/> </test> - <test expect_num_outputs="12"> <!-- TEST_4 annotations --> + <test expect_num_outputs="4"> <!-- TEST_2 another input, add organism info some annotations and skip 2 steps --> <section name="input_option" > - <param name="db_select" value="test-db-bakta"/> + <param name="bakta_db_select" value="V0.1_2022-08-29"/> + <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> <param name="input_file" value="NC_002127.1.fna"/> + <param name="min_contig_length" value="250"/> + </section> + <section name="organism"> + <param name="genus" value="Escherichia"/> + <param name="species" value="coli O157:H7"/> + <param name="strain" value="Sakai"/> + <param name="plasmid" value="pOSAK1"/> </section> <section name="annotation"> - <param name="complete" value="true"/> - <param name="translation_table" value="4"/> - <param name="prodigal" value="prodigal.tf"/> - 
<param name="replicons" value="replicons.tsv"/> - <param name="compliant" value="true"/> - <param name="proteins" value="user-proteins.faa"/> + <param name="--gram" value="-"/> + <param name="keep_contig_headers" value="true"/> </section> - <output name="logfile" value="TEST_4/TEST_4.log" lines_diff="4"> + <section name="workflow"> + <param name="skip_analysis" value="--skip-trna,--skip-tmrna"/> + </section> + <output name="annotation_tsv" value="TEST_2/TEST_2.tsv" lines_diff="4"> <assert_contents> - <has_text_matching expression="Genome size: 3,306 bp"/> - </assert_contents> - </output> - <output name="annotation_tsv" value="TEST_4/TEST_4.tsv" lines_diff="2"> - <assert_contents> - <has_text_matching expression="mock1"/> - </assert_contents> - </output> - <output name="annotation_gff3" value="TEST_4/TEST_4.gff3" lines_diff="2"> - <assert_contents> - <has_text_matching expression="ID=DOGAIA_00005_gene;locus_tag=DOGAIA_00005"/> + <has_text_matching expression="IHHALP_00005"/> </assert_contents> </output> - <output name="annotation_gbff" value="TEST_4/TEST_4.gbff" lines_diff="4"> + <output name="annotation_gff3" value="TEST_2/TEST_2.gff3" lines_diff="4"> <assert_contents> - <has_text_matching expression="SSASSCSFSHMVACSSASSASSFSSSVRLWLFMNPAMLSAVCCCL"/> - </assert_contents> - </output> - <output name="annotation_embl" value="TEST_4/TEST_4.embl" lines_diff="4"> - <assert_contents> - <has_text_matching expression="MKKDKKYQIEAIKNKDKTLFIVYATDIYSPSEFFSKIESDLKKKK"/> + <has_text_matching expression="ID=NC_002127.1;Name=NC_002127.1;Is_circular=true"/> </assert_contents> </output> - <output name="annotation_fna" value="TEST_4/TEST_4.fna"/> - <output name="annotation_ffn" value="TEST_4/TEST_4.ffn"/> - <output name="annotation_faa" value="TEST_4/TEST_4.faa"/> - <output name="hypotheticals_tsv" value="TEST_4/TEST_4.hypotheticals.tsv"/> - <output name="hypotheticals_faa" value="TEST_4/TEST_4.hypotheticals.faa"/> - <output name="summary_txt" value="TEST_4/TEST_4.txt"> + <output 
name="annotation_ffn" value="TEST_2/TEST_2.ffn"/> + <output name="annotation_plot"> <assert_contents> - <has_text_matching expression="CDSs: 3"/> - </assert_contents> - </output> - <output name="annotation_json" value="TEST_4/TEST_4.json" lines_diff="4"> - <assert_contents> - <has_text_matching expression="0.4340592861464005"/> + <has_size value="418991" delta="1000"/> </assert_contents> </output> </test> + <test expect_num_outputs="4"> <!-- TEST_3 test all skip steps --> + <section name="input_option" > + <param name="bakta_db_select" value="V0.1_2022-08-29"/> + <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> + <param name="input_file" value="NC_002127.1.fna"/> + <param name="min_contig_length" value="350"/> + </section> + <section name="workflow"> + <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori"/> + </section> + <output name="annotation_tsv" value="TEST_3/TEST_3.tsv" lines_diff="4"/> + <output name="annotation_gff3" value="TEST_3/TEST_3.gff3" lines_diff="4"/> + <output name="annotation_ffn" value="TEST_3/TEST_3.ffn"/> + <output name="annotation_plot"> + <assert_contents> + <has_size value="418399" delta="1000"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="4"> <!-- TEST_4 annotations --> + <section name="input_option" > + <param name="bakta_db_select" value="V0.1_2022-08-29"/> + <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> + <param name="input_file" value="NC_002127.1.fna"/> + </section> + <section name="annotation"> + <param name="complete" value="true"/> + <param name="prodigal" value="prodigal.tf"/> + <param name="translation_table" value="4"/> + <param name="replicons" value="replicons.tsv"/> + <param name="compliant" value="true"/> + <param name="proteins" value="user-proteins.faa"/> + </section> + <output name="annotation_tsv" value="TEST_4/TEST_4.tsv" lines_diff="4"/> + <output 
name="annotation_gff3" value="TEST_4/TEST_4.gff3" lines_diff="4"/> + <output name="annotation_ffn" value="TEST_4/TEST_4.ffn"/> + <output name="annotation_plot"> + <assert_contents> + <has_size value="418399" delta="1000"/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="2"> <!-- TEST_5 skip all steps and keep only the logfile and summary --> + <section name="input_option" > + <param name="bakta_db_select" value="V0.1_2022-08-29"/> + <param name="amrfinder_db_select" value="V3.6-2020-03-20.1"/> + <param name="input_file" value="NC_002127.1.fna"/> + </section> + <section name="annotation"> + <param name="complete" value="true"/> + <param name="translation_table" value="4"/> + </section> + <section name="workflow"> + <param name="skip_analysis" value="--skip-trna,--skip-tmrna,--skip-rrna,--skip-ncrna,--skip-ncrna-region,--skip-crispr,--skip-cds,--skip-sorf,--skip-gap,--skip-ori"/> + </section> + <section name="output_files"> + <param name="output_selection" value="log_txt,sum_txt"/> + </section> + <output name="logfile" value="TEST_5/TEST_5.log" lines_diff="6"/> + <output name="summary_txt" value="TEST_5/TEST_5.txt" lines_diff="4"/> + </test> </tests> - - <help><![CDATA[ - usage: bakta [--db DB] [--min-contig-length MIN_CONTIG_LENGTH] - [--prefix PREFIX] [--output OUTPUT] [--genus GENUS] - [--species SPECIES] [--strain STRAIN] [--plasmid PLASMID] - [--complete] [--prodigal-tf PRODIGAL_TF] - [--translation-table {11,4}] [--gram {+,-,?}] [--locus LOCUS] - [--locus-tag LOCUS_TAG] [--keep-contig-headers] - [--replicons REPLICONS] [--compliant] [--proteins PROTEINS] - [--skip-trna] [--skip-tmrna] [--skip-rrna] [--skip-ncrna] - [--skip-ncrna-region] [--skip-crispr] [--skip-cds] [--skip-sorf] - [--skip-gap] [--skip-ori] [--help] [--verbose] - [--threads THREADS] [--tmp-dir TMP_DIR] [--version] - <genome> - -Rapid & standardized annotation of bacterial genomes, MAGs & plasmids + <help><![CDATA[**What it does** + Bakta is a tool for the rapid & 
standardized annotation of bacterial genomes and plasmids from both isolates and MAGs. -positional arguments: - <genome> Genome sequences in (zipped) fasta format - -Input / Output: - --db DB, -d DB Database path (default = <bakta_path>/db). Can also be - provided as BAKTA_DB environment variable. - --min-contig-length MIN_CONTIG_LENGTH, -m MIN_CONTIG_LENGTH - Minimum contig size (default = 1; 200 in compliant - mode) - --prefix PREFIX, -p PREFIX - Prefix for output files - --output OUTPUT, -o OUTPUT - Output directory (default = current working directory) - -Organism: - --genus GENUS Genus name - --species SPECIES Species name - --strain STRAIN Strain name - --plasmid PLASMID Plasmid name + *Comprehensive & taxonomy-independent database* + Bakta provides a large and taxonomy-independent database using UniProt's entire UniRef protein sequence cluster universe. -Annotation: - --complete All sequences are complete replicons (chromosome/plasmid[s]) - --prodigal-tf PRODIGAL_TF Path to existing Prodigal training file to use for CDS prediction + *Protein sequence identification* + Bakta exactly identifies known identical protein sequences (IPS) from RefSeq and UniProt + allowing the fine-grained annotation of gene alleles (AMR) or closely related but distinct protein families. + This is achieved via an alignment-free sequence identification (AFSI) approach + using full-length MD5 protein sequence hash digests. + *Small proteins/short open reading frames* + Bakta detects and annotates small proteins/short open reading frames (sORF). + + *Expert annotation systems* + To provide high quality annotations for certain proteins of higher interest, e.g. AMR & VF genes, + Bakta includes & merges different expert annotation systems. 
+ Currently, Bakta uses NCBI's AMRFinderPlus for AMR gene annotations + as well as a generalized protein sequence expert system with distinct + coverage, identity and priority values for each sequence, currently comprising the VFDB as well as NCBI's BlastRules. + + *Comprehensive workflow* + Bakta annotates ncRNA cis-regulatory regions, oriC/oriV/oriT + and assembly gaps as well as standard feature types: tRNA, tmRNA, rRNA, ncRNA genes, CRISPR, CDS. - --translation-table {11,4} Translation table: 11/4 (default = 11) - --gram {+,-,?} Gram type for signal peptide predictions: +/-/? (default = ?) - --locus LOCUS Locus prefix (default = 'contig') - --locus-tag LOCUS_TAG Locus tag prefix (default = autogenerated) - --keep-contig-headers Keep original contig headers - --replicons REPLICONS Replicon information table (tsv/csv) - --compliant Force Genbank/ENA/DDJB compliance - --proteins PROTEINS Fasta file of trusted protein sequences for CDS annotation + *GFF3 & INSDC conform annotations* + Bakta writes GFF3 and INSDC-compliant (Genbank & EMBL) annotation files ready for submission + (checked via GenomeTools GFF3Validator, table2asn_GFF and ENA Webin-CLI for GFF3 and EMBL file formats, + respectively for representative genomes of all ESKAPE species). + + *Bacteria & plasmids* + Bakta was designed to annotate bacteria (isolates & MAGs) and plasmids, only. + **Input options** + 1. Choose a genome or assembly in fasta format to use bakta annotations + 2. 
Choose a version of the Bakta database + + **Organism options** + You can specify information about the analysed fasta as text input for: + - genus + - species + - strain + - plasmid -Workflow: - --skip-trna Skip tRNA detection & annotation - --skip-tmrna Skip tmRNA detection & annotation - --skip-rrna Skip rRNA detection & annotation - --skip-ncrna Skip ncRNA detection & annotation - --skip-ncrna-region Skip ncRNA region detection & annotation - --skip-crispr Skip CRISPR array detection & annotation - --skip-cds Skip CDS detection & annotation - --skip-sorf Skip sORF detection & annotation - --skip-gap Skip gap detection & annotation - --skip-ori Skip oriC/oriT detection & annotation + **Annotation options** + 1. You can specify if all sequences (chromosome or plasmids) are complete or not + 2. You can add your own prodigal training file for CDS prediction + 3. The translation table can be modified; the default is 11 for bacteria + 4. You can specify if bacteria is gram -/+ or unknown (default value: unknown) + 5. You can keep the name of contig present in the input file + 6. You can specify your own replicon table as a TSV/CSV file + 7. The compliance option is for ready-to-submit annotation files for public databases + such as ENA, Genbank, EMBL + 8. You can specify a protein sequence file for annotation in GenBank or fasta formats + Using the Fasta format, each reference sequence can be provided in a short or long format: + + # short: + >id gene~~~product~~~dbxrefs + MAQ... + + # long: + >id min_identity~~~min_query_cov~~~min_subject_cov~~~gene~~~product~~~dbxrefs + MAQ... 
-General: - --help, -h Show this help message and exit - --verbose, -v Print verbose information - --threads THREADS, -t THREADS - Number of threads to use (default = number of - available CPUs) - --tmp-dir TMP_DIR Location for temporary files (default = system - dependent auto detection) - --version show program's version number and exit + **Skip steps** + Some steps can be skipped: + - skip-trna Skip tRNA detection & annotation + - skip-tmrna Skip tmRNA detection & annotation + - skip-rrna Skip rRNA detection & annotation + - skip-ncrna Skip ncRNA detection & annotation + - skip-ncrna-region Skip ncRNA region detection & annotation + - skip-crispr Skip CRISPR array detection & annotation + - skip-cds Skip CDS detection & annotation + - skip-pseudo Skip pseudogene detection & annotation + - skip-sorf Skip sORF detection & annotation + - skip-gap Skip gap detection & annotation + - skip-ori Skip oriC/oriT detection & annotation - + **Output options** + Bakta produces a number of output files; you can select which types of file you want: + - Summary of the annotation + - Annotated files + - Sequence files for nucleotides and/or amino acids ]]></help> <expand macro="citations"/> </tool>