Mercurial > repos > galaxy-australia > panaroo
diff panaroo.xml @ 0:50483f852947 draft
planemo upload for repository https://github.com/usegalaxy-au/tools-au/tree/master/tools/panaroo commit 0abf1ced6a6d8f85ed4fc244de2bc337f889aca2-dirty
| author | galaxy-australia |
|---|---|
| date | Fri, 11 Apr 2025 07:46:04 +0000 |
| parents | |
| children | b6a78d286482 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/panaroo.xml Fri Apr 11 07:46:04 2025 +0000 @@ -0,0 +1,469 @@ +<tool id="panaroo" name="Panaroo" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@"> + <description>A Bacterial Pangenome Analysis Pipeline</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="edam_ontology"/> + <expand macro="biotools"/> + <expand macro="requirements"/> + <stdio> + <exit_code range="1:" /> + <regex match="System..*Exception" + source="both" + level="fatal" + description="Error encountered" /> + </stdio> + <command><![CDATA[ + + mkdir outdir && + + #import re + #set input_directory = 'input_directory' + mkdir $input_directory && + #for $gff in $gff_input_collection: + #set identifier = re.sub('[^\s\w\-\\.]','_',str($gff.element_identifier)) + ln -fs '$gff' '$input_directory/$identifier' && + #end for + + panaroo + -t \${GALAXY_SLOTS:-2} + #if str($gen_code) != 'None': + --codon-table $gen_code + #end if + #if str($advanced.adv_options_selector) == "set": + #if $advanced.remove_invalid_gene + $advanced.remove_invalid_gene + #end if + -c '$advanced.matching_option.seq_threshold' + -f '$advanced.matching_option.peptide_threshold' + --len_dif_percent '$advanced.matching_option.length_diff_cutoff' + $advanced.matching_option.merge_paralogs + --search_radius '$advanced.refind_option.search_radius' + --refind_prop_match '$advanced.refind_option.refind_prop_match' + --refind-mode '$advanced.refind_option.refind_mode' + --min_trailing_support '$advanced.graph_correction_option.min_trailing_support' + --trailing_recursive '$advanced.graph_correction_option.trailing_recursive' + --edge_support_threshold '$advanced.graph_correction_option.edge_support_threshold' + --remove_by_consensus '$advanced.graph_correction_option.remove_by_consensus' + --high_var_flag '$advanced.graph_correction_option.high_var_flag' + --min_edge_support_sv '$advanced.graph_correction_option.min_edge_support_sv' + $advanced.graph_correction_option.all_seq_in_graph + $advanced.graph_correction_option.no_clean_edges + + #if $advanced.gene_alignment_option.a != 'None' + -a '$advanced.gene_alignment_option.a' + #end if + + #if '$advanced.gene_alignment_option.aligner' == 'mafft' + --aligner mafft + #else + --aligner '$advanced.gene_alignment_option.aligner' + #end if + #if $advanced.gene_alignment_option.core_subset != '' + --core_subset $advanced.gene_alignment_option.core_subset + #end if + #end if + -i $input_directory/*.gff + -o outdir + --clean-mode $mode + > '$log' && + mv outdir/gene_presence_absence.Rtab outdir/gene_presence_absence_rtab.Rtab && + mv outdir/combined_protein_cdhit_out.txt outdir/combined_protein_cdhit_out.fa && + 2>&1 + + ]]></command> + <inputs> + <param name="gff_input_collection" type="data_collection" format="gff" collection_type="list" label="GFF Input Collection" help="A list of gff files (i.e prokka)"/> + <param name="mode" type="select" label="The stringency mode at which to run panaroo" help="--clean-mode"> + <expand macro="clean_mode"/> + </param> + <param name="gen_code" type="select" label="the codon table user for translation" help="default: 11"> + <expand macro="genetic_code"/> + </param> + <conditional name="advanced"> + <param name="adv_options_selector" type="select" label="Set advanced options?" help="Provides additional controls"> + <option value="set">Set</option> + <option value="do_not_set" selected="True">Do not set</option> + </param> + <when value="set"> + <param argument="--remove-invalid-genes" name="remove_invalid_gene" type="boolean" truevalue="--remove-invalid-genes" falsevalue="" label="removes annotations that do not conform to the expected Prokka format such as those including premature stop codons" help="--remove-invalid-genes"/> + + <section name="matching_option" title="Matching" expanded="false"> + <param argument="--threshold" name="seq_threshold" type="float" value="0.98" label="sequence identity threshold" help="default: 0.98"/> + <param argument="--family_threshold" name="peptide_threshold" type="float" value="0.7" label="protein family sequence identity threshold" help="default: 0.7"/> + <param argument="--len_dif_percent" name="length_diff_cutoff" type="float" value="0.98" label="length difference cutoff" help="default: 0.98"/> + <param name="merge_paralogs" type="boolean" truevalue="--merge_paralogs" falsevalue="" checked="false" label="do not split paralogs" help="--merge_paralogs"/> + </section> + + <section name="refind_option" title="Refind" expanded="false"> + <param argument="--search_radius" type="integer" value="5000" label="Search radius" help="--search_radius (default: 5000)"/> + <param argument="--refind_prop_match" type="float" value="0.75" label="Gene proportion match" help="default: 0.75"/> + <param argument="--refind_mode" type="select" label="The stringency mode at which to re-find genes" help="default: default"> + <expand macro="refind_mode_option"/> + </param> + </section> + + <section name="graph_correction_option" title="Graph Correction" expanded="false"> + <param argument="--min_trailing_support" type="integer" value="2" label="Minimum cluster size to keep a gene called at the end of a contig" help="--min_traiiing_support [relexed mode : 2 is used]"/> + <param argument="--trailing_recursive" type="integer" value="1" label="Number of times to perform recursive trimming of low support nodes near the end of contigs" help="--trailing_recursive [relaxed mode: 1 is used]"/> + <param argument="--edge_support_threshold" type="integer" value="1" label="Edge support threshold" help="--edge_support_threshold [ Minimal edge 1 is used ]"/> + <param argument="--len_outlier_proportion" type="float" value="0.01" label="Length outlier support proportion" help="--length_outlier_support_proportion [default: 0.01]"/> + <param argument="--remove_by_consensus" type="boolean" truevalue="True" falsevalue="False" checked="False" label="Remove consensus" help="--remove_by_consensus [default: False]"/> + <param argument="--high_var_flag" type="integer" value="5" label="Highly variable gene region" help="--high_var_flag [default: 5]"/> + <param argument="--min_edge_support_sv" type="integer" value="2" label="Minimum edge support structural variants" help="--min_edge_support_sv [relaxed mode: 2 is used]"/> + <param argument="--all_seq_in_graph" type="boolean" truevalue="--all_seq_in_graph" falsevalue="" label="Retains all DNA sequence" help="--all_seq_in_graph [default: off]"/> + <param argument="--no_clean_edges" type="boolean" truevalue="--no_clean_edges" falsevalue="" label="Edge filtering in the final output graph" help="--no_clean_edges [default: off]"/> + </section> + + <section name="gene_alignment_option" title="Gene Alignment" expanded="false"> + <param argument="-a" type="select" label="Output alignments of core genes or all genes." help="-a [optional: core or pan; default: None"> + <expand macro="gene_alignment"/> + </param> + <param argument="--aligner" type="select" label="Specify an aligner" help="--aligner [mafft|prank|clustal][default: mafft]"> + <expand macro="gene_aligner"/> + </param> + <param name="codons" type="boolean" truevalue="true" falsevalue="false" checked="false" label="Generate codon alignments" help="--codons"/> + <param name="core_threshold" type="float" value="0.95" label="Core-genome sample threshold" help="--core_threshold [default: 0.95]"/> + <param argument="--core_subset" type="integer" value="" optional="true" label="Subset of the core genome to these many genes" help="--core_subset [default: all]"/> + <param name="core_entropy" type="float" value="0.1" label="Set the Block Mapping and Gathering with Entropy" help="--core_entropy_filter (threshold can be between 0.0 and 1.0) [default: Tukey outlier method]"/> + </section> + </when> + <when value="do_not_set"/> + </conditional> + </inputs> + <outputs> + <collection name="output" type="list" label="${tool.name} on ${on_string}: Pangenome output"> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>clstr)" directory="outdir" format="txt" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>txt)" directory="outdir" format="txt" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>gml)" directory="outdir" format="txt" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>Rtab)" directory="outdir" format="tabular" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>csv)" directory="outdir" format="csv" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>fasta)" directory="outdir" format="fasta" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>fa)" directory="outdir" format="fasta" visible="false" /> + <filter>advanced['adv_options_selector'] != 'set'</filter> + </collection> + <collection name="output_advance" type="list" label="${tool.name} on ${on_string}: Pangenome output (advance)"> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>clstr)" directory="outdir" format="txt" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>txt)" directory="outdir" format="txt" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>gml)" directory="outdir" format="txt" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>Rtab)" directory="outdir" format="tabular" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>csv)" directory="outdir" format="csv" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>fasta)" directory="outdir" format="fasta" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>fa)" directory="outdir" format="fasta" visible="false" /> + <filter>advanced['adv_options_selector'] == 'set' and advanced['gene_alignment_option']['a'] == 'None'</filter> + </collection> + <collection name="output_pangenome" type="list" label="${tool.name} on ${on_string}: Pangenome alignment output"> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>clstr)" directory="outdir" format="txt" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>txt)" directory="outdir" format="txt" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>gml)" directory="outdir" format="txt" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>Rtab)" directory="outdir" format="tabular" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>csv)" directory="outdir" format="csv" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>fasta)" directory="outdir" format="fasta" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>fa)" directory="outdir" format="fasta" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>aln)" directory="outdir" format="aln" visible="false" /> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>embl)" directory="outdir" format="embl" visible="false" /> + <filter>advanced['adv_options_selector'] == 'set' and advanced['gene_alignment_option']['a'] != 'None' </filter> + </collection> + <collection name="output_pangenome_fasta" type="list" label="${tool.name} on ${on_string}: Pangenom alignment fasta"> + <discover_datasets pattern="(?P<designation>.+)\.(?P<ext>fas)" directory="outdir/aligned_gene_sequences" format="fasta" visible="false" /> + <filter>advanced['adv_options_selector'] == 'set' and advanced['gene_alignment_option']['a'] != 'None'</filter> + </collection> + <data name="log" format="txt" label="${tool.name} on ${on_string}: log"/> + </outputs> + <tests> + <!-- run panaroo with default parameters (i.e panaroo -t 2 -i *.gff -o default \-\-clean-mode strict \-\-remove-invalid-genes) --> + <test expect_num_outputs="2"> + <param name="gen_code" value="11"/> + <param name="mode" value="strict"/> + <param name="adv_options_selector" value="do_not_set"/> + <param name="gff_input_collection"> + <collection type="list"> + <element name="10_small.gff" value="10_small.gff"/> + <element name="11_small.gff" value="11_small.gff"/> + </collection> + </param> + <output_collection name="output" count="13" type="list"> + <element name="combined_DNA_CDS" file="combined_DNA_CDS.fasta" ftype="fa"> + <assert_contents> + <has_n_lines n="18206"/> + </assert_contents> + </element> + <element name="combined_protein_CDS" file="combined_protein_CDS.fasta" ftype="fa"> + <assert_contents> + <has_n_lines n="7048"/> + </assert_contents> + </element> + <element name="combined_protein_cdhit_out" file="combined_protein_cdhit_out.fa" ftype="fa"> + <assert_contents> + <has_n_lines n="5119"/> + </assert_contents> + </element> + <element name="gene_data" file="gene_data.csv" ftype="csv"> + <assert_contents> + <has_text text="KPLBOJCC_00001"/> + <has_text text="NCFNLLIC_00549" /> + </assert_contents> + </element> + <element name="gene_presence_absence" file="gene_presence_absence.csv" ftype="csv"> + <assert_contents> + <has_text text="dcd"/> + <has_text text="trmB"/> + <has_text text="betI_1"/> + </assert_contents> + </element> + <element name="gene_presence_absence_roary" file="gene_presence_absence_roary.csv" ftype="csv"> + <assert_contents> + <has_text text="kstR2_1"/> + <has_text text="ybgJ"/> + </assert_contents> + </element> + <element name="pan_genome_reference" file="pan_genome_reference.fa" ftype="fa"> + <assert_contents> + <has_n_lines n="5055"/> + </assert_contents> + </element> + <element name="struct_presence_absence" file="struct_presence_absence.Rtab" ftype="Rtab"> + <assert_contents> + <has_line_matching expression="Gene\s+10_small\s+11_small"/> + </assert_contents> + </element> + <element name="summary_statistics" file="summary_statistics.txt" ftype="txt"> + <assert_contents> + <has_line line="Core genes	(99% <= strains <= 100%)	251"/> + <has_line line="Total genes	(0% <= strains <= 100%)	251"/> + </assert_contents> + </element> + </output_collection> + <output name="log"> + <assert_contents> + <has_text text="pre-processing gff3 files..."/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="2"> + <param name="gen_code" value="11"/> + <param name="mode" value="strict"/> + <param name="adv_options_selector" value="set"/> + <param name="a" value="None"/> + <param name="gff_input_collection"> + <collection type="list"> + <element name="10_small.gff" value="10_small.gff"/> + <element name="11_small.gff" value="11_small.gff"/> + </collection> + </param> + <output_collection name="output_advance" count="13" type="list"> + <element name="combined_DNA_CDS" file="advance/combined_DNA_CDS.fasta" ftype="fa"> + <assert_contents> + <has_n_lines n="18206"/> + </assert_contents> + </element> + <element name="combined_protein_CDS" file="advance/combined_protein_CDS.fasta" ftype="fa"> + <assert_contents> + <has_n_lines n="7048"/> + </assert_contents> + </element> + <element name="combined_protein_cdhit_out" file="advance/combined_protein_cdhit_out.fa" ftype="fa"> + <assert_contents> + <has_n_lines n="5119"/> + </assert_contents> + </element> + <element name="gene_data" ftype="csv"> + <assert_contents> + <has_n_lines n="980"/> + <has_n_columns sep="," n="8"/> + <has_text text="KPLBOJCC_00003"/> + <has_text text="NCFNLLIC_00003"/> + </assert_contents> + </element> + <element name="gene_presence_absence" file="advance/gene_presence_absence.csv" ftype="csv"> + <assert_contents> + <has_text text="recB"/> + <has_text text="recC"/> + <has_text text="rpoB"/> + </assert_contents> + </element> + <element name="gene_presence_absence_roary" file="advance/gene_presence_absence_roary.csv" ftype="csv"> + <assert_contents> + <has_text text="ctpI_2"/> + <has_text text="amiD_1"/> + </assert_contents> + </element> + <element name="pan_genome_reference" file="advance/pan_genome_reference.fa" ftype="fa"> + <assert_contents> + <has_n_lines n="13120"/> + </assert_contents> + </element> + <element name="struct_presence_absence" file="advance/struct_presence_absence.Rtab" ftype="Rtab"> + <assert_contents> + <has_line_matching expression="Gene\s+10_small\s+11_small"/> + </assert_contents> + </element> + <element name="summary_statistics" file="advance/summary_statistics.txt" ftype="txt"> + <assert_contents> + <has_line line="Core genes	(99% <= strains <= 100%)	251"/> + <has_line line="Shell genes	(15% <= strains < 95%)	475"/> + <has_line line="Total genes	(0% <= strains <= 100%)	726"/> + </assert_contents> + </element> + </output_collection> + <output name="log"> + <assert_contents> + <has_text text="pre-processing gff3 files..."/> + </assert_contents> + </output> + </test> + <test expect_num_outputs="3"> + <param name="gen_code" value="11"/> + <param name="mode" value="strict"/> + <param name="adv_options_selector" value="set"/> + <param name="a" value="core"/> + <param name="gff_input_collection"> + <collection type="list"> + <element name="10_small.gff" value="10_small.gff"/> + <element name="11_small.gff" value="11_small.gff"/> + </collection> + </param> + <output_collection name="output_pangenome" count="18" type="list"> + <element name="combined_DNA_CDS" file="core/combined_DNA_CDS.fasta" ftype="fa"> + <assert_contents> + <has_n_lines n="18206"/> + </assert_contents> + </element> + <element name="combined_protein_CDS" file="core/combined_protein_CDS.fasta" ftype="fa"> + <assert_contents> + <has_n_lines n="7048"/> + </assert_contents> + </element> + <element name="combined_protein_cdhit_out" file="core/combined_protein_cdhit_out.fa" ftype="fa"> + <assert_contents> + <has_n_lines n="5119"/> + </assert_contents> + </element> + <element name="summary_statistics" file="core/summary_statistics.txt" ftype="txt"> + <assert_contents> + <has_line line="Core genes	(99% <= strains <= 100%)	251"/> + <has_line line="Shell genes	(15% <= strains < 95%)	475"/> + <has_line line="Total genes	(0% <= strains <= 100%)	726"/> + </assert_contents> + </element> + <element name="struct_presence_absence" file="core/struct_presence_absence.Rtab" ftype="Rtab"> + <assert_contents> + <has_line_matching expression="Gene\s+10_small\s+11_small"/> + </assert_contents> + </element> + <element name="alignment_entropy" file="core/alignment_entropy.csv" ftype="csv"> + <assert_contents> + <has_text text="stf0.aln,0.0"/> + <has_text text="bglB.aln,0.0"/> + </assert_contents> + </element> + <element name="gene_data" ftype="csv"> + <assert_contents> + <has_n_lines n="980"/> + <has_n_columns sep="," n="8"/> + <has_text text="KPLBOJCC_00003"/> + <has_text text="NCFNLLIC_00003"/> + </assert_contents> + </element> + <element name="pan_genome_reference" file="core/pan_genome_reference.fa" ftype="fa"> + <assert_contents> + <has_n_lines n="13120"/> + </assert_contents> + </element> + <element name="gene_presence_absence" file="core/gene_presence_absence.csv" ftype="csv"> + <assert_contents> + <has_text text="recB"/> + <has_text text="recC"/> + <has_text text="rpoB"/> + </assert_contents> + </element> + <element name="gene_presence_absence_roary" file="core/gene_presence_absence_roary.csv" ftype="csv"> + <assert_contents> + <has_text text="ctpI_2"/> + <has_text text="amiD_1"/> + </assert_contents> + </element> + <element name="core_gene_alignment_filtered" ftype="aln"> + <assert_contents> + <has_size value="568632" delta="1000"/> + </assert_contents> + </element> + <element name="core_gene_alignment" ftype="aln"> + <assert_contents> + <has_size value="569962" delta="1000"/> + </assert_contents> + </element> + <element name="core_alignment_header" file="core/core_alignment_header.embl" ftype="embl"> + <assert_contents> + <has_text text="ID Genome standard; DNA; PRO; 1234 BP."/> + <has_text text="hisB_1.aln"/> + </assert_contents> + </element> + <element name="core_alignment_filtered_header" file="core/core_alignment_filtered_header.embl" ftype="embl"> + <assert_contents> + <has_text text="ID Genome standard; DNA; PRO; 1234 BP."/> + <has_text text=" FT feature 79606..80691 "/> + </assert_contents> + </element> + </output_collection> + <output_collection name="output_pangenome_fasta" count="251"/> + <output name="log"> + <assert_contents> + <has_text text="pre-processing gff3 files..."/> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ +Panaroo_ is A Bacterial Pangenome Analysis Pipeline. Panaroo builds a full graphical representation of the pangenome, where nodes are clusters of orthologous genes (COGs) and two nodes are connected by an edge if they are adjacent on a contig in any sample from the population. Using this graphical representation, Panaroo corrects for errors introduced during annotation by collapsing diverse gene families, filtering contamination, merging fragmented gene segments and refinding missing genes. + +**INPUTS** +Panaroo now supports multiple input formats. To use non-standard GFF3 files you must profile the input file as a list in a text file (one per line). Separate GFF and FASTA files can be provided per isolate by providing each file delimited by a space or a tab. Genbank file formats are also supported with extensions '.gbk', '.gb' or '.gbff'. These must compliant with Genbank/ENA/DDJB. This can be forced in Prokka by specifying the --compliance parameter. + + - a list of gff format in a collection + +**OUTPUTS** + + - combined_protein_cdhit_out.txt + - combined_protein_cdhit_out.txt.clstr + - pre_filt_graph.gml + - gene_data.csv + - combined_protein_CDS.fasta + - combined_DNA_CDS.fasta + - gene_presence_absence_rtab.Rtab + - gene_presence_absence_roary.csv + - gene_presence_absence.csv + - summary_statistics.txt + - pan_genome_reference.fa + - struct_presence_absence.Rtab + - final_graph.gml + +**OUTPUTS with Advance parameters** + + - combined_protein_cdhit_out.txt + - combined_protein_cdhit_out.txt.clstr + - pre_filt_graph.gml + - gene_data.csv + - combined_protein_CDS.fasta + - combined_DNA_CDS.fasta + - gene_presence_absence_rtab.Rtab + - gene_presence_absence_roary.csv + - gene_presence_absence.csv + - summary_statistics.txt + - pan_genome_reference.fa + - struct_presence_absence.Rtab + - final_graph.gml + - core_gene_alignment + - core_gene_alignment_filtered + - core_alignment_filtered_header + - core_alignment_header + - a collection of fasta files + +.. _Panaroo: https://gthlab.au/panaroo/#/gettingstarted/quickstart + + ]]></help> + <citations> + <citation type="doi">10.1186/s13059-020-02090-4</citation> + </citations> +</tool> +
