Mercurial > repos > iuc > pirate
changeset 0:ef07a43227a6 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/pirate commit fd6bda0b520e35e43c29f35c10d5b0704f6f4f82
| author | iuc |
|---|---|
| date | Wed, 04 Feb 2026 11:52:07 +0000 |
| parents | |
| children | |
| files | macros.xml pirate.xml |
| diffstat | 2 files changed, 466 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Wed Feb 04 11:52:07 2026 +0000
@@ -0,0 +1,32 @@
+<macros>
+    <!-- Tokens substituted into the tool XML that imports this file. -->
+    <token name="@TOOL_VERSION@">1.0.5</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">25.0</token>
+    <!-- Package requirement; its version follows @TOOL_VERSION@. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">pirate</requirement>
+        </requirements>
+    </xml>
+    <!-- bio.tools cross-reference, wrapped in a named macro so a tool can pull it in with an expand element (macro="bio_tools"); a bare xrefs child of macros can never be expanded. -->
+    <xml name="bio_tools">
+        <xrefs>
+            <xref type="bio.tools">PIRATE</xref>
+        </xrefs>
+    </xml>
+    <!-- People and organization credited for this wrapper. -->
+    <xml name="creator">
+        <creator>
+            <person givenName="Saim" familyName="Momin" url="https://github.com/SaimMomin12"/>
+            <person givenName="Fernando" familyName="Martin Garcia" url="https://github.com/FMG0411"/>
+            <organization name="Galaxy Europe"/>
+        </creator>
+    </xml>
+    <!-- Publication cited by the tool. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1093/gigascience/giz119</citation>
+        </citations>
+    </xml>
+</macros>
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/pirate.xml Wed Feb 04 11:52:07 2026 +0000
@@ -0,0 +1,450 @@
+<tool id="pirate" name="PIRATE" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>Pangenome Iterative Refinement and Threshold Evaluation</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Stage every input GFF as a symlink in one directory, under a sanitized file name
+        #import re
+        mkdir input_gffs &&
+        #for $f in $input_files:
+            #set identifier = re.sub('[^\s\w\-\\.]','_',str($f.element_identifier))
+            ## Element identifiers need not carry an extension; append .gff so every staged file is named as a GFF.
+            ## NOTE(review): presumably PIRATE selects its inputs by the .gff suffix - confirm against the PIRATE documentation.
+            #if not $identifier.endswith('.gff'):
+                #set identifier = $identifier + '.gff'
+            #end if
+            ln -s '$f' 'input_gffs/$identifier' &&
+        #end for
+
+        PIRATE
+            --input input_gffs/
+            ## Parameters declared inside a section are referenced through the section name.
+            --steps '$global_opts.steps'
+            --features '$global_opts.features'
+            --min-len $global_opts.min_len
+            $global_opts.nucl
+            $para_opts.para_off
+            $para_opts.classify_off
+            $output_opts.align
+            $output_opts.rplots
+            --threads "\${GALAXY_SLOTS:-8}"
+            ## Pan-opt and advanced parameters
+            #if str($global_opts.pan_opts_conditional.enable_pan_opt) == 'yes':
+            --pan-opt '
+                ## perc is optional: emit the flag only when a value was supplied
+                #if str($global_opts.pan_opts_conditional.perc) != '':
+                --perc $global_opts.pan_opts_conditional.perc
+                #end if
+                --cd-low $global_opts.pan_opts_conditional.cdhit_opts.cd_low
+                --cd-step $global_opts.pan_opts_conditional.cdhit_opts.cd_step
+                $global_opts.pan_opts_conditional.cdhit_opts.cd_core_off
+                --evalue $global_opts.pan_opts_conditional.blast_opts.evalue
+                $global_opts.pan_opts_conditional.blast_opts.diamond
+                $global_opts.pan_opts_conditional.blast_opts.diamond_split
+                --hsp-len $global_opts.pan_opts_conditional.blast_opts.hsp_len
+                --flat $global_opts.pan_opts_conditional.mcl_opts.flat
+            '
+            #end if
+            --output results/
+    ]]></command>
+    <inputs>
+        <param argument="--input" name="input_files" type="data" format="gff,gff3" multiple="true" label="Input GFF files" help="Select all GFF files for pangenome construction"/>
+
+        <section name="global_opts" title="Global Options" expanded="true">
+            <param argument="--steps" type="text" value="50,60,70,80,90,95,98" label="Identity thresholds" help="Comma-separated list of % identity thresholds"/>
+            <param argument="--features" type="text" value="CDS" label="Features" help="Features to use (e.g., CDS, tRNA). Multiple could be specified using commas."/>
+            <param argument="--min-len" type="integer" value="120" min="0" label="Minimum feature length"/>
+            <param argument="--nucl" type="boolean" truevalue="--nucl" falsevalue="" checked="false" label="Do not translate to Amino Acids" />
+
+            <conditional name="pan_opts_conditional">
+                <param name="enable_pan_opt" type="select" label="Enable advanced pangenome options" help="Enable --pan-opt and advanced pangenome parameters">
+                    <option value="no" selected="true">No</option>
+                    <option value="yes">Yes</option>
+                </param>
+                <when value="yes">
+                    <param argument="--perc" type="integer" value="98" optional="true" min="0" max="100" label="Single % identity threshold to use for pangenome" help="Single % identity threshold to use for pangenome"/>
+
+                    <section name="cdhit_opts" title="CD-HIT Options" expanded="false">
+                        <param argument="--cd-low" type="integer" value="98" min="0" max="100" label="CD-HIT lowest percentage id" help="Default: 98"/>
+                        <param argument="--cd-step" type="float" value="0.5" min="0" label="CD-HIT step size" help="Default: 0.5"/>
+                        <param argument="--cd-core-off" type="boolean" truevalue="--cd-core-off" falsevalue="" checked="false" label="Don't extract core families during CD-HIT clustering" help="Default: Extract core families"/>
+                    </section>
+
+                    <section name="blast_opts" title="BLAST Options" expanded="false">
+                        <param argument="--evalue" type="float" value="1E-6" min="0" label="E-value for BLAST hit filtering" help="Default: 1E-6"/>
+                        <param argument="--diamond" type="boolean" truevalue="--diamond" falsevalue="" checked="false" label="Use DIAMOND instead of BLAST. Incompatible with --nucl"/>
+                        <param argument="--diamond-split" type="boolean" truevalue="--diamond-split" falsevalue="" checked="false" label="Split DIAMOND files into batches"/>
+                        <param argument="--hsp-len" type="float" value="0" min="0" max="1" label="Remove BLAST HSPs proportion threshold" help="Remove BLAST HSPs that are less than this proportion of query length."/>
+                    </section>
+
+                    <section name="mcl_opts" title="MCL Options" expanded="false">
+                        <param argument="--flat" type="float" value="1.5" min="0" label="MCL inflation value"/>
+                    </section>
+                </when>
+                <when value="no"/>
+            </conditional>
+        </section>
+
+        <section name="para_opts" title="Paralog Classification" expanded="false">
+            <param argument="--para-off" type="boolean" truevalue="--para-off" falsevalue="" checked="false" label="Switch off paralog identification"/>
+            <param argument="--classify-off" type="boolean" truevalue="--classify-off" falsevalue="" checked="false" label="Do not classify paralogs"/>
+        </section>
+
+        <section name="output_opts" title="Output Options" expanded="true">
+            <param argument="--align" type="boolean" truevalue="--align" falsevalue="" checked="false" label="Produce alignments" help="Align all genes and produce core/pangenome alignments."/>
+            <param argument="--rplots" type="boolean" truevalue="--rplots" falsevalue="" checked="false" label="Generate R plots" help="Plot summaries using R."/>
+        </section>
+    </inputs>
+
+    <outputs>
+        <data name="pangenome_summary" format="txt" from_work_dir="results/PIRATE.pangenome_summary.txt" label="${tool.name} on ${on_string}: Pangenome Summary"/>
+        <data name="pirate_gene_families" format="tsv" from_work_dir="results/PIRATE.gene_families.ordered.tsv" label="${tool.name} on ${on_string}: Tabular summary of all gene families"/>
+        <data name="pirate_unique_alleles" format="tsv" from_work_dir="results/PIRATE.unique_alleles.tsv" label="${tool.name} on ${on_string}: Tabular summary of all unique alleles"/>
+        <data name="pirate_presence_absence_fasta" format="fasta" from_work_dir="results/binary_presence_absence.fasta" label="${tool.name} on ${on_string}: Binary Presence/Absence data"/>
+        <data name="pirate_presence_absence_nwk" format="newick" from_work_dir="results/binary_presence_absence.nwk" label="${tool.name} on ${on_string}: Binary Presence/Absence Newick data"/>
+        <data name="pangenome_gfa" format="gfa1" from_work_dir="results/pangenome.gfa" label="${tool.name} on ${on_string}: Pangenome GFA"/>
+        <data name="pirate_rep_sequences_ffn" format="fasta" from_work_dir="results/representative_sequences.ffn" label="${tool.name} on ${on_string}: Representative sequences for each gene family as nucleotide"/>
+        <data name="pirate_rep_sequences_faa" format="fasta" from_work_dir="results/representative_sequences.faa" label="${tool.name} on ${on_string}: Representative sequences for each gene family as amino acid"/>
+
+        <data name="pirate_core_aln" format="fasta" from_work_dir="results/core_alignment.fasta" label="${tool.name} on ${on_string}: Core Alignment FASTA">
+            <filter>output_opts['align']</filter>
+        </data>
+
+        <data name="pirate_core_gff" format="gff" from_work_dir="results/core_alignment.gff" label="${tool.name} on ${on_string}: Core Alignment GFF">
+            <filter>output_opts['align']</filter>
+        </data>
+
+        <data name="pirate_pangenome_aln" format="fasta" from_work_dir="results/pangenome_alignment.fasta" label="${tool.name} on ${on_string}: Pangenome Alignment FASTA">
+            <filter>output_opts['align']</filter>
+        </data>
+
+        <data name="pirate_pangenome_gff" format="gff" from_work_dir="results/pangenome_alignment.gff" label="${tool.name} on ${on_string}: Pangenome Alignment GFF">
+            <filter>output_opts['align']</filter>
+        </data>
+
+        <data name="pirate_plots" format="pdf" from_work_dir="results/PIRATE_plots.pdf" label="${tool.name} on ${on_string}: Summary plots of the PIRATE pangenome">
+            <filter>output_opts['rplots']</filter>
+        </data>
+    </outputs>
+
+    <tests>
+        <!-- Test 1: default parameters -->
+        <test expect_num_outputs="8">
+            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
+            <section name="global_opts">
+                <param name="steps" value="50,60,70,80,90,95,98"/>
+                <param name="features" value="CDS"/>
+                <param name="min_len" value="120"/>
+                <conditional name="pan_opts_conditional">
+                    <param name="enable_pan_opt" value="no"/>
+                </conditional>
+            </section>
+            <output name="pangenome_summary" ftype="txt">
+                <assert_contents>
+                    <has_line line="# 4 gene families in 2 genomes."/>
+                    <has_n_lines n="13"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_gene_families" ftype="tsv">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_unique_alleles" ftype="tsv">
+                <assert_contents>
+                    <has_line_matching expression="g03_10\s+g03\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+MRSA252_00002"/>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_nwk" ftype="newick">
+                <assert_contents>
+                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="pangenome_gfa" ftype="gfa1">
+                <assert_contents>
+                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_faa" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test 2: alignment outputs (align enabled) -->
+        <test expect_num_outputs="12">
+            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
+            <section name="global_opts">
+                <param name="steps" value="50,60,70,80,90,95,98"/>
+                <param name="features" value="CDS"/>
+                <param name="min_len" value="120"/>
+                <conditional name="pan_opts_conditional">
+                    <param name="enable_pan_opt" value="no"/>
+                </conditional>
+            </section>
+            <section name="output_opts">
+                <param name="align" value="true"/>
+            </section>
+            <output name="pangenome_summary" ftype="txt">
+                <assert_contents>
+                    <has_line line="# 4 gene families in 2 genomes."/>
+                    <has_n_lines n="13"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_gene_families" ftype="tsv">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_unique_alleles" ftype="tsv">
+                <assert_contents>
+                    <has_line_matching expression="g03_10\s+g03\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+MRSA252_00002"/>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_nwk" ftype="newick">
+                <assert_contents>
+                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="pangenome_gfa" ftype="gfa1">
+                <assert_contents>
+                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_faa" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_pangenome_aln" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="4"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_pangenome_gff" ftype="gff">
+                <assert_contents>
+                    <has_line_matching expression="##sequence-region Pangenome 1 3945"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_core_aln" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="4"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_core_gff" ftype="gff">
+                <assert_contents>
+                    <has_line_matching expression="##sequence-region Pangenome 1 2550"/>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- Test 3: advanced pangenome options (pan-opt enabled) -->
+        <test expect_num_outputs="8">
+            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
+            <section name="global_opts">
+                <param name="steps" value="50,60,70,80,90,95,98"/>
+                <param name="features" value="CDS"/>
+                <param name="min_len" value="120"/>
+                <conditional name="pan_opts_conditional">
+                    <param name="enable_pan_opt" value="yes"/>
+                    <param name="perc" value="95"/>
+                    <section name="cdhit_opts">
+                        <param name="cd_low" value="98"/>
+                        <param name="cd_step" value="0.5"/>
+                        <param name="cd_core_off" value="true"/>
+                    </section>
+                    <section name="blast_opts">
+                        <param name="evalue" value="0.00001"/>
+                        <param name="diamond" value="true"/>
+                        <param name="diamond_split" value="true"/>
+                        <param name="hsp_len" value="0.1"/>
+                    </section>
+                    <section name="mcl_opts">
+                        <param name="flat" value="2.0"/>
+                    </section>
+                </conditional>
+            </section>
+            <section name="output_opts">
+                <param name="align" value="false"/>
+            </section>
+            <output name="pangenome_summary" ftype="txt">
+                <assert_contents>
+                    <has_line line="# 4 gene families in 2 genomes."/>
+                    <has_n_lines n="13"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_gene_families" ftype="tsv">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_unique_alleles" ftype="tsv">
+                <assert_contents>
+                    <has_line_matching expression="g01_09\s+g01\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+\s+MRSA252_00002"/>
+                    <has_n_lines n="9"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_nwk" ftype="newick">
+                <assert_contents>
+                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="pangenome_gfa" ftype="gfa1">
+                <assert_contents>
+                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_faa" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- Test 4: PDF summary plots (rplots enabled) -->
+        <test expect_num_outputs="9">
+            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
+            <section name="global_opts">
+                <param name="steps" value="50,60,70,80,90,95,98"/>
+                <param name="features" value="CDS"/>
+                <param name="min_len" value="120"/>
+                <conditional name="pan_opts_conditional">
+                    <param name="enable_pan_opt" value="no"/>
+                </conditional>
+            </section>
+            <section name="output_opts">
+                <param name="rplots" value="true"/>
+            </section>
+            <output name="pangenome_summary" ftype="txt">
+                <assert_contents>
+                    <has_line line="# 4 gene families in 2 genomes."/>
+                    <has_n_lines n="13"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_gene_families" ftype="tsv">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_unique_alleles" ftype="tsv">
+                <assert_contents>
+                    <has_line_matching expression="g03_10\s+g03\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+MRSA252_00002"/>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_nwk" ftype="newick">
+                <assert_contents>
+                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="pangenome_gfa" ftype="gfa1">
+                <assert_contents>
+                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_faa" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_plots" ftype="pdf">
+                <assert_contents>
+                    <has_size value="14397" delta="100"/>
+                </assert_contents>
+            </output>
+        </test>
+
+    </tests>
+    <help><![CDATA[
+**PIRATE** (Pangenome Iterative Refinement and Threshold Evaluation)
+
+PIRATE is a pangenomics tool that allows for the iterative refinement of pangenomes using multiple identity thresholds. It is designed to handle highly divergent pangenomes and identify orthologs across different evolutionary scales.
+
+**INPUTS**
+
+- A collection of gff3 files.
+
+**CORE OUTPUTS**
+
+1. Pangenome Summary - Summary statistics of gene number and frequency in the pangenome
+2. Tabular summary of all gene families - Complete gene family catalog with one row per family. Families split during paralog detection are labeled with underscores and numbers (e.g., g0001_1, g0001_2). Families are ordered by syntenic position in the pangenome graph.
+3. Tabular summary of all unique alleles - Catalog of unique alleles per gene family, defined as distinct MCL sub-clusters at higher identity thresholds
+4. Binary Presence/Absence data - Binary gene family presence/absence matrix in FASTA format
+5. Binary Presence/Absence Newick data - FastTree phylogeny constructed from the binary presence/absence matrix
+6. Pangenome GFA - Network representation of gene family connections in GFA format (can be visualized with Bandage)
+7. Representative sequences for each gene family as nucleotide - Nucleotide sequences with the longest sequence per family selected as representative (genomes ordered alphabetically)
+8. Representative sequences for each gene family as amino acid - Amino acid sequences corresponding to the nucleotide set
+
+**OPTIONAL OUTPUTS**
+
+1. Core Alignment FASTA - MAFFT-aligned core genome sequences, ordered by gene family table. Reverse-translated when created from CDS. Multi-copy genes represented as ? characters.
+2. Core Alignment GFF - Annotation coordinates and gene/product information for the core alignment
+3. Pangenome Alignment FASTA - MAFFT-aligned full pangenome sequences with the same characteristics as core alignment
+4. Pangenome Alignment GFF - Annotation coordinates for the pangenome alignment
+5. Summary plots of the PIRATE pangenome - Visualization plots summarizing the pangenome analysis
+
+
+    ]]></help>
+    <expand macro="citations"/>
+    <expand macro="creator"/>
+</tool>
\ No newline at end of file
