view pirate.xml @ 0:ef07a43227a6 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/pirate commit fd6bda0b520e35e43c29f35c10d5b0704f6f4f82
author iuc
date Wed, 04 Feb 2026 11:52:07 +0000
parents
children
line wrap: on
line source

<tool id="pirate" name="PIRATE" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Pangenome Iterative Refinement and Threshold Evaluation</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <command detect_errors="exit_code"><![CDATA[
        ## Stage the inputs. PIRATE is given a directory (see the input option
        ## below), so every selected dataset is symlinked into input_gffs/
        ## under a sanitised copy of its element identifier.
        #import re
        mkdir input_gffs &&
        #for $f in $input_files:
            #set identifier = re.sub('[^\s\w\-\\.]','_',str($f.element_identifier))
            ## NOTE(review): upstream PIRATE appears to pick up only files named
            ## *.gff from the input directory - confirm against the PIRATE README.
            ## Normalise the link name so identifiers without that extension
            ## (or ending in .gff3) are not silently skipped.
            #if $identifier.endswith('.gff3'):
                #set identifier = $identifier[:-1]
            #else if not $identifier.endswith('.gff'):
                #set identifier = $identifier + '.gff'
            #end if
            ln -s '$f' 'input_gffs/$identifier' &&
        #end for

        ## All parameters live inside sections, so they must be addressed with
        ## their full section path; unqualified names are not resolvable.
        PIRATE
            --input input_gffs/
            --steps '$global_opts.steps'
            --features '$global_opts.features'
            --min-len $global_opts.min_len
            $global_opts.nucl
            $para_opts.para_off
            $para_opts.classify_off
            $output_opts.align
            $output_opts.rplots
            --threads "\${GALAXY_SLOTS:-8}"
            ## Pan-opt and advanced parameters: everything between the single
            ## quotes is forwarded as one argument to the pan-opt option.
            #set $pan_opts = $global_opts.pan_opts_conditional
            #if str($pan_opts.enable_pan_opt) == 'yes':
                --pan-opt '
                ## perc is optional; emit the flag only when a value was given,
                ## otherwise the next flag would be consumed as its value.
                #if str($pan_opts.perc) != '':
                    --perc $pan_opts.perc
                #end if
                --cd-low $pan_opts.cdhit_opts.cd_low
                --cd-step $pan_opts.cdhit_opts.cd_step
                $pan_opts.cdhit_opts.cd_core_off
                --evalue $pan_opts.blast_opts.evalue
                $pan_opts.blast_opts.diamond
                $pan_opts.blast_opts.diamond_split
                --hsp-len $pan_opts.blast_opts.hsp_len
                --flat $pan_opts.mcl_opts.flat
                '
            #end if
            --output results/
    ]]></command>
    <inputs>
        <!-- Annotation files; the command block links each selected dataset into the directory passed to PIRATE. -->
        <param argument="--input" name="input_files" type="data" format="gff,gff3" multiple="true" label="Input GFF files" help="Select all GFF files for pangenome construction"/>

        <section name="global_opts" title="Global Options" expanded="true">
            <!-- Free-text values are validated so that an empty or malformed list never reaches the command line. -->
            <param argument="--steps" type="text" value="50,60,70,80,90,95,98" label="Identity thresholds" help="Comma-separated list of % identity thresholds">
                <validator type="regex" message="Enter a comma-separated list of numbers, e.g. 50,60,70,80,90,95,98">^\s*[0-9]+(\.[0-9]+)?\s*(,\s*[0-9]+(\.[0-9]+)?\s*)*$</validator>
            </param>
            <param argument="--features" type="text" value="CDS" label="Features" help="Features to use (e.g., CDS, tRNA). Multiple could be specified using commas.">
                <validator type="regex" message="Enter one or more feature types separated by commas, e.g. CDS or CDS,tRNA">^\s*[\w.\-]+\s*(,\s*[\w.\-]+\s*)*$</validator>
            </param>
            <param argument="--min-len" type="integer" value="120" min="0" label="Minimum feature length"/>
            <param argument="--nucl" type="boolean" truevalue="--nucl" falsevalue="" checked="false" label="Do not translate to Amino Acids" />
            
            <!-- Gate for the advanced settings; the command block only forwards them when "yes" is selected. -->
            <conditional name="pan_opts_conditional">
                <param name="enable_pan_opt" type="select" label="Enable advanced pangenome options" help="Enable --pan-opt and advanced pangenome parameters">
                    <option value="no" selected="true">No</option>
                    <option value="yes">Yes</option>
                </param>
                <when value="yes">
                    <param argument="--perc" type="integer" value="98" optional="true" min="0" max="100" label="Single % identity threshold to use for pangenome" help="Single % identity threshold to use for pangenome"/>
                
                    <section name="cdhit_opts" title="CD-HIT Options" expanded="false">
                        <param argument="--cd-low" type="integer" value="98" min="0" max="100" label="CD-HIT lowest percentage id" help="Default: 98"/>
                        <param argument="--cd-step" type="float" value="0.5" min="0" label="CD-HIT step size" help="Default: 0.5"/>
                        <param argument="--cd-core-off" type="boolean" truevalue="--cd-core-off" falsevalue="" checked="false" label="Don't extract core families during CD-HIT clustering" help="Default: Extract core families"/>
                    </section>
                
                    <section name="blast_opts" title="BLAST Options" expanded="false">
                        <param argument="--evalue" type="float" value="1E-6" min="0" label="E-value for BLAST hit filtering" help="Default: 1E-6"/>
                        <param argument="--diamond" type="boolean" truevalue="--diamond" falsevalue="" checked="false" label="Use DIAMOND instead of BLAST. Incompatible with --nucl"/>
                        <param argument="--diamond-split" type="boolean" truevalue="--diamond-split" falsevalue="" checked="false" label="Split DIAMOND files into batches"/>
                        <param argument="--hsp-len" type="float" value="0" min="0" max="1" label="Remove BLAST HSPs proportion threshold" help="Remove BLAST HSPs that are less than this proportion of query length."/>
                    </section>
                
                    <section name="mcl_opts" title="MCL Options" expanded="false">
                        <param argument="--flat" type="float" value="1.5" min="0" label="MCL inflation value"/>
                    </section>
                </when>
                <when value="no"/>
            </conditional>
        </section>
        
        <section name="para_opts" title="Paralog Classification" expanded="false">
            <param argument="--para-off" type="boolean" truevalue="--para-off" falsevalue="" checked="false" label="Switch off paralog identification"/>
            <param argument="--classify-off" type="boolean" truevalue="--classify-off" falsevalue="" checked="false" label="Do not classify paralogs"/>
        </section>

        <!-- These two switches also control which optional datasets the outputs block creates. -->
        <section name="output_opts" title="Output Options" expanded="true">
            <param argument="--align" type="boolean" truevalue="--align" falsevalue="" checked="false" label="Produce alignments" help="Align all genes and produce core/pangenome alignments."/>
            <param argument="--rplots" type="boolean" truevalue="--rplots" falsevalue="" checked="false" label="Generate R plots" help="Plot summaries using R."/>
        </section>
    </inputs>

    <outputs>
        <!-- Core results: collected from results/ on every run. -->
        <data name="pangenome_summary" format="txt"
              from_work_dir="results/PIRATE.pangenome_summary.txt"
              label="${tool.name} on ${on_string}: Pangenome Summary"/>
        <data name="pirate_gene_families" format="tsv"
              from_work_dir="results/PIRATE.gene_families.ordered.tsv"
              label="${tool.name} on ${on_string}: Tabular summary of all gene families"/>
        <data name="pirate_unique_alleles" format="tsv"
              from_work_dir="results/PIRATE.unique_alleles.tsv"
              label="${tool.name} on ${on_string}: Tabular summary of all unique alleles"/>
        <data name="pirate_presence_absence_fasta" format="fasta"
              from_work_dir="results/binary_presence_absence.fasta"
              label="${tool.name} on ${on_string}: Binary Presence/Absence data"/>
        <data name="pirate_presence_absence_nwk" format="newick"
              from_work_dir="results/binary_presence_absence.nwk"
              label="${tool.name} on ${on_string}: Binary Presence/Absence Newick data"/>
        <data name="pangenome_gfa" format="gfa1"
              from_work_dir="results/pangenome.gfa"
              label="${tool.name} on ${on_string}: Pangenome GFA"/>
        <data name="pirate_rep_sequences_ffn" format="fasta"
              from_work_dir="results/representative_sequences.ffn"
              label="${tool.name} on ${on_string}: Representative sequences for each gene family as nucleotide"/>
        <data name="pirate_rep_sequences_faa" format="fasta"
              from_work_dir="results/representative_sequences.faa"
              label="${tool.name} on ${on_string}: Representative sequences for each gene family as amino acid"/>

        <!-- Alignment results: created only when the align switch in output_opts is on. -->
        <data name="pirate_core_aln" format="fasta"
              from_work_dir="results/core_alignment.fasta"
              label="${tool.name} on ${on_string}: Core Alignment FASTA">
            <filter>output_opts['align']</filter>
        </data>
        <data name="pirate_core_gff" format="gff"
              from_work_dir="results/core_alignment.gff"
              label="${tool.name} on ${on_string}: Core Alignment GFF">
            <filter>output_opts['align']</filter>
        </data>
        <data name="pirate_pangenome_aln" format="fasta"
              from_work_dir="results/pangenome_alignment.fasta"
              label="${tool.name} on ${on_string}: Pangenome Alignment FASTA">
            <filter>output_opts['align']</filter>
        </data>
        <data name="pirate_pangenome_gff" format="gff"
              from_work_dir="results/pangenome_alignment.gff"
              label="${tool.name} on ${on_string}: Pangenome Alignment GFF">
            <filter>output_opts['align']</filter>
        </data>

        <!-- Plot report: created only when the rplots switch in output_opts is on. -->
        <data name="pirate_plots" format="pdf"
              from_work_dir="results/PIRATE_plots.pdf"
              label="${tool.name} on ${on_string}: Summary plots of the PIRATE pangenome">
            <filter>output_opts['rplots']</filter>
        </data>
    </outputs>

    <tests>
        <!-- Test 1: default parameters on two remote GFF files; only the eight unfiltered outputs are expected. -->
        <test expect_num_outputs="8">
            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
            <section name="global_opts">
                <param name="steps" value="50,60,70,80,90,95,98"/>
                <param name="features" value="CDS"/>
                <param name="min_len" value="120"/>
                <conditional name="pan_opts_conditional">
                    <param name="enable_pan_opt" value="no"/>
                </conditional>
            </section>
            <output name="pangenome_summary" ftype="txt">
                <assert_contents>
                    <has_line line="# 4 gene families in 2 genomes."/>
                    <has_n_lines n="13"/>
                </assert_contents>
            </output>
            <output name="pirate_gene_families" ftype="tsv">
                <assert_contents>
                    <has_n_lines n="5"/>
                </assert_contents>
            </output>
            <output name="pirate_unique_alleles" ftype="tsv">
                <assert_contents>
                    <has_line_matching expression="g03_10\s+g03\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+MRSA252_00002"/>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
            <output name="pirate_presence_absence_fasta" ftype="fasta">
                <assert_contents>
                    <has_line line=">HO_5096_0412"/>
                    <has_n_lines n="6"/>
                </assert_contents>
            </output>
            <output name="pirate_presence_absence_nwk" ftype="newick">
                <assert_contents>
                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
                    <has_n_lines n="1"/>
                </assert_contents>
            </output>
            <output name="pangenome_gfa" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
                    <has_n_lines n="7"/>
                </assert_contents>
            </output>
            <output name="pirate_rep_sequences_ffn" ftype="fasta">
                <assert_contents>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
            <output name="pirate_rep_sequences_faa" ftype="fasta">
                <assert_contents>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: align switched on; the four filtered alignment outputs (core and pangenome, FASTA and GFF) are expected on top of the eight default ones. -->
        <test expect_num_outputs="12">
            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
            <section name="global_opts">
                <param name="steps" value="50,60,70,80,90,95,98"/>
                <param name="features" value="CDS"/>
                <param name="min_len" value="120"/>
                <conditional name="pan_opts_conditional">
                    <param name="enable_pan_opt" value="no"/>
                </conditional>
            </section>
            <section name="output_opts">
                <param name="align" value="true"/>
            </section>
            <output name="pangenome_summary" ftype="txt">
                <assert_contents>
                    <has_line line="# 4 gene families in 2 genomes."/>
                    <has_n_lines n="13"/>
                </assert_contents>
            </output>
            <output name="pirate_gene_families" ftype="tsv">
                <assert_contents>
                    <has_n_lines n="5"/>
                </assert_contents>
            </output>
            <output name="pirate_unique_alleles" ftype="tsv">
                <assert_contents>
                    <has_line_matching expression="g03_10\s+g03\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+MRSA252_00002"/>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
            <output name="pirate_presence_absence_fasta" ftype="fasta">
                <assert_contents>
                    <has_line line=">HO_5096_0412"/>
                    <has_n_lines n="6"/>
                </assert_contents>
            </output>
            <output name="pirate_presence_absence_nwk" ftype="newick">
                <assert_contents>
                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
                    <has_n_lines n="1"/>
                </assert_contents>
            </output>
            <output name="pangenome_gfa" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
                    <has_n_lines n="7"/>
                </assert_contents>
            </output>
            <output name="pirate_rep_sequences_ffn" ftype="fasta">
                <assert_contents>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
            <output name="pirate_rep_sequences_faa" ftype="fasta">
                <assert_contents>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
            <!-- Alignment outputs, present only because align is true above. -->
            <output name="pirate_pangenome_aln" ftype="fasta">
                <assert_contents>
                    <has_line line=">HO_5096_0412"/>
                    <has_n_lines n="4"/>
                </assert_contents>
            </output>
            <output name="pirate_pangenome_gff" ftype="gff">
                <assert_contents>
                    <has_line_matching expression="##sequence-region Pangenome 1 3945"/>
                    <has_n_lines n="6"/>
                </assert_contents>
            </output>
            <output name="pirate_core_aln" ftype="fasta">
                <assert_contents>
                    <has_line line=">HO_5096_0412"/>
                    <has_n_lines n="4"/>
                </assert_contents>
            </output>
            <output name="pirate_core_gff" ftype="gff">
                <assert_contents>
                    <has_line_matching expression="##sequence-region Pangenome 1 2550"/>
                    <has_n_lines n="5"/>
                </assert_contents>
            </output>
        </test>

        <!-- Test 3: advanced pangenome options enabled (custom perc, CD-HIT, BLAST/DIAMOND and MCL values); align is off, so only the eight default outputs are expected. -->
        <test expect_num_outputs="8"> 
            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
            <section name="global_opts">
                <param name="steps" value="50,60,70,80,90,95,98"/>
                <param name="features" value="CDS"/>
                <param name="min_len" value="120"/>
                <conditional name="pan_opts_conditional">
                    <param name="enable_pan_opt" value="yes"/>
                    <param name="perc" value="95"/>
                    <section name="cdhit_opts">
                        <param name="cd_low" value="98"/>
                        <param name="cd_step" value="0.5"/>
                        <param name="cd_core_off" value="true"/>
                    </section>
                    <section name="blast_opts">
                        <param name="evalue" value="0.00001"/>
                        <param name="diamond" value="true"/>
                        <param name="diamond_split" value="true"/>
                        <param name="hsp_len" value="0.1"/>
                    </section>
                    <section name="mcl_opts">
                        <param name="flat" value="2.0"/>
                    </section>
                </conditional>
            </section>
            <section name="output_opts">
                <param name="align" value="false"/>
            </section>
            <output name="pangenome_summary" ftype="txt">
                <assert_contents>
                    <has_line line="# 4 gene families in 2 genomes."/>
                    <has_n_lines n="13"/>
                </assert_contents>
            </output>
            <output name="pirate_gene_families" ftype="tsv">
                <assert_contents>
                    <has_n_lines n="5"/>
                </assert_contents>
            </output>
            <!-- NOTE(review): unlike the other tests, this expression ends with a doubled \s+ before the locus tag and expects a different family id and line count; confirm this matches the real output of this configuration. -->
            <output name="pirate_unique_alleles" ftype="tsv">
                <assert_contents>
                    <has_line_matching expression="g01_09\s+g01\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+\s+MRSA252_00002"/>
                    <has_n_lines n="9"/>
                </assert_contents>
            </output>
            <output name="pirate_presence_absence_fasta" ftype="fasta">
                <assert_contents>
                    <has_line line=">HO_5096_0412"/>
                    <has_n_lines n="6"/>
                </assert_contents>
            </output>
            <output name="pirate_presence_absence_nwk" ftype="newick">
                <assert_contents>
                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
                    <has_n_lines n="1"/>
                </assert_contents>
            </output>
            <output name="pangenome_gfa" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
                    <has_n_lines n="7"/>
                </assert_contents>
            </output>
            <output name="pirate_rep_sequences_ffn" ftype="fasta">
                <assert_contents>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
            <output name="pirate_rep_sequences_faa" ftype="fasta">
                <assert_contents>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
        </test>

        <!-- Test 4: PDF report; switching rplots on adds the plots PDF as a ninth output. -->
        <test expect_num_outputs="9">
            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
            <section name="global_opts">
                <param name="steps" value="50,60,70,80,90,95,98"/>
                <param name="features" value="CDS"/>
                <param name="min_len" value="120"/>
                <conditional name="pan_opts_conditional">
                    <param name="enable_pan_opt" value="no"/>
                </conditional>
            </section>
            <section name="output_opts">
                <param name="rplots" value="true"/>
            </section>
            <output name="pangenome_summary" ftype="txt">
                <assert_contents>
                    <has_line line="# 4 gene families in 2 genomes."/>
                    <has_n_lines n="13"/>
                </assert_contents>
            </output>
            <output name="pirate_gene_families" ftype="tsv">
                <assert_contents>
                    <has_n_lines n="5"/>
                </assert_contents>
            </output>
            <output name="pirate_unique_alleles" ftype="tsv">
                <assert_contents>
                    <has_line_matching expression="g03_10\s+g03\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+MRSA252_00002"/>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
            <output name="pirate_presence_absence_fasta" ftype="fasta">
                <assert_contents>
                    <has_line line=">HO_5096_0412"/>
                    <has_n_lines n="6"/>
                </assert_contents>
            </output>
            <output name="pirate_presence_absence_nwk" ftype="newick">
                <assert_contents>
                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
                    <has_n_lines n="1"/>
                </assert_contents>
            </output>
            <output name="pangenome_gfa" ftype="gfa1">
                <assert_contents>
                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
                    <has_n_lines n="7"/>
                </assert_contents>
            </output>
            <output name="pirate_rep_sequences_ffn" ftype="fasta">
                <assert_contents>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
            <output name="pirate_rep_sequences_faa" ftype="fasta">
                <assert_contents>
                    <has_n_lines n="8"/>
                </assert_contents>
            </output>
            <!-- The PDF is checked by size only (within 100 bytes of the expected value), not by content. -->
            <output name="pirate_plots" ftype="pdf">
                <assert_contents>
                    <has_size value="14397" delta="100"/>
                </assert_contents>
            </output>
        </test>

    </tests>
    <help><![CDATA[
**PIRATE** (Pangenome Iterative Refinement and Threshold Evaluation)

PIRATE is a pangenomics tool that allows for the iterative refinement of pangenomes using multiple identity thresholds. It is designed to handle highly divergent pangenomes and identify orthologs across different evolutionary scales.

**INPUTS**

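- One annotation file per genome in GFF/GFF3 format. Select all genomes to include, either as multiple datasets or as a dataset collection.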

**CORE OUTPUTS**

1. Pangenome Summary - Summary statistics of gene number and frequency in the pangenome
2. Tabular summary of all gene families - Complete gene family catalog with one row per family. Families split during paralog detection are labeled with underscores and numbers (e.g., g0001_1, g0001_2). Families are ordered by syntenic position in the pangenome graph.
3. Tabular summary of all unique alleles - Catalog of unique alleles per gene family, defined as distinct MCL sub-clusters at higher identity thresholds
4. Binary Presence/Absence data - Binary gene family presence/absence matrix in FASTA format
5. Binary Presence/Absence Newick data - FastTree phylogeny constructed from the binary presence/absence matrix
6. Pangenome GFA - Network representation of gene family connections in GFA format (can be visualized with Bandage)
7. Representative sequences for each gene family as nucleotide - Nucleotide sequences with the longest sequence per family selected as representative (genomes ordered alphabetically)
8. Representative sequences for each gene family as amino acid - Amino acid sequences corresponding to the nucleotide set

**OPTIONAL OUTPUTS**

1. Core Alignment FASTA - MAFFT-aligned core genome sequences, ordered by gene family table. Reverse-translated when created from CDS. Multi-copy genes represented as ? characters.
2. Core Alignment GFF - Annotation coordinates and gene/product information for the core alignment
3. Pangenome Alignment FASTA - MAFFT-aligned full pangenome sequences with the same characteristics as core alignment
4. Pangenome Alignment GFF - Annotation coordinates for the pangenome alignment
5. Summary plots of the PIRATE pangenome - Visualization plots summarizing the pangenome analysis


    ]]></help>
    <expand macro="citations"/>
    <expand macro="creator"/>
</tool>