Mercurial > repos > iuc > pirate

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Wed Feb 04 11:52:07 2026 +0000
@@ -0,0 +1,25 @@
+<macros>
+    <!-- Version tokens: substituted into the tool's version/profile attributes and the package requirement below. -->
+    <token name="@TOOL_VERSION@">1.0.5</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">25.0</token>
+    <!-- Package requirement pinned to @TOOL_VERSION@. -->
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">pirate</requirement>
+        </requirements>
+    </xml>
+    <!-- bio.tools cross-reference. It is wrapped in a named macro because only named
+         xml/token definitions inside macros can be expanded; a tool uses it via
+         an expand element with macro="xrefs". -->
+    <xml name="xrefs">
+        <xrefs>
+            <xref type="bio.tools">PIRATE</xref>
+        </xrefs>
+    </xml>
+    <!-- Authors of this tool wrapper. -->
+    <xml name="creator">
+        <creator>
+            <person givenName="Saim" familyName="Momin" url="https://github.com/SaimMomin12"/>
+            <person givenName="Fernando" familyName="Martin Garcia" url="https://github.com/FMG0411"/>
+            <organization name="Galaxy Europe"/>
+        </creator>
+    </xml>
+    <!-- Publication to cite, referenced by DOI. -->
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.1093/gigascience/giz119</citation>
+        </citations>
+    </xml>
+</macros>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pirate.xml	Wed Feb 04 11:52:07 2026 +0000
@@ -0,0 +1,441 @@
+<tool id="pirate" name="PIRATE" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>Pangenome Iterative Refinement and Threshold Evaluation</description>
+    <!-- Pull in the shared tokens and macro definitions from macros.xml. -->
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <!-- Expands to the package requirement declared in the "requirements" macro. -->
+    <expand macro="requirements"/>
+    <command detect_errors="exit_code"><![CDATA[
+        ## Stage every selected GFF dataset in one directory, because PIRATE is given a directory via --input.
+        ## NOTE(review): the test expectations (e.g. '>HO_5096_0412' for HO_5096_0412.gff) suggest PIRATE derives
+        ## genome names from the file name without '.gff'; confirm behaviour for identifiers lacking that extension.
+        #import re
+        mkdir input_gffs &&
+        #for $f in $input_files:
+            ## Replace every character that is not whitespace, a word character, '-' or '.' with '_'.
+            #set identifier = re.sub('[^\s\w\-\\.]','_',str($f.element_identifier))
+            ln -s '$f' 'input_gffs/$identifier' &&
+        #end for
+
+        ## Parameters declared inside a <section> are referenced by their fully qualified
+        ## name ($section.param), consistent with the pan-opt parameters further down.
+        PIRATE
+            --input input_gffs/
+            --steps '$global_opts.steps'
+            --features '$global_opts.features'
+            --min-len $global_opts.min_len
+            $global_opts.nucl
+            $para_opts.para_off
+            $para_opts.classify_off
+            $output_opts.align
+            $output_opts.rplots
+            --threads "\${GALAXY_SLOTS:-8}"
+            ## Advanced pangenome options are forwarded as one single-quoted --pan-opt string.
+            #if str($global_opts.pan_opts_conditional.enable_pan_opt) == 'yes':
+                --pan-opt '
+                ## --perc is an optional parameter: emit the flag only when a value was supplied,
+                ## otherwise a bare '--perc' would consume the following option as its value.
+                #if str($global_opts.pan_opts_conditional.perc) != '':
+                    --perc $global_opts.pan_opts_conditional.perc
+                #end if
+                --cd-low $global_opts.pan_opts_conditional.cdhit_opts.cd_low
+                --cd-step $global_opts.pan_opts_conditional.cdhit_opts.cd_step
+                $global_opts.pan_opts_conditional.cdhit_opts.cd_core_off
+                --evalue $global_opts.pan_opts_conditional.blast_opts.evalue
+                $global_opts.pan_opts_conditional.blast_opts.diamond
+                $global_opts.pan_opts_conditional.blast_opts.diamond_split
+                --hsp-len $global_opts.pan_opts_conditional.blast_opts.hsp_len
+                --flat $global_opts.pan_opts_conditional.mcl_opts.flat
+                '
+            #end if
+            --output results/
+    ]]></command>
+    <inputs>
+        <!-- Every selected dataset is symlinked into one input directory by the command template. -->
+        <param argument="--input" name="input_files" type="data" format="gff,gff3" multiple="true" label="Input GFF files" help="Select all GFF files for pangenome construction"/>
+
+        <!-- General options placed directly on the PIRATE command line. -->
+        <section name="global_opts" title="Global Options" expanded="true">
+            <param argument="--steps" type="text" value="50,60,70,80,90,95,98" label="Identity thresholds" help="Comma-separated list of % identity thresholds"/>
+            <param argument="--features" type="text" value="CDS" label="Features" help="Features to use (e.g., CDS, tRNA). Multiple could be specified using commas."/>
+            <param argument="--min-len" type="integer" value="120" min="0" label="Minimum feature length"/>
+            <param argument="--nucl" type="boolean" truevalue="--nucl" falsevalue="" checked="false" label="Do not translate to Amino Acids" />
+
+            <!-- When set to "yes", the nested parameters are forwarded by the command template
+                 inside a single quoted pan-opt argument. -->
+            <conditional name="pan_opts_conditional">
+                <param name="enable_pan_opt" type="select" label="Enable advanced pangenome options" help="Enable --pan-opt and advanced pangenome parameters">
+                    <option value="no" selected="true">No</option>
+                    <option value="yes">Yes</option>
+                </param>
+                <when value="yes">
+                    <!-- Optional: may be left empty. -->
+                    <param argument="--perc" type="integer" value="98" optional="true" min="0" max="100" label="Single % identity threshold to use for pangenome" help="Single % identity threshold to use for pangenome"/>
+
+                    <section name="cdhit_opts" title="CD-HIT Options" expanded="false">
+                        <param argument="--cd-low" type="integer" value="98" min="0" max="100" label="CD-HIT lowest percentage id" help="Default: 98"/>
+                        <param argument="--cd-step" type="float" value="0.5" min="0" label="CD-HIT step size" help="Default: 0.5"/>
+                        <param argument="--cd-core-off" type="boolean" truevalue="--cd-core-off" falsevalue="" checked="false" label="Don't extract core families during CD-HIT clustering" help="Default: Extract core families"/>
+                    </section>
+
+                    <section name="blast_opts" title="BLAST Options" expanded="false">
+                        <param argument="--evalue" type="float" value="1E-6" min="0" label="E-value for BLAST hit filtering" help="Default: 1E-6"/>
+                        <param argument="--diamond" type="boolean" truevalue="--diamond" falsevalue="" checked="false" label="Use DIAMOND instead of BLAST. Incompatible with --nucl"/>
+                        <param argument="--diamond-split" type="boolean" truevalue="--diamond-split" falsevalue="" checked="false" label="Split DIAMOND files into batches"/>
+                        <param argument="--hsp-len" type="float" value="0" min="0" max="1" label="Remove BLAST HSPs proportion threshold" help="Remove BLAST HSPs that are less than this proportion of query length."/>
+                    </section>
+
+                    <section name="mcl_opts" title="MCL Options" expanded="false">
+                        <param argument="--flat" type="float" value="1.5" min="0" label="MCL inflation value"/>
+                    </section>
+                </when>
+                <when value="no"/>
+            </conditional>
+        </section>
+
+        <!-- Switches controlling paralog handling. -->
+        <section name="para_opts" title="Paralog Classification" expanded="false">
+            <param argument="--para-off" type="boolean" truevalue="--para-off" falsevalue="" checked="false" label="Switch off paralog identification"/>
+            <param argument="--classify-off" type="boolean" truevalue="--classify-off" falsevalue="" checked="false" label="Do not classify paralogs"/>
+        </section>
+
+        <!-- These two switches also gate the optional outputs through the output filters. -->
+        <section name="output_opts" title="Output Options" expanded="true">
+            <param argument="--align" type="boolean" truevalue="--align" falsevalue="" checked="false" label="Produce alignments" help="Align all genes and produce core/pangenome alignments."/>
+            <param argument="--rplots" type="boolean" truevalue="--rplots" falsevalue="" checked="false" label="Generate R plots" help="Plot summaries using R."/>
+        </section>
+    </inputs>
+
+    <outputs>
+        <!-- Unfiltered outputs, each collected from the results/ directory written by the command. -->
+        <data name="pangenome_summary" format="txt" from_work_dir="results/PIRATE.pangenome_summary.txt" label="${tool.name} on ${on_string}: Pangenome Summary"/>
+        <data name="pirate_gene_families" format="tsv" from_work_dir="results/PIRATE.gene_families.ordered.tsv" label="${tool.name} on ${on_string}: Tabular summary of all gene families"/>
+        <data name="pirate_unique_alleles" format="tsv" from_work_dir="results/PIRATE.unique_alleles.tsv" label="${tool.name} on ${on_string}: Tabular summary of all unique alleles"/>
+        <data name="pirate_presence_absence_fasta" format="fasta" from_work_dir="results/binary_presence_absence.fasta" label="${tool.name} on ${on_string}: Binary Presence/Absence data"/>
+        <data name="pirate_presence_absence_nwk" format="newick" from_work_dir="results/binary_presence_absence.nwk" label="${tool.name} on ${on_string}: Binary Presence/Absence Newick data"/>
+        <data name="pangenome_gfa" format="gfa1" from_work_dir="results/pangenome.gfa" label="${tool.name} on ${on_string}: Pangenome GFA"/>
+        <data name="pirate_rep_sequences_ffn" format="fasta" from_work_dir="results/representative_sequences.ffn" label="${tool.name} on ${on_string}: Representative sequences for each gene family as nucleotide"/>
+        <data name="pirate_rep_sequences_faa" format="fasta" from_work_dir="results/representative_sequences.faa" label="${tool.name} on ${on_string}: Representative sequences for each gene family as amino acid"/>
+
+        <!-- The four alignment outputs below are only created when the align option is enabled. -->
+        <data name="pirate_core_aln" format="fasta" from_work_dir="results/core_alignment.fasta" label="${tool.name} on ${on_string}: Core Alignment FASTA">
+            <filter>output_opts['align']</filter>
+        </data>
+
+        <data name="pirate_core_gff" format="gff" from_work_dir="results/core_alignment.gff" label="${tool.name} on ${on_string}: Core Alignment GFF">
+            <filter>output_opts['align']</filter>
+        </data>
+
+        <data name="pirate_pangenome_aln" format="fasta" from_work_dir="results/pangenome_alignment.fasta" label="${tool.name} on ${on_string}: Pangenome Alignment FASTA">
+            <filter>output_opts['align']</filter>
+        </data>
+
+        <data name="pirate_pangenome_gff" format="gff" from_work_dir="results/pangenome_alignment.gff" label="${tool.name} on ${on_string}: Pangenome Alignment GFF">
+            <filter>output_opts['align']</filter>
+        </data>
+
+        <!-- Only created when the rplots option is enabled. -->
+        <data name="pirate_plots" format="pdf" from_work_dir="results/PIRATE_plots.pdf" label="${tool.name} on ${on_string}: Summary plots of the PIRATE pangenome">
+            <filter>output_opts['rplots']</filter>
+        </data>
+    </outputs>
+
+    <tests>
+        <!-- Test 1: Default parameters -->
+        <test expect_num_outputs="8">
+            <!-- No output option is set, so only the eight unfiltered outputs are expected. -->
+            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
+            <section name="global_opts">
+                <param name="steps" value="50,60,70,80,90,95,98"/>
+                <param name="features" value="CDS"/>
+                <param name="min_len" value="120"/>
+                <conditional name="pan_opts_conditional">
+                    <param name="enable_pan_opt" value="no"/>
+                </conditional>
+            </section>
+            <output name="pangenome_summary" ftype="txt">
+                <assert_contents>
+                    <has_line line="# 4 gene families in 2 genomes."/>
+                    <has_n_lines n="13"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_gene_families" ftype="tsv">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_unique_alleles" ftype="tsv">
+                <assert_contents>
+                    <has_line_matching expression="g03_10\s+g03\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+MRSA252_00002"/>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_nwk" ftype="newick">
+                <assert_contents>
+                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="pangenome_gfa" ftype="gfa1">
+                <assert_contents>
+                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_faa" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+        </test>
+        <!-- Test 2: Testing the align parameter -->
+        <test expect_num_outputs="12">
+            <!-- align is enabled, so the four alignment outputs are expected in addition to the eight unfiltered ones. -->
+            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
+            <section name="global_opts">
+                <param name="steps" value="50,60,70,80,90,95,98"/>
+                <param name="features" value="CDS"/>
+                <param name="min_len" value="120"/>
+                <conditional name="pan_opts_conditional">
+                    <param name="enable_pan_opt" value="no"/>
+                </conditional>
+            </section>
+            <section name="output_opts">
+                <param name="align" value="true"/>
+            </section>
+            <output name="pangenome_summary" ftype="txt">
+                <assert_contents>
+                    <has_line line="# 4 gene families in 2 genomes."/>
+                    <has_n_lines n="13"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_gene_families" ftype="tsv">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_unique_alleles" ftype="tsv">
+                <assert_contents>
+                    <has_line_matching expression="g03_10\s+g03\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+MRSA252_00002"/>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_nwk" ftype="newick">
+                <assert_contents>
+                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="pangenome_gfa" ftype="gfa1">
+                <assert_contents>
+                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_faa" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_pangenome_aln" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="4"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_pangenome_gff" ftype="gff">
+                <assert_contents>
+                    <has_line_matching expression="##sequence-region Pangenome 1 3945"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_core_aln" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="4"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_core_gff" ftype="gff">
+                <assert_contents>
+                    <has_line_matching expression="##sequence-region Pangenome 1 2550"/>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- Test 3: Advanced pangenome options (pan-opt) -->
+        <test expect_num_outputs="8">
+            <!-- The pan-opt conditional is enabled with non-default CD-HIT, DIAMOND and MCL settings;
+                 align is explicitly off, so only the eight unfiltered outputs are expected. -->
+            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
+            <section name="global_opts">
+                <param name="steps" value="50,60,70,80,90,95,98"/>
+                <param name="features" value="CDS"/>
+                <param name="min_len" value="120"/>
+                <conditional name="pan_opts_conditional">
+                    <param name="enable_pan_opt" value="yes"/>
+                    <param name="perc" value="95"/>
+                    <section name="cdhit_opts">
+                        <param name="cd_low" value="98"/>
+                        <param name="cd_step" value="0.5"/>
+                        <param name="cd_core_off" value="true"/>
+                    </section>
+                    <section name="blast_opts">
+                        <param name="evalue" value="0.00001"/>
+                        <param name="diamond" value="true"/>
+                        <param name="diamond_split" value="true"/>
+                        <param name="hsp_len" value="0.1"/>
+                    </section>
+                    <section name="mcl_opts">
+                        <param name="flat" value="2.0"/>
+                    </section>
+                </conditional>
+            </section>
+            <section name="output_opts">
+                <param name="align" value="false"/>
+            </section>
+            <output name="pangenome_summary" ftype="txt">
+                <assert_contents>
+                    <has_line line="# 4 gene families in 2 genomes."/>
+                    <has_n_lines n="13"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_gene_families" ftype="tsv">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_unique_alleles" ftype="tsv">
+                <assert_contents>
+                    <has_line_matching expression="g01_09\s+g01\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+\s+MRSA252_00002"/>
+                    <has_n_lines n="9"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_nwk" ftype="newick">
+                <assert_contents>
+                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="pangenome_gfa" ftype="gfa1">
+                <assert_contents>
+                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_faa" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+        </test>
+
+        <!-- Test 4: Testing PDF reports (rplots) -->
+        <test expect_num_outputs="9">
+            <!-- rplots is enabled, so the PDF plot output is expected in addition to the eight unfiltered ones. -->
+            <param name="input_files" location="https://zenodo.org/records/18470711/files/HO_5096_0412.gff,https://zenodo.org/records/18470711/files/MRSA252.gff"/>
+            <section name="global_opts">
+                <param name="steps" value="50,60,70,80,90,95,98"/>
+                <param name="features" value="CDS"/>
+                <param name="min_len" value="120"/>
+                <conditional name="pan_opts_conditional">
+                    <param name="enable_pan_opt" value="no"/>
+                </conditional>
+            </section>
+            <section name="output_opts">
+                <param name="rplots" value="true"/>
+            </section>
+            <output name="pangenome_summary" ftype="txt">
+                <assert_contents>
+                    <has_line line="# 4 gene families in 2 genomes."/>
+                    <has_n_lines n="13"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_gene_families" ftype="tsv">
+                <assert_contents>
+                    <has_n_lines n="5"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_unique_alleles" ftype="tsv">
+                <assert_contents>
+                    <has_line_matching expression="g03_10\s+g03\s+trpD\s+Anthranilate phosphoribosyltransferase\s+98\s+3\s+1\s+1\s+1\s+1\s+0\s+0\s+0\s+0\s+1\s+Anthranilate phosphoribosyltransferase\(1\)\s+trpD\(1\)\s+243\s+243\s+243\.00\s+MRSA252_00002"/>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_fasta" ftype="fasta">
+                <assert_contents>
+                    <has_line line=">HO_5096_0412"/>
+                    <has_n_lines n="6"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_presence_absence_nwk" ftype="newick">
+                <assert_contents>
+                    <has_line line="(HO_5096_0412:0.152049416,MRSA252:0.152049416);"/>
+                    <has_n_lines n="1"/>
+                </assert_contents>
+            </output>
+            <output name="pangenome_gfa" ftype="gfa1">
+                <assert_contents>
+                    <has_line_matching expression="S\tg01\tA\tRC:i:2"/>
+                    <has_n_lines n="7"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_ffn" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_rep_sequences_faa" ftype="fasta">
+                <assert_contents>
+                    <has_n_lines n="8"/>
+                </assert_contents>
+            </output>
+            <output name="pirate_plots" ftype="pdf">
+                <assert_contents>
+                    <!-- Uses the "size" attribute; "value" is only a deprecated alias for it. -->
+                    <has_size size="14397" delta="100"/>
+                </assert_contents>
+            </output>
+        </test>
+
+    </tests>
+    <help><![CDATA[
+**PIRATE** (Pangenome Iterative Refinement and Threshold Evaluation)
+
+PIRATE is a pangenomics tool that allows for the iterative refinement of pangenomes using multiple identity thresholds. It is designed to handle highly divergent pangenomes and identify orthologs across different evolutionary scales.
+
+**INPUTS**
+
+- One or more annotation files in GFF/GFF3 format (select every genome that should be included in the pangenome).
+
+**CORE OUTPUTS**
+
+1. Pangenome Summary - Summary statistics of gene number and frequency in the pangenome
+2. Tabular summary of all gene families - Complete gene family catalog with one row per family. Families split during paralog detection are labeled with underscores and numbers (e.g., g0001_1, g0001_2). Families are ordered by syntenic position in the pangenome graph.
+3. Tabular summary of all unique alleles - Catalog of unique alleles per gene family, defined as distinct MCL sub-clusters at higher identity thresholds
+4. Binary Presence/Absence data - Binary gene family presence/absence matrix in FASTA format
+5. Binary Presence/Absence Newick data - FastTree phylogeny constructed from the binary presence/absence matrix
+6. Pangenome GFA - Network representation of gene family connections in GFA format (can be visualized with Bandage)
+7. Representative sequences for each gene family as nucleotide - Nucleotide sequences with the longest sequence per family selected as representative (genomes ordered alphabetically)
+8. Representative sequences for each gene family as amino acid - Amino acid sequences corresponding to the nucleotide set
+
+**OPTIONAL OUTPUTS**
+
+1. Core Alignment FASTA - MAFFT-aligned core genome sequences, ordered by gene family table. Reverse-translated when created from CDS. Multi-copy genes represented as ? characters.
+2. Core Alignment GFF - Annotation coordinates and gene/product information for the core alignment
+3. Pangenome Alignment FASTA - MAFFT-aligned full pangenome sequences with the same characteristics as core alignment
+4. Pangenome Alignment GFF - Annotation coordinates for the pangenome alignment
+5. Summary plots of the PIRATE pangenome - Visualization plots summarizing the pangenome analysis
+
+
+    ]]></help>
+    <expand macro="citations"/>
+    <expand macro="creator"/>
+</tool>
\ No newline at end of file