Mercurial > repos > iuc > ppanggolin_rarefaction

<tool id="ppanggolin_rarefaction" name="PPanGGOLiN rarefaction" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.0">
    <description>computes the rarefaction curve of the pangenome</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>

    <command detect_errors="exit_code"><![CDATA[
        ## Build a plain Python list of the ticked optional outputs. An optional
        ## multi-select with nothing ticked stringifies to "None", so membership
        ## tests on this list are safe even when the selection is empty
        ## (mirrors the guard used in the output filters).
        #set $optional_files = str($advanced_pangenome_optional_files).split(',')

        ## Scratch directories: one for the PPanGGOLiN results, one for its temporary files.
        mkdir -p ./tmp_ppanggolin/rarefaction &&
        mkdir -p ./tmp_ppanggolin/tmpdir_rarefaction &&

        ppanggolin rarefaction
        --pangenome '$pangenome_h5'
        --output ./tmp_ppanggolin/rarefaction
        --tmpdir ./tmp_ppanggolin/tmpdir_rarefaction
        --force
        --cpu "\${GALAXY_SLOTS:-4}"
        --disable_prog_bar

        --depth $depth
        --min $min
        --max $max

        #if str($nb_of_partitions) != "":
            --nb_of_partitions $nb_of_partitions
        #end if

        ## Copy only the requested optional files into their Galaxy datasets.
        #if "output_rarefaction_csv" in $optional_files:
            && cat ./tmp_ppanggolin/rarefaction/rarefaction.csv > '${rarefaction_csv}'
        #end if
        #if "output_rarefaction_parameters_csv" in $optional_files:
            && cat ./tmp_ppanggolin/rarefaction/rarefaction_parameters.csv > '${rarefaction_parameters_csv}'
        #end if
        ## The HTML curve is always exported.
        && cat ./tmp_ppanggolin/rarefaction/rarefaction_curve.html > '${rarefaction_curve_html}'

    ]]></command>

    <inputs>

        <!-- Pangenome HDF5 input; the parameter itself is defined in macros.xml. -->
        <expand macro="inputs_pangenome"/>

        <!-- Sampling controls passed straight through to `ppanggolin rarefaction`. -->
        <param argument="--depth" type="integer" value="30" min="1" max="100" label="The number of sampling for each genome" help="Default=30 ; min=1 ; max=100. Warning: if this value is greater than 30, the computation will be VERY intensive and it will take a long time."/>

        <!-- NOTE: "min lower than max" is only documented in the help text; no validator enforces it here. -->
        <param argument="--min" type="integer" value="1" min="1" max="499" label="The minimal number of genomes in a sample" help="Default=1 ; min=1 ; max=499. The min value must be lower than the max value."/>

        <param argument="--max" type="integer" value="100" min="1" max="500" label="The maximal number of genomes in a sample" help="Default=100 ; min=1 ; max=500. The min value must be lower than the max value. Warning: if this value is greater than 100, the computation will be VERY intensive and it will take a long time."/>

        <expand macro="inputs_nb_of_partitions"/>

        <!-- Optional CSV outputs; both are ticked by default. The HTML curve is always produced. -->
        <param name="advanced_pangenome_optional_files" type="select" label="Add the following output files in the Galaxy history" multiple="true" optional="true" display="checkboxes">
            <option value="output_rarefaction_csv" selected="true">Rarefaction data in tabular format (csv)</option>
            <option value="output_rarefaction_parameters_csv" selected="true">Rarefaction parameters in tabular format (csv)</option>
        </param>

    </inputs>

    <outputs>

        <!-- Optional: rarefaction data table, only created when its checkbox is ticked. -->
        <data name="rarefaction_csv"
              format="csv"
              label="PPanGGOLiN rarefaction on ${on_string}: Rarefaction data (csv)">
            <filter>advanced_pangenome_optional_files and "output_rarefaction_csv" in advanced_pangenome_optional_files</filter>
        </data>

        <!-- Optional: rarefaction parameters table, only created when its checkbox is ticked. -->
        <data name="rarefaction_parameters_csv"
              format="csv"
              label="PPanGGOLiN rarefaction on ${on_string}: Rarefaction parameters (csv)">
            <filter>advanced_pangenome_optional_files and "output_rarefaction_parameters_csv" in advanced_pangenome_optional_files</filter>
        </data>

        <!-- Always created: the rarefaction curve as an HTML page. -->
        <data name="rarefaction_curve_html"
              format="html"
              label="PPanGGOLiN rarefaction on ${on_string}: Rarefaction curve"/>

    </outputs>

    <tests>
        <!-- Default run: both optional CSV files are selected, so three datasets are expected.
             Outputs are checked by size only (expected byte count with a tolerance). -->
        <test expect_num_outputs="3">
            <param name="pangenome_h5" value="h5/test_data.h5" ftype="h5"/>
            <param name="depth" value="30"/>
            <param name="min" value="1"/>
            <param name="max" value="100"/>
            <output name="rarefaction_csv">
                <assert_contents><has_size value="4045" delta="100"/></assert_contents>
            </output>
            <output name="rarefaction_parameters_csv">
                <assert_contents><has_size value="324" delta="50"/></assert_contents>
            </output>
            <output name="rarefaction_curve_html">
                <assert_contents><has_size value="4575692" delta="100"/></assert_contents>
            </output>
        </test>
    </tests>

    <help><![CDATA[

        PPanGGOLiN_ (Gautreau et al. 2020) is a software suite used to create and manipulate prokaryotic pangenomes from a set of either assembled
        genomic DNA sequences or provided genome annotations. PPanGGOLiN builds pangenomes through a graphical model and a statistical method to partition gene
        families in persistent, shell and cloud genomes. It integrates both information on protein-coding genes and their genomic neighborhood to build a graph
        of gene families where each node is a gene family, and each edge is a relation of genetic contiguity.

        The `ppanggolin rarefaction` command generates a rarefaction curve. It represents the evolution of the number of gene families for each partition as you add more genomes to the pangenome. It has been used a lot in the literature as an indicator of the diversity that you are missing with your dataset on your taxonomic group (Tettelin et al., 2005). The idea is that if at some point when you keep adding genomes to your pangenome you do not add any more gene families, you might have access to your entire taxonomic group’s diversity. On the contrary, if you are still adding a lot of genes, you may still be missing a lot of gene families.

        There are 8 partitions represented. For each of the partitions, there are multiple representations of the observed data. You can find the observed means, medians, 1st and 3rd quartiles of the number of gene families per number of genomes used. You can also find the best fitting of the data by the Heaps’ law, which is usually used to represent this evolution of the diversity in terms of gene families in each of the partitions. See the documentation_ for more details.

        .. _PPanGGOLiN: https://github.com/labgem/PPanGGOLiN
        .. _documentation: https://ppanggolin.readthedocs.io/en/latest/user/PangenomeAnalyses/pangenomeAnalyses.html#rarefaction-curve

    ]]></help>

    <expand macro="citation"/>

</tool>
author	iuc
date	Mon, 24 Nov 2025 12:54:50 +0000
parents	d848a49b3303
children