Mercurial > repos > iuc > coreprofiler_allele_calling

<tool id="coreprofiler_allele_calling" name="CoreProfiler allele_calling" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Calls cgMLST alleles from a bacterial genome using a reference scheme</description>
    <!-- Shared tokens (@TOOL_VERSION@, @VERSION_SUFFIX@, @PROFILE@) and the macros
         expanded below are all defined in macro.xml. -->
    <macros>
        <import>macro.xml</import>
    </macros>
    <expand macro="biotools" />
    <expand macro="requirements" />
    <expand macro="version_command" />
    <command detect_errors="aggressive"><![CDATA[
## Collect the query FASTA paths, already wrapped in single quotes for the shell.
## Gzip-compressed inputs are first decompressed into the job working directory
## and the decompressed copy is passed to coreprofiler instead.
#set $input_files = []
#for $i, $input in enumerate($input_file):
    #if str($input.ext).endswith('.gz'):
        gunzip -c '$input' > 'input_${i}.fasta' &&
        #set $decompressed_file = 'input_' + str($i) + '.fasta'
        #silent $input_files.append("'" + $decompressed_file + "'")
    #else:
        #silent $input_files.append("'" + str($input) + "'")
    #end if
#end for

## Scheme and BLAST database locations are stored in the data table relative to
## the entry's base path.
#set $scheme_path = str($input_scheme.fields.path) + '/' + str($input_scheme.fields.scheme_path)
#set $db_path = str($input_scheme.fields.path) + '/' + str($input_scheme.fields.db_path)

## Names of the supplementary outputs that were ticked. Going through str() keeps
## the membership tests below safe when nothing is selected.
#set $selected_outputs = str($scannew_section.output_selection).split(',')

coreprofiler allele_calling
    --query #echo ' '.join($input_files)
    --scheme_dir '$scheme_path'
    --blast_db_path '$db_path'
    --num_threads "\${GALAXY_SLOTS:-1}"

##Autotag
    --autotag_word_size '$autotag_section.autotag_word_size'

##Scannew
    --min_id_new_allele '$scannew_section.min_id_new_allele'
    --min_cov_new_allele '$scannew_section.min_cov_new_allele'
    --min_cov_incomplete '$scannew_section.min_cov_incomplete'


#if str($scannew_section.detailed) == 'true':
    --detailed
#end if
#if str($scannew_section.cds) == 'true':
    -cds
#end if

## Supplementary outputs. The JSON file is written to the working directory and
## collected through from_work_dir on the matching output.
#if 'profiles_w_tmp_alleles_output' in $selected_outputs:
    --profiles_w_tmp_alleles profiles_w_tmp_alleles.json
#end if
#if 'outfa_output' in $selected_outputs:
    --outfa '$outfa'
#end if
#if 'num_alleles_per_locus_output' in $selected_outputs:
    --num_alleles_per_locus '$num_alleles_per_locus'
#end if

    -out '$output_file'

    ]]></command>
    <inputs>
        <!-- One or more assemblies; gzip-compressed files are decompressed by the command block. -->
        <param name="input_file" type="data" format="fasta,fasta.gz" multiple="true" label="Genome assemblies in FASTA/FASTA.GZ format" help="Provide one or more genome assembly files in FASTA/FASTA.GZ format." />
        <!-- Schemes come from the coreprofiler_scheme data table maintained by the Galaxy administrator. -->
        <param name="input_scheme" type="select" label="Reference allele scheme" help="Select a pre-installed reference allele scheme. If the desired scheme is not listed, please contact your Galaxy administrator.">
            <options from_data_table="coreprofiler_scheme">
                <validator message="No reference allele scheme is available" type="no_options" />
            </options>
        </param>
        <section name="autotag_section" title="Autotag Parameters" expanded="true">
            <!-- Lower bound assumes the value is handed to BLASTn as its word size, which must be at least 4. -->
            <param argument="--autotag_word_size" type="integer" min="4" label="Autotag Word Size" help="Word size for autotag BLASTn, defined as 'min length of the shortest allele - 1'. Default is 31." value="31" />
        </section>
        <section name="scannew_section" title="Scannew Parameters" expanded="true">
            <!-- The three thresholds below are percentages, so they are restricted to 0-100. -->
            <param argument="--min_id_new_allele" type="integer" min="0" max="100" label="Minimum identity for new alleles" help="Minimum identity percentage to consider an allele as a new one. Default is 90." value="90" />
            <param argument="--min_cov_new_allele" type="integer" min="0" max="100" label="Minimum coverage for new alleles" help="Minimum coverage percentage to consider an allele as a new one. Default is 90." value="90" />
            <param argument="--min_cov_incomplete" type="integer" min="0" max="100" label="Minimum coverage for incomplete alleles" help="Minimum coverage percentage to consider an allele as incomplete. Only used with the '--detailed' option. Default is 70." value="70" />
            <!-- The command block compares these booleans against the literal string 'true'. -->
            <param argument="--detailed" type="boolean" checked="true" label="Include detailed information on incomplete alleles" help="If enabled, incomplete alleles will be labeled as 'incomplete' instead of just a '-'. Default is on." truevalue="true" falsevalue="" />
            <param argument="-cds" type="boolean" checked="true" label="Extract new alleles within CDS" help="If enabled, new alleles will only be extracted if they are located within coding sequences (CDS regions). Default is on." truevalue="true" falsevalue="" />
            <!-- Option values are referenced by name in the command block and in the output filters. -->
            <param name="output_selection" type="select" display="checkboxes" multiple="true" label="Supplementary output files">
                <option value="profiles_w_tmp_alleles_output" selected="true">JSON file describing files with temporary alleles</option>
                <option value="outfa_output" selected="true">FASTA file containing sequences of newly detected alleles (if any)</option>
                <option value="num_alleles_per_locus_output" selected="true">TSV file containing the number of alleles per locus of a given scheme</option>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- Main allele calling report; always produced. -->
        <data name="output_file"
              format="tabular"
              label="${tool.name} on ${on_string}: Allele calling report"/>

        <!-- Optional outputs: each one is created only when its entry is ticked in
             "Supplementary output files". The leading truthiness check keeps the
             filter valid when nothing is selected. -->
        <data name="profiles_w_tmp_alleles"
              format="json"
              from_work_dir="profiles_w_tmp_alleles.json"
              label="${tool.name} on ${on_string}: Info about files with temporary alleles">
            <filter>scannew_section['output_selection'] and "profiles_w_tmp_alleles_output" in scannew_section['output_selection']</filter>
        </data>
        <data name="outfa"
              format="fasta"
              label="${tool.name} on ${on_string}: Newly detected alleles">
            <filter>scannew_section['output_selection'] and "outfa_output" in scannew_section['output_selection']</filter>
        </data>
        <data name="num_alleles_per_locus"
              format="tsv"
              label="${tool.name} on ${on_string}: Number of alleles per locus of a given scheme">
            <filter>scannew_section['output_selection'] and "num_alleles_per_locus_output" in scannew_section['output_selection']</filter>
        </data>
    </outputs>
    <tests>
        <!-- All tests run against the same Klebsiella scheme with both optional flags
             off and two supplementary outputs selected, so three datasets are
             expected: the report, the temporary-allele JSON and the new-allele FASTA. -->

        <!-- TEST_1: single uncompressed FASTA -->
        <test expect_num_outputs="3">
            <param name="input_file" location="https://zenodo.org/records/15854523/files/10_04A025_hennart2022.fas" ftype="fasta"/>
            <param name="input_scheme" value="coreprofiler_downloaded_20250625_klebsiella_3_scgMLST634_632_loci_bigsdb"/>
            <section name="autotag_section">
                <param name="autotag_word_size" value="31"/>
            </section>
            <section name="scannew_section">
                <param name="min_id_new_allele" value="90"/>
                <param name="min_cov_new_allele" value="90"/>
                <param name="min_cov_incomplete" value="70"/>
                <param name="detailed" value="False"/>
                <param name="cds" value="False"/>
                <param name="output_selection" value="profiles_w_tmp_alleles_output,outfa_output"/>
            </section>
            <output name="output_file" ftype="tabular">
                <assert_contents>
                    <has_text_matching expression="accC_S_6db3bfc0871923e1aedb4998b3c93b37"/>
                </assert_contents>
            </output>
            <output name="profiles_w_tmp_alleles" ftype="json">
                <assert_contents>
                    <has_text_matching expression="tmp_loci"/>
                </assert_contents>
            </output>
            <output name="outfa" ftype="fasta">
                <assert_contents>
                    <has_text_matching expression="accD_S_2e603343825da09506bdc8ea287f5584"/>
                </assert_contents>
            </output>
        </test>

        <!-- TEST_2: single gzip-compressed FASTA (exercises the gunzip branch) -->
        <test expect_num_outputs="3">
            <param name="input_file" location="https://zenodo.org/records/16271909/files/10_04A025_hennart2022.fas.gz" ftype="fasta.gz"/>
            <param name="input_scheme" value="coreprofiler_downloaded_20250625_klebsiella_3_scgMLST634_632_loci_bigsdb"/>
            <section name="autotag_section">
                <param name="autotag_word_size" value="31"/>
            </section>
            <section name="scannew_section">
                <param name="min_id_new_allele" value="90"/>
                <param name="min_cov_new_allele" value="90"/>
                <param name="min_cov_incomplete" value="70"/>
                <param name="detailed" value="False"/>
                <param name="cds" value="False"/>
                <param name="output_selection" value="profiles_w_tmp_alleles_output,outfa_output"/>
            </section>
            <output name="output_file" ftype="tabular">
                <assert_contents>
                    <has_text_matching expression="accC_S_6db3bfc0871923e1aedb4998b3c93b37"/>
                </assert_contents>
            </output>
            <output name="profiles_w_tmp_alleles" ftype="json">
                <assert_contents>
                    <has_text_matching expression="tmp_loci"/>
                </assert_contents>
            </output>
            <output name="outfa" ftype="fasta">
                <assert_contents>
                    <has_text_matching expression="accD_S_2e603343825da09506bdc8ea287f5584"/>
                </assert_contents>
            </output>
        </test>

        <!-- TEST_3: several inputs mixing fasta.gz and fasta files -->
        <test expect_num_outputs="3">
            <param name="input_file" location="https://zenodo.org/records/16271909/files/10_04A025_hennart2022.fas.gz,https://zenodo.org/records/15854523/files/10_04A025_hennart2022.fas,https://zenodo.org/records/16271909/files/10_04A025_hennart2022.fas.gz"/>
            <param name="input_scheme" value="coreprofiler_downloaded_20250625_klebsiella_3_scgMLST634_632_loci_bigsdb"/>
            <section name="autotag_section">
                <param name="autotag_word_size" value="31"/>
            </section>
            <section name="scannew_section">
                <param name="min_id_new_allele" value="90"/>
                <param name="min_cov_new_allele" value="90"/>
                <param name="min_cov_incomplete" value="70"/>
                <param name="detailed" value="False"/>
                <param name="cds" value="False"/>
                <param name="output_selection" value="profiles_w_tmp_alleles_output,outfa_output"/>
            </section>
            <output name="output_file" ftype="tabular">
                <assert_contents>
                    <has_text_matching expression="accC_S_6db3bfc0871923e1aedb4998b3c93b37"/>
                </assert_contents>
            </output>
            <output name="profiles_w_tmp_alleles" ftype="json">
                <assert_contents>
                    <has_text_matching expression="tmp_loci"/>
                </assert_contents>
            </output>
            <output name="outfa" ftype="fasta">
                <assert_contents>
                    <has_text_matching expression="accD_S_2e603343825da09506bdc8ea287f5584"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
CoreProfiler allele_calling
===========================

*CoreProfiler allele_calling* performs cgMLST profiling by identifying alleles present in a bacterial genome assembly based on a reference allele scheme.

It works in two steps:

1. **Autotag** (exact matches on reference database):

This step runs BLASTn with 100% identity, no gaps, and parameters optimized for speed.
The output is parsed, and only results with 100% coverage are kept.

2. **Scannew** (detection of new alleles):

In this step, loci with no exact match from the autotag step are selected. A BLAST database is created for those loci files, and a looser BLASTn search is performed.

The best hit is kept for each locus and classified as follows (with the default thresholds):

   * If identity > 90% and coverage > 90%, the locus is considered a new allele.
   * If identity > 90% and coverage between 70% and 90%, the allele is present but incomplete ("X").
   * If identity < 90% or coverage < 70%, the allele is considered missing ("-").


Usage
-----

1. Select one or more genome assemblies (in FASTA or gzip-compressed FASTA format).
2. Select a pre-installed reference allele scheme. This requires you to specify the specific organism you are using.
3. Change the parameters (or not) to perform allele matching and detection.

   * **autotag**: Finds exact allele matches using BLASTn with 100% identity and coverage.
   * **scannew**: Detects potential new or incomplete alleles through relaxed BLASTn searches.

4. Run the tool.

Input
-----

Genome Assembly
````````````````

Each input genome assembly must be in FASTA format (optionally gzip-compressed). These are the sequences that will be scanned for matching alleles; several assemblies can be provided in a single run.


Reference Database
``````````````````

The reference database is a pre-installed scheme of known alleles and loci used to identify alleles in the provided genome. Ensure you specify the appropriate reference database for the organism or group under study. If the desired scheme is not listed, please contact your Galaxy administrator.


Output
------

The `allele_calling` command produces a tabular profile listing the identified alleles for each locus, including new or missing alleles when applicable.

Supplementary output files
``````````````````````````

The **scannew** step in CoreProfiler provides three additional output files to offer more detailed insights into the analysis:

   * `--outfa`: FASTA file containing the sequences of new alleles detected during the analysis.
   * `--profiles_w_tmp_alleles`: JSON file providing detailed information about files containing temporary alleles.
   * `--num_alleles_per_locus`: TSV file listing the number of alleles detected for each locus in a given scheme.

    ]]></help>
    <expand macro="citations"/>
</tool>
author	iuc
date	Thu, 18 Dec 2025 15:21:33 +0000
parents	41b933d366e3
children