<!-- Source: Mercurial > repos > iuc > chewbbaca_allelecall -->

<tool id="chewbbaca_allelecall" name="ChewBBACA AlleleCall" version="@CHEW_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- The @CHEW_VERSION@, @VERSION_SUFFIX@ and @PROFILE@ tokens and the "requirements" macro
         are presumably defined in the imported macros.xml (not visible here; confirm there). -->
    <description>Determine the allelic profiles of a set of genomes</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements" />
    <command detect_errors="exit_code"><![CDATA[
        ## Stage the inputs: every assembly is symlinked into 'input/' under a sanitized
        ## element identifier (anything other than word characters and '-' becomes '_')
        ## followed by the datatype extension; the schema archive is unpacked into 'schema/'.
        #import re
        mkdir 'input' &&
        mkdir 'schema' &&
        #for $file in $input_file
        #set escaped_element_identifier = re.sub('[^\w\-]', '_', str($file.element_identifier))
        ln -sf '$file' 'input/${escaped_element_identifier}.${file.ext}' &&
        #end for
        unzip '$input_schema' -d 'schema' &&
        ## Parameters declared inside the 'advanced' and 'output' sections are addressed
        ## through their section name, matching the output filters of this tool.
        chewBBACA.py AlleleCall
            #if $advanced.training_file:
                --ptf '$advanced.training_file'
            #end if
            $advanced.cds_input
            #if $advanced.genes_list:
                --gl '$advanced.genes_list'
            #end if
            ## Optional numeric parameters render as an empty string when left blank.
            #if str($advanced.blast_score_ratio) != ""
                --bsr $advanced.blast_score_ratio
            #end if
            #if str($advanced.minimum_length) != ""
                --l $advanced.minimum_length
            #end if
            #if str($advanced.translation_table) != ""
                --t $advanced.translation_table
            #end if
            #if str($advanced.size_threshold) != ""
                --st $advanced.size_threshold
            #end if
            $advanced.no_inferred
            ## Both selects are optional; skip the flag when nothing is selected
            ## instead of passing the literal string None to chewBBACA.
            #if $advanced.prodigal_mode
                --pm $advanced.prodigal_mode
            #end if
            #if $advanced.mode
                --mode $advanced.mode
            #end if
            --force-continue
            ## The multi-select renders as a comma-separated string; an empty selection
            ## yields a list that contains none of the option values.
            #set $selected_outputs = str($output.output_selector).split(',')
            #if 'output_unclassified' in $selected_outputs
                --output-unclassified
            #end if
            #if 'output_missing' in $selected_outputs
                --output-missing
            #end if
            #if 'output_novel' in $selected_outputs
                --output-novel
            #end if
            #if 'hash_profile' in $selected_outputs
            ## It can use any hashing algorithm from hashlib but for simplicity we set it to md5
                --hash-profile md5
            #end if
            -i 'input' -g 'schema/schema_seed/' -o 'output'
    ]]></command>
    <inputs>
        <!-- Required inputs: one or more assemblies and the zipped schema. -->
        <param format="fasta" name="input_file" type="data" multiple="true" label="Genome assemblies in FASTA format"/>
        <param format="zip" name="input_schema" type="data" label="Schema Files in zip format" help="The schema directory contains the loci FASTA files and a folder named 'short' that contains the FASTA files with the loci representative alleles."/>
        <!-- Optional tuning parameters; each is only passed to chewBBACA when set. -->
        <section name="advanced" title="Advanced options">
            <param argument="--genes-list" type="data" format="txt" label="Gene list" optional="true" />
            <param argument="--training-file" type="data" format="binary" label="Prodigal training file" optional="true" help="By default, gets the training file from the schema"/>
            <param argument="--cds-input" type="boolean" truevalue="--cds-input" falsevalue="" checked="false" label="CDS input" optional="true"/>
            <param argument="--blast-score-ratio" type="float" min="0.0" max="1.0" value="" optional="true" label="BLAST Score Ratio value" />
            <param argument="--minimum-length" type="integer" min="0" value="" optional="true" label="Minimum sequence length value"/>
            <param argument="--translation-table" type="integer" min="0" value="" optional="true" help="Must match the genetic code used to create the training file (default: uses value defined in schema config)." label="Genetic code used to predict genes and to translate coding sequences"/>
            <param argument="--size-threshold" type="float" min="0" value="" optional="true" label="CDS size variation threshold"/>
            <!-- Enabling this switch passes the no-inferred flag, i.e. inferred alleles are NOT written to the schema. -->
            <param argument="--no-inferred" type="boolean" truevalue="--no-inferred" falsevalue="" checked="false" optional="true" label="Do not add the sequences of inferred alleles (INF) to the schema" help="Use this parameter if the schema is being accessed by multiple processes/users simultaneously." />
            <param argument="--prodigal-mode" type="select" optional="true" label="Prodigal Mode" help="&quot;single&quot; for finished genomes, reasonable quality draft genomes and big viruses. &quot;meta&quot; for metagenomes, low quality draft genomes, small viruses, and small plasmids">
                <option value="single" selected="true">single</option>
                <option value="meta">meta</option>
            </param>
            <param argument="--mode" type="select" label="Execution mode" optional="true">
                <option value="1">Only exact matches at DNA level</option>
                <option value="2">Exact matches at DNA and Protein level</option>
                <option value="3">Exact matches and minimizer-based clustering to find similar alleles based on BSR+0.1</option>
                <option value="4" selected="true">Full process: exact matches and similar matches based on the BSR value</option>
            </param>
        </section>
        <!-- Each selected value switches on the matching chewBBACA output flag. -->
        <section name="output" title="Output Options">
            <param name="output_selector" type="select" multiple="true" optional="true" display="checkboxes" label="Select / Deselect all">
                <option value="output_unclassified">Create a Fasta file with unclassified coding sequences. (--output-unclassified)</option>
                <option value="output_missing">Create a Fasta file with coding sequences classified as NIPH, NIPHEM, ASM, ALM, PLOT3, PLOT5 and LOTSC. (--output-missing)</option>
                <option value="output_novel">Create Fasta file with the novel alleles inferred during the allele calling. (--output-novel)</option>
                <option value="hash_profile">Create TSV file with hashed allelic profiles. (--hash-profile) </option>
            </param>
        </section>
    </inputs>
    <outputs>
        <!-- Every .tsv file found in the 'output' directory becomes one element of this list;
             the element name is the file name without its extension. -->
        <collection name="allelecall_results"
                    type="list"
                    label="${tool.name} on ${on_string}: AlleleCall Results">
            <discover_datasets directory="output" format="tabular" pattern="(?P&lt;name&gt;.+)\.tsv$"/>
        </collection>
        <!-- Same discovery scheme for .txt files. The collection name (spelled 'allelcall_log')
             is referenced by the tests and therefore left unchanged. -->
        <collection name="allelcall_log"
                    type="list"
                    label="${tool.name} on ${on_string}: AlleleCall Logs">
            <discover_datasets directory="output" format="txt" pattern="(?P&lt;name&gt;.+)\.txt$"/>
        </collection>
        <!-- The FASTA datasets below are only created when the matching value is ticked
             in the output_selector parameter of the 'output' section. -->
        <data name="unclassified_fasta"
              format="fasta"
              label="${tool.name} on ${on_string}: Unclassified fasta"
              from_work_dir="output/unclassified_sequences.fasta">
            <filter>output['output_selector'] and 'output_unclassified' in output['output_selector']</filter>
        </data>
        <data name="missing_fasta"
              format="fasta"
              label="${tool.name} on ${on_string}: Missing fasta"
              from_work_dir="output/missing_classes.fasta">
            <filter>output['output_selector'] and 'output_missing' in output['output_selector']</filter>
        </data>
        <data name="novel_fasta"
              format="fasta"
              label="${tool.name} on ${on_string}: Novel fasta"
              from_work_dir="output/novel_alleles.fasta">
            <filter>output['output_selector'] and 'output_novel' in output['output_selector']</filter>
        </data>
    </outputs>
    <tests>
        <!-- Test 1: single assembly plus schema.zip with the unclassified, missing and hashed-profile
             outputs selected. Four outputs are expected: the two collections and the two FASTA datasets. -->
        <test expect_num_outputs="4">
            <param name="input_file" value="GCA_000007265"/>
            <param name="input_schema" value="schema.zip"/>
            <param name="output_selector" value="output_unclassified,output_missing,hash_profile" />
            <output_collection name="allelecall_results" type="list">
                <element name="cds_coordinates" file="cds_coordinates.tsv" compare="diff"/>
                <element name="loci_summary_stats" file="loci_summary_stats.tsv" compare="diff"/>
                <element name="paralogous_loci" ftype="tabular">
                    <assert_contents>
                        <has_text_matching expression="Genome.*Loci.*CDS"/>
                    </assert_contents>
                </element>
                <!-- NOTE(review): 'results_alleles' is listed twice in this collection (content assertions
                     here, full-file diff further down); confirm both checks are intended. -->
                <element name="results_alleles" ftype="tabular">
                    <assert_contents>
                        <has_text_matching expression="1.*1.*NIPHEM.*1.*1"/>
                        <has_text_matching expression="GCA_000007265.*1"/>
                    </assert_contents>
                </element>
                <element name="results_alleles" file="results_alleles.tsv" compare="diff"/>
                <!-- The hashed profiles are written as a .tsv file and so are collected here. -->
                <element name="results_alleles_hashed" ftype="tabular">
                    <assert_contents>
                        <has_text_matching expression="FILE.*GCA-000007265-protein1.*GCA-000007265-protein10.*GCA-000007265-protein100"/>
                        <has_text_matching expression="GCA_000007265.*308e7666834338d0530d925b2737f2c6.*4aece26d201d59a90947e3400c7abf3f.*ebea148832aa2ae2704d37ebd5123169"/>
                    </assert_contents>
                </element>
                <element name="results_statistics" file="results_statistics.tsv" compare="diff"/>
            </output_collection>
            <output_collection name="allelcall_log" type="list">
                <element name="logging_info" ftype="txt">
                    <assert_contents>
                        <has_text_matching expression="Used a BSR of: 0.6"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output name="unclassified_fasta">
                <assert_contents>
                    <has_text_matching expression="GCA_000007265-protein15"/>
                    <has_text_matching expression="ATGCACCACCTGTCACTTCTGCTCCGAAGAGAAAGCCTATCTCTAGGCCGGTCAGAAGGATGTCAAGACCTGGTAAGGTTCTTCGCGTTGCTTCGAATTAAACCACATGCTCCACCGCTTGTGCGGGCCCCCGTCAATTCCTTTGAGTTTCAACCTTGCGGTCGTACTCCCCAGGCGGAGTGCTTAATGCGTTAG"/>
                </assert_contents>
            </output>
            <output name="missing_fasta">
                <assert_contents>
                    <!-- NOTE(review): the expression is a regular expression, so each unescaped '|' acts as
                         alternation and any single alternative (for example a lone "1") satisfies it.
                         Confirm whether a literal header match was intended. -->
                    <has_text_matching expression="1|GCA_000007265|GCA-000007265-protein16&amp;NIPHEM|GCA_000007265-protein16&amp;EXC"/>
                </assert_contents>
            </output>
        </test>
        <!-- Test 2: same selection with an input named GCA_000007265.fna; the assertions below expect
             the genome identifier to appear as GCA_000007265_fna. -->
        <test expect_num_outputs="4">
            <param name="input_file" value="GCA_000007265.fna"/>
            <param name="input_schema" value="schema.zip"/>
            <param name="output_selector" value="output_unclassified,output_missing,hash_profile" />
            <output_collection name="allelecall_results" type="list">
                <element name="paralogous_loci" ftype="tabular">
                    <assert_contents>
                        <has_text_matching expression="Genome.*Loci.*CDS"/>
                    </assert_contents>
                </element>
                <element name="results_alleles" ftype="tabular">
                    <assert_contents>
                        <has_text_matching expression="1.*1.*NIPHEM.*1.*1"/>
                        <has_text_matching expression="GCA_000007265.*1"/>
                    </assert_contents>
                </element>
                <element name="results_alleles_hashed" ftype="tabular">
                    <assert_contents>
                        <has_text_matching expression="FILE.*GCA-000007265-protein1.*GCA-000007265-protein10.*GCA-000007265-protein100"/>
                        <has_text_matching expression="GCA_000007265_fna.*308e7666834338d0530d925b2737f2c6.*4aece26d201d59a90947e3400c7abf3f.*ebea148832aa2ae2704d37ebd5123169"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output_collection name="allelcall_log" type="list">
                <element name="logging_info" ftype="txt">
                    <assert_contents>
                        <has_text_matching expression="Used a BSR of: 0.6"/>
                    </assert_contents>
                </element>
            </output_collection>
            <output name="unclassified_fasta">
                <assert_contents>
                    <has_text_matching expression="GCA_000007265_fna-protein83"/>
                    <has_text_matching expression="ATGCACCACCTGTCACTTCTGCTCCGAAGAGAAAGCCTATCTCTAGGCCGGTCAGAAGGATGTCAAGACCTGGTAAGGTTCTTCGCGTTGCTTCGAATTAAACCACATGCTCCACCGCTTGTGCGGGCCCCCGTCAATTCCTTTGAGTTTCAACCTTGCGGTCGTACTCCCCAGGCGGAGTGCTTAATGCGTTAG"/>
                </assert_contents>
            </output>
            <output name="missing_fasta">
                <assert_contents>
                    <!-- NOTE(review): same unescaped '|' alternation as in the first test; the alternatives also
                         use GCA_000007265 without the _fna suffix asserted elsewhere in this test. Verify. -->
                    <has_text_matching expression="1|GCA_000007265|GCA-000007265-protein16&amp;NIPHEM|GCA_000007265-protein16&amp;EXC"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help>
chewBBACA is a software suite for the creation and evaluation of core genome and whole genome MultiLocus Sequence Typing (cg/wgMLST) schemas and results.

In chewBBACA, by default, an allele needs to be a CDS defined by Prodigal_. To ensure reproducibility of the CDS prediction, the same Prodigal training file for each bacterial species should be used and provided as input.

.. class:: infomark

**Important**

Although the use of a training file is optional, it is highly recommended to ensure consistent results.

If the schema files are created by chewBBACA v2, please use the PrepExternalSchema module to convert the schema to a format fully compatible with chewBBACA v3.

By default, the AlleleCall module uses the Prodigal training file included in the schema’s directory and it is not necessary to pass a training file to the --ptf parameter.

.. class:: infomark

**Note**

If a text file listing loci IDs or full paths to loci FASTA files (one per line) is passed to the --genes-list parameter, allele calling is performed only for the loci in that list.

.. _Prodigal: https://github.com/hyattpd/Prodigal
    </help>
    <expand macro="citations" />
</tool>