<!-- Mercurial > repos > iuc > psauron -->

<tool id="psauron" name="Psauron" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <!-- One-line summary of the wrapped program. -->
    <description>Machine learning model for rapid assessment of protein coding gene annotation</description>
    <!-- macros.xml is expected to supply the @TOOL_VERSION@, @VERSION_SUFFIX@ and @PROFILE@ tokens
         used on the enclosing tool element, plus the "requirements" macro expanded below
         (TODO confirm against macros.xml, which is not part of this file). -->
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="requirements"/>
    <!-- Command whose output is recorded as the version of the underlying program. -->
    <version_command>psauron --version</version_command>
    <command detect_errors="exit_code">
        <![CDATA[
        ## Build the psauron command line from the tool inputs.
        ## -c is always passed and is not exposed as a tool parameter
        ## (presumably forces CPU execution; confirm with `psauron --help`).
        psauron
        -i '$i'
        -c
        ## minimum-length, inframe and outframe are optional inputs: emit each
        ## flag only when a value is present, so a cleared field never reaches
        ## psauron as an empty or 'None' argument.
        #if str($minimum_length).strip() not in ('', 'None')
            --minimum-length '$minimum_length'
        #end if
        ## 'None' is only the placeholder default of the exclude text box.
        ## Per the parameter label the filter matches text contained in the
        ## FASTA description, case-insensitively, so passing the placeholder
        ## through would drop every record whose description contains "none".
        #if str($exclude).strip() not in ('', 'None')
            --exclude '$exclude'
        #end if
        #if str($inframe).strip() not in ('', 'None')
            --inframe '$inframe'
        #end if
        #if str($outframe).strip() not in ('', 'None')
            --outframe '$outframe'
        #end if
        ## Boolean switches render as their flag or as an empty string.
        $s
        $p
        -o '$psauron_score_file'
        ]]>
    </command>
    <inputs>
        <!-- Sequence file to score. -->
        <param argument="-i"
               type="data"
               format="fasta"
               label="Input fasta"
               help="FASTA with spliced CDS sequence or protein sequence. A spliced CDS fasta can be created from a GTF/GFF and a reference FASTA by using gffread."/>

        <!-- Optional filters and score thresholds; each one carries a default value. -->
        <param argument="--minimum-length"
               type="integer"
               min="0"
               value="5"
               optional="true"
               label="Exclude all proteins shorter than m amino acids"/>
        <param argument="--exclude"
               type="text"
               value="None"
               optional="true"
               label="Exclude any CDS where FASTA description contains given text (case invariant), e.g. hypothetical"/>
        <param argument="--inframe"
               type="float"
               min="0"
               max="1"
               value="0.5"
               optional="true"
               label="Probability threshold used to determine final psauron score, in-frame, higher number decreases sensitivity and increases specificity, default=0.5"/>
        <param argument="--outframe"
               type="float"
               min="0"
               max="1"
               value="0.5"
               optional="true"
               label="Probability threshold used to determine final psauron score, out-of-frame, higher number increases sensitivity and decreases specificity, default=0.5"/>

        <!-- The -c option is not offered as a parameter; the command section passes it unconditionally. -->

        <!-- Boolean switches: each renders as its flag when set to Yes and as nothing otherwise. -->
        <param argument="-s"
               type="boolean"
               truevalue="-s"
               falsevalue=""
               checked="false"
               label="Score only the in-frame CDS, which may lower accuracy of the model"/>
        <param argument="-p"
               type="boolean"
               truevalue="-p"
               falsevalue=""
               checked="false"
               label="Set to Yes if your FASTA contains amino acid protein sequence, which may lower accuracy of the model"/>

        <!-- The -v option is not offered as a parameter. -->
    </inputs>
    <outputs>
        <!-- Single CSV result; the command section hands this dataset's path to psauron via -o. -->
        <data name="psauron_score_file"
              format="csv"
              label="${tool.name} on ${on_string}: csv with scores for all reading frames"/>
    </outputs>
    <tests>
        <!-- Test 1: nucleotide coding sequences (CDS); every other parameter keeps its default. -->
        <test expect_num_outputs="1">
            <param name="i" value="seq_test_CDS.fa" ftype="fasta"/>
            <output name="psauron_score_file" ftype="csv">
                <assert_contents>
                    <has_text_matching expression="psauron score"/>
                </assert_contents>
            </output>
        </test>

        <!-- Test 2: protein (amino acid) sequences, with the -p switch turned on. -->
        <test expect_num_outputs="1">
            <param name="i" value="seq_test_protein.faa" ftype="fasta"/>
            <param name="p" value="true"/>
            <output name="psauron_score_file" ftype="csv">
                <assert_contents>
                    <has_text_matching expression="psauron score"/>
                </assert_contents>
            </output>
        </test>
    </tests>
    <help><![CDATA[
.. class:: infomark

**PSAURON**

PSAURON is a machine learning model for rapid assessment of protein-coding gene annotation.

PSAURON (Protein Sequence Assessment Using a Reference ORF Network) is a software tool developed to help assess the quality of protein-coding gene annotations. Utilizing a machine learning model trained on a diverse dataset from over 1000 plant and animal genomes, PSAURON assigns a score to coding DNA or protein sequence that reflects the likelihood that the sequence is a genuine protein-coding region.

PSAURON scores can be used for genome-wide protein annotation assessment as well as the rapid identification of potentially spurious annotated proteins. Validation against established benchmarks demonstrates PSAURON’s effectiveness and correlation with recognized measures of protein quality, highlighting its potential use as a widely applicable method to evaluate precision in gene annotation.

]]></help>
    <citations>
        <!-- DOI of the reference to cite when results from this tool are published. -->
        <citation type="doi">10.1093/nargab/lqae189</citation>
    </citations>
</tool>