Mercurial > repos > bebatut > sortmerna

<tool id="sortmerna" name="SortMeRNA" version="0.1.0">
    <description>to filter ribosomal RNAs in metatranscriptomic data</description>

    <!-- Software dependency to resolve before the job runs: the "sortmerna"
         package, version 2.0. The command template calls both indexdb_rna and
         sortmerna; presumably both are shipped by this package (confirm). -->
    <requirements>
        <requirement type="package" version="2.0">sortmerna</requirement>
    </requirements>

    <!-- Exit-code rule: the range "1:" covers every exit code from 1 upwards.
         No level attribute is given, so the Galaxy default severity applies
         (presumably "fatal"; confirm against the tool XML schema of the
         targeted Galaxy release). -->
    <stdio>
        <exit_code range="1:" />
    </stdio>

    <!-- Shell command used to record the tool version: stderr is merged into
         stdout and only the line containing "SortMeRNA version" is kept. -->
    <version_command>
<![CDATA[
sortmerna --version 2>&1|grep 'SortMeRNA version'
]]>
    </version_command>

    <!-- Command template (Cheetah). It builds a colon-separated list of
         "fasta,index" reference pairs, indexes the references with indexdb_rna
         and then runs sortmerna on the input reads with the options selected in
         the inputs section. -->
    <command>
<![CDATA[
    ## ref accumulates the "fasta,index" pairs; sep is empty for the first pair
    ## and ':' afterwards, so the pairs end up joined as pair1:pair2:...
    #set $ref = ''
    #set $sep=''

    #if str( $databases.databases_selector ) == 'history'
        ## history datasets: the index name is the dataset file name without its
        ## extension (a relative path, no directory part)
        #for $db in $databases.databases_name
            #set $ref += $sep + str($db) + ',' + $os.path.splitext($os.path.basename(str($db)))[0]
            #set $sep = ':'
        #end for
    #else
        ## databases path is not directly accessible, must match by hand with LOC file contents
        ## data_table maps column 0 of each data table row to column 2 of the same row
        #set $data_table = dict([(_[0], _[2]) for _ in $databases.databases_input.input.options.tool_data_table.data])
        ## cached databases: the index name is the database path without its extension
        #for $db in $databases.databases_input.value
            #set $ref += $sep + $data_table[$db] + ',' + $os.path.splitext($data_table[$db])[0]
            #set $sep = ':'
        #end for
    #end if

    indexdb_rna --ref $ref -L $seed_length --max_pos $max_pos

    &&

    sortmerna
        --ref $ref
        --reads $input_sequence_file
        --aligned aligned

        $fastx.fastx_test
        #if $fastx.fastx_test == '--fastx'
            #if $fastx.fastx_rejected
                --other other_file
            #end if
        #end if

        $sam.sam_test
        #if $sam.sam_test == '--sam'
            $sam.sam_sq_tag
        #end if

        $blast_format

        $log

        #if $report.report_type == 'best'
            #if $report.report_best.report_best_type == '0'
                --best 0
            #else if $report.report_best.report_best_type == '1'
                --best 1
                --min_lis $report.report_best.report_best_min_lis
            #else
                ## custom number of searched alignments; uses the same --min_lis
                ## option as the '1' branch above
                --best $report.report_best.report_best_value
                --min_lis $report.report_best.report_best_min_lis
            #end if
        #else
            #if $report.report_num_alignments.report_num_alignments_type == '0'
                --num_alignments 0
            #else if $report.report_num_alignments.report_num_alignments_type == '1'
                --num_alignments 1
            #else
                --num_alignments $report.report_num_alignments.report_num_alignments_value
            #end if
        #end if

        -e $e_value
        --match $match
        --mismatch $mismatch
        --gap_open $gap_open
        --gap_ext $gap_ext
        -N $ambiguous_letter

        ## 'both' adds neither flag
        #if $strand == 'forward'
            -F
        #end if
        #if $strand == 'reverse'
            -R
        #end if
]]>
    </command>

    <!-- User-facing parameters. Select options whose value is a command-line
         flag are inserted verbatim into the command template, so each value
         must be a well-formed shell fragment. Every select option of a
         conditional has an explicit "when" branch, empty where the option adds
         no further parameter. -->
    <inputs>
        <!-- Reads to filter -->
        <param name="input_sequence_file" type="data" format="fastq,fasta" label="Input sequence file" help=""/>

        <!-- Reference rRNA databases: entries of the sortmerna_rRNA_databases
             data table, or FASTA datasets taken from the history -->
        <conditional name="databases">
            <param name="databases_selector" type="select" label="Databases to query" help="">
                <option value="cached" selected="true">Public ribosomal databases</option>
                <option value="history">Databases from your history</option>
            </param>
            <when value="cached">
                <param name="databases_input" label="rRNA databases" type="select" display="checkboxes" multiple="true">
                    <options from_data_table="sortmerna_rRNA_databases" />
                    <validator type="no_options" message="Select at least one database"/>
                </param>
            </when>
            <when value="history">
                <param name="databases_name" type="data" format="fasta" multiple="true" label="rRNA databases"
                    help=""/>
            </when>
        </conditional>

        <!-- Fasta/FastQ output of aligned reads, optionally with the rejected reads -->
        <conditional name="fastx">
            <param name="fastx_test" type='select' label="Output into Fasta/FastQ file?" help="">
                <option value="--fastx">Yes</option>
                <option value="">No</option>
            </param>
            <when value="--fastx">
                <param name='fastx_rejected' type='boolean' checked="true" label="Conserve rejected reads?" help=""/>
            </when>
            <when value=""/>
        </conditional>

        <!-- SAM output, optionally with SQ tags -->
        <conditional name="sam">
            <param name="sam_test" type='select' label="Output SAM alignments?" help="">
                <option value="--sam">Yes</option>
                <option value="">No</option>
            </param>
            <when value="--sam">
                <param name='sam_sq_tag' type='boolean' checked="true" truevalue="--SQ" falsevalue="" label="Add SQ tags to SAM file?" help=""/>
            </when>
            <when value=""/>
        </conditional>

        <!-- BLAST-like output format; the empty value disables this output -->
        <param name="blast_format" type="select" display="radio" label="Format for BLAST output" help="">
            <option value="--blast 0">Pairwise</option>
            <option value="--blast 1">Tabular (Blast -m 8 format)</option>
            <option value="--blast 2">Tabular + column for CIGAR</option>
            <option value="--blast 3" selected="true">Tabular + columns for CIGAR and query coverage</option>
            <option value="">No Blast output</option>
        </param>

        <param name='log' type='boolean' checked="true" truevalue="--log" falsevalue="" label="Conserve overall statistic output into a log file?" help=""/>

        <!-- Reporting mode: best alignments or first alignments per read -->
        <conditional name="report">
            <param name="report_type" type="select" display="radio" label="Parameters for filtering and read mapping" help="">
                <option value="best" selected="true">Report best alignments per read reaching E-value</option>
                <option value="num_alignments">Report first alignements per read reaching E-value</option>
            </param>
            <when value="best">
                <conditional name="report_best">
                    <param name="report_best_type" type="select" display="radio" label="Number of searched alignments" help="Only the best alignment is reported">
                        <option value="0">All high-candidate reference sequences are searched for alignments (very slow)</option>
                        <option value="1" selected="true">Only one high-candidate reference sequence is searched for alignments (fast). The high-candidate sequences are determined heuristically using a LIS of seed matches)</option>
                        <option value="other_value">A custom number of reference sequences are searched for alignments (speed decrease for high value)</option>
                    </param>
                    <when value="0"/>
                    <when value="other_value">
                        <param name="report_best_value" type="integer" min="0" max="100" value="1" label="Number of alignments to be made" help="Only the best one is reported. The computation speed decrease with high value"/>
                        <param name="report_best_min_lis" type="integer" min="0" max="100" value="2" label="Number of longest LIS an alignement needs to be searched" help="The alignements having the first INT longest LIS. LIS stands for Longest Increasing Subsequence, it is computed using seeds' positions to expand hits into longer matches prior to Smith-Waterman alignment."/>
                    </when>
                    <when value="1">
                        <param name="report_best_min_lis" type="integer" min="0" max="100" value="2" label="Number of longest LIS an alignement needs to be searched" help="The alignements having the first INT longest LIS. LIS stands for Longest Increasing Subsequence, it is computed using seeds' positions to expand hits into longer matches prior to Smith-Waterman alignment."/>
                    </when>
                </conditional>
            </when>
            <when value="num_alignments">
                <conditional name="report_num_alignments">
                    <param name="report_num_alignments_type" type="select" display="radio" label="Number of output alignments" help="">
                        <option value="0">All alignments reaching the E-value threshold are reported (very slow, this option is not suggested for high similarity rRNA databases)</option>
                        <option value="1" selected="true">The first alignment passing E-value threshold are reported (very fast, best choice if only filtering is needed)</option>
                        <option value="other_value">A custom number of alignments are made and reported (speed decrease for high value)</option>
                    </param>
                    <when value="0"/>
                    <when value="1"/>
                    <when value="other_value">
                        <param name="report_num_alignments_value" type="integer" min="0" max="100" value="1" label="Number of alignments to be made and reported" help=""/>
                    </when>
                </conditional>
            </when>
        </conditional>

        <!-- E-value threshold and Smith-Waterman (SW) scoring -->
        <param name="e_value" type="float" min="0" max="10" value="1" label="E-value threshold" help=""/>
        <param name="match" type="integer" min="0" max="10" value="2" label="SW score for a match" help=""/>
        <param name="mismatch" type="integer" min="-10" max="0" value="-3" label="SW penalty for a mismatch" help=""/>
        <param name="gap_open" type="integer" min="0" max="10" value="5" label="SW penalty for introducing a gap" help=""/>
        <param name="gap_ext" type="integer" min="0" max="10" value="2" label="SW penalty for extending a gap" help=""/>
        <param name="ambiguous_letter" type="integer" min="-10" max="0" value="-3" label="SW penalty for ambiguous letters (N's)" help=""/>

        <!-- Strand selection: "forward" and "reverse" each add a flag, "both" adds none -->
        <param name="strand" type="select" display="radio" label="Search on" help="">
            <option value="both" selected="true">Both strands</option>
            <option value="forward" >Only forward strand</option>
            <option value="reverse" >Only reverse-complementary strand</option>
        </param>

        <!-- Options passed to indexdb_rna when the databases are indexed -->
        <param name="seed_length" type="integer" min="0" max="100" value="18" label="Seed length for database indexing" help=""/>
        <param name="max_pos" type="integer" min="0" max="100000" value="10000" label="Maximum number of positions to store for each k-mer for database indexing" help="With 0, all positions are stored"/>
    </inputs>

    <!-- Output datasets. Each one is collected from a file that sortmerna
         writes in the job working directory (prefix "aligned", or "other_file"
         for rejected reads) and is created only when its filter expression
         evaluates to a true value.
         NOTE(review): every data element carries a "metadata" attribute;
         Galaxy documents "metadata_source" for inheriting metadata from an
         input, so this attribute is presumably ignored. Confirm before
         renaming, since that would change behaviour. -->
    <outputs>
        <!-- Aligned reads, same datatype as the input reads. Presumably
             sortmerna reuses the extension of the reads file (".dat" for
             Galaxy datasets); confirm. -->
        <data format_source="input_sequence_file" name="aligned_sequence_file"
            metadata="input_sequence_file" from_work_dir="aligned.dat"
            label="Aligned sequences on ${on_string} (SortMeRNA)">
            <filter>((fastx['fastx_test']))</filter>
        </data>

        <!-- Rejected reads: only when Fasta/FastQ output is on and rejected reads are kept -->
        <data format_source="input_sequence_file" name="rejected_sequence_file"
            metadata="input_sequence_file" from_work_dir="other_file.dat"
            label="Rejected sequences on ${on_string} (SortMeRNA)">
            <filter>((fastx['fastx_test'] and fastx['fastx_rejected']))</filter>
        </data>

        <!-- SAM alignments: only when the SAM option is selected -->
        <data format="sam" name="sam_alignment_file" metadata="input_sequence_file"
            from_work_dir="aligned.sam"
            label="SAM alignments on ${on_string} (SortMeRNA)">
            <filter>((sam['sam_test']))</filter>
        </data>

        <!-- BLAST-like output: tabular, switched to txt for the pairwise format -->
        <data format="tabular" name="blast_output_file"
            metadata="input_sequence_file" from_work_dir="aligned.blast"
            label="Blast alignments on ${on_string} (SortMeRNA)">
            <filter>blast_format</filter>
            <change_format>
                <when input="blast_format" value="--blast 0" format="txt" />
            </change_format>
        </data>

        <!-- Log with overall statistics: only when the log option is checked -->
        <data format="txt" name="output_log" metadata="input_sequence_file"
            from_work_dir="aligned.log" label="Log on ${on_string} (SortMeRNA)">
            <filter>log</filter>
        </data>
    </outputs>

    <!-- Functional test: filters input_sequences.fastq against a database from
         the history, with Fasta/FastQ output (rejected reads kept), tabular
         BLAST output with CIGAR and query coverage, no SAM and no log. -->
    <tests>
        <test>
            <!-- reads and reference database -->
            <param name="input_sequence_file" value="input_sequences.fastq" ftype="fastq"/>
            <param name="databases_selector" value="history"/>
            <param name="databases_name" value="db.fasta" ftype="fasta"/>

            <!-- output selection -->
            <param name="fastx_test" value="--fastx"/>
            <param name="fastx_rejected" value="True"/>
            <param name="sam_test" value=""/>
            <param name="blast_format" value="--blast 3"/>
            <param name="log" value=""/>

            <!-- reporting mode -->
            <param name="report_type" value="best"/>
            <param name="report_best_type" value="1"/>
            <param name="report_best_min_lis" value="2"/>

            <!-- E-value threshold and alignment scoring -->
            <param name="e_value" value="1"/>
            <param name="match" value="2"/>
            <param name="mismatch" value="-3"/>
            <param name="gap_open" value="5"/>
            <param name="gap_ext" value="2"/>
            <param name="ambiguous_letter" value="-3"/>
            <param name="strand" value="both"/>

            <!-- database indexing -->
            <param name="seed_length" value="18"/>
            <param name="max_pos" value="10000"/>

            <!-- expected datasets -->
            <output name="aligned_sequence_file" file="aligned_sequences.fastq" ftype="fastq"/>
            <output name="rejected_sequence_file" file="rejected_sequences.fastq" ftype="fastq"/>
            <output name="blast_output_file" file="blast_output.tabular" ftype="tabular"/>
        </test>
    </tests>

    <help><![CDATA[

**What it does**

SortMeRNA is a tool for RNA filtering based on local sequence alignment against
rRNA databases. For more information, check the `user manual <http://bioinfo.lifl.fr/RNA/sortmerna/code/SortMeRNA-user-manual-v1.7.pdf>`_.

-----

**Input**

The input is a sequence file in fasta or fastq and databases to search against.
These databases have to be indexed before the sequence alignment.

SortMeRNA is distributed with 8 rRNA databases constructed from SILVA SSU,LSU
(version 111) and the RFAM 5/5.8S (version 11.0) databases:

    - SILVA 16S bacteria
    - SILVA 16S archaea
    - SILVA 18S eukarya
    - SILVA 23S bacteria
    - SILVA 23S archaea
    - SILVA 28S eukarya
    - Rfam 5S archaea/bacteria
    - Rfam 5.8S eukarya

These databases are available as public ribosomal databases. But local databases
can also be used.

-----

**Parameters**

The database index can be modulated by:

    - Seed length
    - Maximum number of positions to store for each k-mer for database indexing

For RNA sorting, the parameters are:

    - Test to output files in fasta or fastq, in sam and/or in blast format
    - Test for conservation of rejected sequences
    - Choice in blast format
    - Test to add SQ tags in sam file
    - Filtering and read mapping parameters
        - Test for conservation of best alignment or first alignment
        - Number of searched, conserved alignments
    - E-value threshold
    - SW score for a match, for a mismatch, for introducing a gap, for extending a gap, for ambiguous letters
    - Strand to search

-----

**Outputs**

Given the chosen parameters, several outputs are possible:

    - Sequence file in fasta or fastq with aligned sequences (or conserved)
    - Sequence file in fasta or fastq with rejected sequences
    - File with sam alignments
    - File with blast outputs


    ]]></help>

    <!-- Publication to cite when this tool is used, referenced by DOI. -->
    <citations>
        <citation type="doi">10.1093/bioinformatics/bts611</citation>
    </citations>
</tool>
author	bebatut
date	Fri, 06 Nov 2015 04:03:38 -0500
parents	152bd01a4e76
children	7f2d9c23be0f