view gene_family_classifier.xml @ 50:a3b7f34ad3f7 draft

Uploaded
author greg
date Wed, 08 Feb 2017 11:17:10 -0500
parents 539a92a13168
children 3b2d35b75268
line wrap: on
line source

<tool id="plant_tribes_gene_family_classifier" name="Classify gene sequences" version="0.3">
    <description>into precomputed orthologous gene family clusters</description>
    <!--
    Package dependency of this tool.  It presumably supplies the GeneFamilyClassifier
    executable that the command section invokes (confirm against the package).
    -->
    <requirements>
        <requirement type="package" version="0.3">plant_tribes_gene_family_classifier</requirement>
    </requirements>
    <stdio>
        <!-- Anything other than zero (positive or negative) is an error -->
        <exit_code range="1:" />
        <exit_code range=":-1" />
        <!--
        In case the return code has not been set properly, check the job output for
        these markers too.  No source attribute is given on the regex tags, so Galaxy's
        default stream selection applies.
        -->
        <regex match="Error:" />
        <regex match="Exception:" />
    </stdio>
    <command>
        <![CDATA[
            #import os
            ## The data table stores the full scaffold path; the classifier wants the
            ## parent directory and the scaffold name as two separate arguments.
            #set scaffold_path = $scaffold.fields.path
            #set scaffold_dir = $os.path.split($scaffold_path)[0]
            #set scaffold_selection = $os.path.split($scaffold_path)[1]

            ## Reduce the advanced options to three flags that are always defined and
            ## create the extra-files directories of the requested outputs up front.
            #if str($options_type.options_type_selector) == 'advanced':
                #set create_orthogroup_cond = $options_type.create_orthogroup_cond
                #set specify_single_copy_cond = $options_type.specify_single_copy_cond

                #if str($specify_single_copy_cond.specify_single_copy) == 'yes':
                    #set single_copy_orthogroup = True
                    #set single_copy_fasta_src_dir = $os.path.join('geneFamilyClassification_dir', 'single_copy_fasta')
                    #set single_copy_fasta_dest_dir = $output_ptsco.extra_files_path
                    ## FixMe: the single_copy_custom option is not currently supported.
                    #set single_copy_cond = $specify_single_copy_cond.single_copy_cond
                    mkdir -p '$single_copy_fasta_dest_dir' &&
                #else:
                    #set single_copy_orthogroup = False
                #end if

                #if str($create_orthogroup_cond.create_orthogroup) == 'yes':
                    #set create_ortho_sequences = True
                    #set orthogroups_fasta_src_dir = $os.path.join('geneFamilyClassification_dir', 'orthogroups_fasta')
                    #set create_corresponding_coding_sequences_cond = $create_orthogroup_cond.create_corresponding_coding_sequences_cond

                    ## The destination depends on which of the two orthogroup outputs is produced.
                    #if str($create_corresponding_coding_sequences_cond.create_corresponding_coding_sequences) == 'yes':
                        #set create_corresponding_coding_sequences = True
                        #set orthogroups_fasta_dest_dir = $output_ptcgfcs.extra_files_path
                    #else:
                        #set create_corresponding_coding_sequences = False
                        #set orthogroups_fasta_dest_dir = $output_ptcgf.extra_files_path
                    #end if
                    mkdir -p '$orthogroups_fasta_dest_dir' &&
                #else:
                    #set create_ortho_sequences = False
                    #set create_corresponding_coding_sequences = False
                #end if
            #else:
                #set single_copy_orthogroup = False
                #set create_ortho_sequences = False
                #set create_corresponding_coding_sequences = False
            #end if

            GeneFamilyClassifier
            --proteins '$input'
            --scaffold_dir '$scaffold_dir'
            --scaffold '$scaffold_selection'
            --method $method
            --classifier $save_hmmscan_log_cond.classifier
            --num_threads \${GALAXY_SLOTS:-4}

            #if str($options_type.options_type_selector) == 'advanced':
                --super_orthogroups $options_type.super_orthogroups
                #if $single_copy_orthogroup:
                    --single_copy_taxa $single_copy_cond.single_copy_taxa
                    --taxa_present $single_copy_cond.taxa_present
                #end if
                #if $create_ortho_sequences:
                    --orthogroup_fasta
                    #if $create_corresponding_coding_sequences:
                        --coding_sequences '$create_corresponding_coding_sequences_cond.coding_sequences'
                    #end if
                #end if
            #end if
            >/dev/null

            ## The hmmscan log is either kept as its own dataset or removed, so it
            ## does not stay behind in the classification directory.
            #if str($save_hmmscan_log_cond.classifier) in ['hmmscan', 'both']:
                #if str($save_hmmscan_log_cond.save_hmmscan_log) == 'yes':
                    && mv geneFamilyClassification_dir/hmmscan.log '$hmmscan_log'
                #else:
                    ## -f: a missing log is not a reason to fail the job.
                    && rm -f geneFamilyClassification_dir/hmmscan.log
                #end if
            #end if

            ## Write a size/name listing of the produced fasta files into the primary
            ## dataset, then move the files into the extra-files directory of the dataset.
            ## The emptiness test must run in the shell: a Cheetah test would be evaluated
            ## while the command line is built, before the classifier has created anything.
            #if $create_ortho_sequences:
                #if $create_corresponding_coding_sequences:
                    && ls -l '$orthogroups_fasta_src_dir' | grep f | awk -F ' ' '{print $5"\t"$9}' > '$output_ptcgfcs'
                #else:
                    && ls -l '$orthogroups_fasta_src_dir' | grep f | awk -F ' ' '{print $5"\t"$9}' > '$output_ptcgf'
                #end if
                && if [ -d '$orthogroups_fasta_src_dir' ] && [ -n "\$(ls '$orthogroups_fasta_src_dir')" ]; then mv '$orthogroups_fasta_src_dir'/* '$orthogroups_fasta_dest_dir'; fi
            #end if

            #if $single_copy_orthogroup:
                && ls -l '$single_copy_fasta_src_dir' | grep f | awk -F ' ' '{print $5"\t"$9}' > '$output_ptsco'
                && if [ -d '$single_copy_fasta_src_dir' ] && [ -n "\$(ls '$single_copy_fasta_src_dir')" ]; then mv '$single_copy_fasta_src_dir'/* '$single_copy_fasta_dest_dir'; fi
            #end if
        ]]>
    </command>
    <inputs>
        <param name="input" format="fasta" type="data" label="Amino acids (proteins) sequences fasta file"/>
        <!-- Scaffolds come from a data table; the command section reads the path column of the selected row -->
        <param name="scaffold" type="select" label="Orthogroups or gene families proteins scaffold">
            <options from_data_table="plant_tribes_scaffolds" />
            <validator type="no_options" message="No PlantTribes scaffolds are available.  Use the PlantTribes Scaffolds Download Data Manager tool in Galaxy to install and populate the PlantTribes scaffolds data table."/>
        </param>
        <param name="method" type="select" label="Protein clustering method">
            <option value="gfam" selected="true">GFam</option>
            <option value="orthofinder">OrthoFinder</option>
            <option value="orthomcl">OrthoMCL</option>
        </param>
        <!-- The option to save the hmmscan log is only offered when hmmscan is part of the classification -->
        <conditional name="save_hmmscan_log_cond">
            <param name="classifier" type="select" label="Protein classification method">
                <option value="blastp" selected="true">blastp</option>
                <option value="hmmscan">HMMScan</option>
                <option value="both">Both blastp and HMMScan</option>
            </param>
            <when value="blastp" />
            <when value="hmmscan">
                <param name="save_hmmscan_log" type="select" label="Save hmmscan log?" help="Save the hmmscan log in an additional output dataset">
                    <option value="no" selected="true">No</option>
                    <option value="yes">Yes</option>
                </param>
            </when>
            <when value="both">
                <param name="save_hmmscan_log" type="select" label="Save hmmscan log?" help="Save the hmmscan log in an additional output dataset">
                    <option value="no" selected="true">No</option>
                    <option value="yes">Yes</option>
                </param>
            </when>
        </conditional>
        <conditional name="options_type">
            <param name="options_type_selector" type="select" label="Options Configuration">
                <option value="basic" selected="true">Basic</option>
                <option value="advanced">Advanced</option>
            </param>
            <when value="basic" />
            <when value="advanced">
                <param name="super_orthogroups" type="select" label="Super Orthogroups" help="Secondary MCL clusters of orthogroups">
                    <option value="min_evalue" selected="true">Minimum e-value</option>
                    <option value="avg_evalue">Average e-value</option>
                </param>
                <conditional name="specify_single_copy_cond">
                    <param name="specify_single_copy" type="select" label="Specify single copy orthogroup selection?">
                        <option value="no" selected="true">No</option>
                        <option value="yes">Yes</option>
                    </param>
                    <when value="no"/>
                    <when value="yes">
                        <conditional name="single_copy_cond">
                            <param name="single_copy" type="select" label="Select single copy configuration">
                                <!--
                                FixMe: single_copy_custom is not currently supported because the
                                ~/config/*.singleCopy.config files are not easy to manage by a user
                                and the GeneFamilyClassifier pipeline must be enhanced to provide an
                                optional parameter for the hard-coded path to the config.
                                <option value="custom" selected="true">Single copy orthogroup custom</option>
                                -->
                                <option value="taxa">Minimum single copy taxa required in orthogroup</option>
                            </param>
                            <!-- Kept as a placeholder for the disabled "custom" option above -->
                            <when value="custom" />
                            <when value="taxa">
                                <param name="single_copy_taxa" type="integer" value="20" label="Minimum single copy taxa required in orthogroup"/>
                                <param name="taxa_present" type="integer" value="21" label="Minimum taxa required in single copy orthogroup"/>
                            </when>
                        </conditional>
                    </when>
                </conditional>
                <conditional name="create_orthogroup_cond">
                    <param name="create_orthogroup" type="select" label="Create orthogroup fasta files?">
                        <option value="no" selected="true">No</option>
                        <option value="yes">Yes</option>
                    </param>
                    <when value="no" />
                    <when value="yes">
                        <conditional name="create_corresponding_coding_sequences_cond">
                            <param name="create_corresponding_coding_sequences" type="select" label="Create corresponding coding sequences?">
                                <option value="no" selected="true">No</option>
                                <option value="yes">Yes</option>
                            </param>
                            <when value="no" />
                            <when value="yes">
                                <param name="coding_sequences" format="fasta" type="data" label="Corresponding coding sequences (CDS) fasta file"/>
                            </when>
                        </conditional>
                    </when>
                </conditional>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <!-- Each optional dataset is produced only when its filter matches the selected options -->
        <data name="hmmscan_log" format="txt" label="Protein classification hmmscan.log on ${on_string}">
            <filter>save_hmmscan_log_cond['classifier'] in ['hmmscan', 'both'] and save_hmmscan_log_cond['save_hmmscan_log'] == 'yes'</filter>
        </data>
        <!-- output_ptcgf and output_ptcgfcs are mutually exclusive: the second replaces the first when coding sequences are requested -->
        <data name="output_ptcgf" format="ptcgf" label="Gene family clusters on ${on_string}">
            <filter>options_type['options_type_selector'] == 'advanced' and options_type['create_orthogroup_cond']['create_orthogroup'] == 'yes' and options_type['create_orthogroup_cond']['create_corresponding_coding_sequences_cond']['create_corresponding_coding_sequences'] == 'no'</filter>
        </data>
        <data name="output_ptcgfcs" format="ptcgfcs" label="Gene family clusters and corresponding coding sequences on ${on_string}">
            <filter>options_type['options_type_selector'] == 'advanced' and options_type['create_orthogroup_cond']['create_orthogroup'] == 'yes' and options_type['create_orthogroup_cond']['create_corresponding_coding_sequences_cond']['create_corresponding_coding_sequences'] == 'yes'</filter>
        </data>
        <data name="output_ptsco" format="txt" label="Single copy orthogroup on ${on_string}">
            <filter>options_type['options_type_selector'] == 'advanced' and options_type['specify_single_copy_cond']['specify_single_copy'] == 'yes'</filter>
        </data>
        <!-- Always produced: the contents of geneFamilyClassification_dir, discovered by name -->
        <collection name="orthos" type="list">
            <discover_datasets pattern="__name__" directory="geneFamilyClassification_dir" visible="false" ext="tabular" />
        </collection>
    </outputs>
    <tests>
        <!-- Not sure how to test this since the tool requires scaffolds data which is extremely large and installed using a Data Manager -->
        <!--
        <test>
            <param name="input" value="transcripts.cleaned.nr.pep" ftype="fasta" />
            <param name="scaffold" value="22Gv1.1"/>
            <param name="method" value="orthomcl"/>
            <param name="classifier" value="blastp"/>
            <output_collection name="orthos" type="list">
                <element name="proteins.blastp.22Gv1.1" file="proteins.blastp.22Gv1.1" ftype="tabular"/>
                <element name="proteins.blastp.22Gv1.1.bestOrthos" file="proteins.blastp.22Gv1.1.bestOrthos" ftype="tabular"/>
                <element name="proteins.blastp.22Gv1.1.bestOrthos.summary" file="proteins.blastp.22Gv1.1.bestOrthos.summary" ftype="tabular"/>
            </output_collection>
        </test>
        -->
    </tests>
    <help>
This tool is one of the PlantTribes' collection of automated modular analysis pipelines that utilize objective classifications of
complete protein sequences from sequenced plant genomes to perform comparative evolutionary studies.  This tool classifies gene
sequences into precomputed orthologous gene family clusters using either blastp (faster), HMMScan (slower but more sensitive
to remote homologs) or both (more exhaustive).

This tool accepts any of the following as input:

* the postprocessed assemblies produced by the **Postprocess de novo assembly transcripts into putative coding sequences** tool
* externally predicted coding sequences and their corresponding amino acid translations derived from a transcriptome assembly
* gene predictions from a sequenced genome

-----

**Options**

 * **Amino acids (proteins) sequences fasta file** - The protein sequences to classify, in fasta format.
 * **Orthogroups or gene families proteins scaffold** - PlantTribes scaffolds data.
 * **Protein clustering method** - One of GFam (domain architecture based clustering), OrthoFinder (broadly defined clusters) or OrthoMCL (narrowly defined clusters).
 * **Protein classification method** - blastp (faster), HMMScan (slower but more sensitive to remote homologs) or both (more exhaustive).
 * **Save hmmscan log?** - Select 'Yes' to save the hmmscan log in an additional output dataset.  Offered only when the classification method is HMMScan or both.
 * **Options Configuration** - Select 'Advanced' to display the options listed below.
 * **Super Orthogroups** - Secondary MCL clusters of orthogroups.
 * **Specify single copy orthogroup selection?** - Select 'Yes' to specify one of 'Single copy orthogroup custom' (not yet supported) or 'Minimum single copy taxa required in orthogroup'.
 * **Minimum single copy taxa required in orthogroup** - Used with "Minimum single copy taxa required in orthogroup" configuration only.
 * **Minimum taxa required in single copy orthogroup** - Used with "Minimum single copy taxa required in orthogroup" configuration only.
 * **Create orthogroup fasta files?** - Select 'Yes' to produce the orthogroup fasta files in an additional output dataset.
 * **Create corresponding coding sequences?** - Offered only when "Create orthogroup fasta files?" is 'Yes'.  Select 'Yes' to supply a coding sequences fasta file.
 * **Corresponding coding sequences (CDS) fasta file** - Used only when both "Create orthogroup fasta files?" and "Create corresponding coding sequences?" are set to 'Yes'.

    </help>
    <citations>
        <!-- Each BibTeX entry has its own citation key so the entries do not collide when exported together -->
        <!-- PlantTribes pipeline; title and year are still placeholders -->
        <citation type="bibtex">
            @unpublished{PlantTribes,
                author = {Eric Wafula},
                title = {None},
                year = {None},
                url = {https://github.com/dePamphilis/PlantTribes}
            }</citation>
        <!-- NOTE(review): presumably the BLAST+ paper backing the blastp classifier; confirm the DOI -->
        <citation type="doi">10.1186/1471-2105-10-421</citation>
        <!-- HMMER, used for the hmmscan classifier -->
        <citation type="bibtex">
            @unpublished{HMMER,
                author = {None},
                title = {HMMER 3.1+ hmmscan search sequence(s) against a profile database},
                year = {2013},
                url = {http://hmmer.org/}
            }</citation>
    </citations>
</tool>