changeset 0:6accbd3a1449 draft

Uploaded
author greg
date Fri, 07 Apr 2017 13:00:29 -0400
parents
children 4f1c3b8c9ab1
files .shed.yml gene_family_aligner.py gene_family_aligner.xml macros.xml plant_tribes_scaffolds.loc.sample run_pasta.py tool_data_table_conf.xml.sample utils.py
diffstat 8 files changed, 473 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.shed.yml	Fri Apr 07 13:00:29 2017 -0400
@@ -0,0 +1,13 @@
+name: plant_tribes_gene_family_aligner
+owner: greg
+description: |
+  Contains a tool that aligns gene family sequences.
+homepage_url: https://github.com/dePamphilis/PlantTribes
+long_description: |
+  Contains a tool that is one of the PlantTribes collection of automated modular analysis pipelines that
+  utilize objective classifications of complete protein sequences from sequenced plant genomes to perform
+  comparative evolutionary studies.  This tool aligns gene family sequences.
+remote_repository_url: https://github.com/gregvonkuster/galaxy_tools/tree/master/tools/plant_tribes/gene_family_aligner
+type: unrestricted
+categories:
+- Phylogenetics
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_family_aligner.py	Fri Apr 07 13:00:29 2017 -0400
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+import argparse
+import subprocess
+
+import utils
+
+OUTPUT_DIR = 'geneFamilyAlignments_dir'
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--alignment_method', dest='alignment_method', help='Multiple sequence alignments method')
+parser.add_argument('--automated_trimming', dest='automated_trimming', default=None, help="Trims alignments using trimAl's ML heuristic trimming approach")
+parser.add_argument('--codon_alignments', dest='codon_alignments', default=None, help="Flag for constructing orthogroup multiple codon alignments")
+parser.add_argument('--gap_trimming', dest='gap_trimming', default=None, type=float, help='Remove sites in alignments with gaps of')
+parser.add_argument('--iterative_realignment', dest='iterative_realignment', type=int, default=None, help='"Maximum number of iterations')
+parser.add_argument('--method', dest='method', help='Protein clustering method')
+parser.add_argument('--num_threads', dest='num_threads', type=int, help='Number of threads to use for execution')
+parser.add_argument('--orthogroup_faa', dest='orthogroup_faa', help="Directory of input fasta datasets")
+parser.add_argument('--output', dest='output', help="Output dataset")
+parser.add_argument('--output_dir', dest='output_dir', help="Output dataset file_path directory")
+parser.add_argument('--pasta_iter_limit', dest='pasta_iter_limit', type=int, default=None, help='"Maximum number of iteration that the PASTA algorithm will execute')
+parser.add_argument('--pasta_script_path', dest='pasta_script_path', default=None, help='Path to script for executing pasta')
+parser.add_argument('--remove_sequences', dest='remove_sequences', default=None, type=float, help='Remove sequences with gaps of')
+parser.add_argument('--scaffold', dest='scaffold', help='Orthogroups or gene families proteins scaffold')
+
+args = parser.parse_args()
+
+# Build the command line.
+cmd = 'GeneFamilyAligner'
+cmd += ' --orthogroup_faa %s' % args.orthogroup_faa
+cmd += ' --scaffold %s' % args.scaffold
+cmd += ' --method %s' % args.method
+cmd += ' --alignment_method %s' % args.alignment_method
+if args.alignment_method == 'pasta':
+    if args.pasta_script_path is not None:
+        cmd += ' --pasta_script_path %s' % args.pasta_script_path
+    if args.pasta_iter_limit is not None:
+        cmd += ' --pasta_iter_limit %d' % args.pasta_iter_limit
+cmd += ' --num_threads %d' % args.num_threads
+if args.orthogroup_fna is not None:
+    cmd += ' --orthogroup_fna'
+if args.automated_trimming is not None:
+    cmd += ' --automated_trimming'
+if args.gap_trimming is not None:
+    cmd += ' --gap_trimming %4f' % args.gap_trimming
+if args.remove_sequences is not None:
+    cmd += ' --remove_sequences %4f' % args.remove_sequences
+if args.iterative_realignment is not None:
+    cmd += ' --iterative_realignment %d' % args.iterative_realignment
+# Run the command.
+proc = subprocess.Popen(args=cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
+rc = proc.wait()
+utils.check_execution_errors(rc, proc.stderr)
+utils.move_directory_files(OUTPUT_DIR, args.output_dir)
+utils.write_html_output(args.output, 'Aligned gene family sequences', args.output_dir)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_family_aligner.xml	Fri Apr 07 13:00:29 2017 -0400
@@ -0,0 +1,145 @@
+<tool id="plant_tribes_gene_family_aligner" name="GeneFamilyAligner" version="0.8.0">
+    <description>aligns gene family sequences</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements_gene_family_aligner" />
+    <expand macro="stdio" />
+    <command>
+        <![CDATA[
+            #set input_format = $input_format_cond.input_format
+            #set scaffold = $input_format_cond.scaffold
+            #set alignment_method_cond = $input_format_cond.alignment_method_cond
+            #set alignment_method = $alignment_method_cond.alignment_method
+
+            python $__tool_directory__/gene_family_aligner.py
+            --scaffold '$scaffold.fields.path'
+            --method $input_format_cond.method
+            --alignment_method $alignment_method
+            #if str($alignment_method) == 'pasta':
+                --pasta_script_path '$__tool_directory__/run_pasta.py'
+                --pasta_iter_limit $alignment_method_cond.pasta_iter_limit
+            #end if
+            --num_threads \${GALAXY_SLOTS:-4}
+            #if str($input_format) == 'ptortho':
+                --orthogroup_faa '$input_format_cond.input_ptortho.extra_files_path'
+                --output '$output_ptortho'
+                --output_dir '$output_ptortho.files_path'
+            #else:
+                ## str($input_format) == 'ptorthocs'
+                --orthogroup_faa '$input_format_cond.input_ptorthocs.extra_files_path'
+                #if str($input_format_cond.codon_alignments) == 'yes':
+                    --codon_alignments 'true'
+                    --output '$output_ptorthocs'
+                    --output_dir '$output_ptorthocs.files_path'
+                #else:
+                    --output '$output_ptortho'
+                    --output_dir '$output_ptortho.files_path'
+                #end if
+            #end if
+            #if str($options_type_cond.options_type) == 'advanced':
+                #set remove_gappy_sequences_cond = $options_type_cond.remove_gappy_sequences_cond
+                #set remove_gappy_sequences = $remove_gappy_sequences_cond.remove_gappy_sequences
+                #if str($remove_gappy_sequences) == 'yes':
+                    #set trim_type_cond = $remove_gappy_sequences_cond.trim_type_cond
+                    #set trim_type = $trim_type_cond.trim_type
+                    #if str($trim_type) == 'gap_trimming':
+                        --gap_trimming $trim_type_cond.gap_trimming
+                    #else:
+                        ## str($trim_type) == 'automated_trimming'
+                        --automated_trimming 'true'
+                    #end if
+                    #set remove_sequences_with_gaps_cond = $remove_gappy_sequences_cond.remove_sequences_with_gaps_cond
+                    #set remove_sequences_with_gaps = $remove_sequences_with_gaps_cond.remove_sequences_with_gaps
+                    #if str($remove_sequences_with_gaps) == 'yes':
+                        --remove_sequences $remove_sequences_with_gaps_cond.remove_sequences_with_gaps_of
+                        --iterative_realignment $remove_sequences_with_gaps_cond.iterative_realignment
+                    #end if
+                #end if
+            #end if
+        ]]>
+    </command>
+    <inputs>
+        <conditional name="input_format_cond">
+            <param name="input_format" type="select" label="Select type of data to sub sample">
+                <option value="ptortho">Gene family clusters</option>
+                <option value="ptorthocs">Gene family clusters with corresponding coding sequences</option>
+            </param>
+            <when value="ptortho">
+                <param name="input_ptortho" format="ptortho" type="data" label="Gene family clusters" />
+                <expand macro="param_scaffold" />
+                <expand macro="param_method" />
+                <expand macro="cond_alignment_method" />
+            </when>
+            <when value="ptorthocs">
+                <param name="input_ptorthocs" format="ptorthocs" type="data" label="Gene family clusters with corresponding coding sequences" />
+                <expand macro="param_scaffold" />
+                <expand macro="param_method" />
+                <expand macro="cond_alignment_method" />
+                <expand macro="param_codon_alignments" />
+            </when>
+        </conditional>
+        <conditional name="options_type_cond">
+            <expand macro="param_options_type" />
+            <when value="basic" />
+            <when value="advanced">
+                <expand macro="cond_remove_gappy_sequences" />
+            </when>
+        </conditional>
+    </inputs>
+    <outputs>
+        <!-- Exactly one of the two datasets is produced per run.  The filters test codon_alignments,
+             the select parameter that exists inside input_format_cond for the ptorthocs input type,
+             and mirror the output selection made in the command block. -->
+        <data name="output_ptortho" format="ptortho" label="Integrated gene family clusters on ${on_string}">
+            <filter>input_format_cond['input_format'] == 'ptortho' or (input_format_cond['input_format'] == 'ptorthocs' and input_format_cond['codon_alignments'] == 'no')</filter>
+        </data>
+        <data name="output_ptorthocs" format="ptorthocs" label="Integrated gene family clusters and corresponding coding sequences on ${on_string}">
+            <filter>input_format_cond['input_format'] == 'ptorthocs' and input_format_cond['codon_alignments'] == 'yes'</filter>
+        </data>
+    </outputs>
+    <tests>
+        <!-- Test framework does not currently support inputs whose associated files_path contains files to be analyzed.
+        <test>
+        </test>
+        -->
+    </tests>
+    <help>
+This tool is one of the PlantTribes collection of automated modular analysis pipelines for comparative and evolutionary
+analyses of genome-scale gene families and transcriptomes. This tool aligns gene family sequences.
+
+-----
+
+**Required options**
+
+ * **Select type of data to sub sample**
+
+  - **Gene family clusters** - sequences classified into gene family clusters.
+  - **Gene family clusters with corresponding coding sequences** - sequences classified into gene family clusters including corresponding coding sequences.
+
+    - **Construct orthogroup multiple codon alignments** - construct orthogroup multiple codon alignments.
+
+ * **Gene family scaffold** - one of the PlantTribes gene family scaffolds [2-4] installed into Galaxy by the PlantTribes Scaffold Data Manager tool.
+ * **Protein clustering method** - gene family scaffold protein clustering method as described in the AssemblyPostProcessor tool.
+
+ * **Select method for multiple sequence alignments**
+
+  - **MAFFT algorithm** - mafft algorithm.
+  - **Pasta algorithm** - pasta algorithm.
+
+    - **Maximum number of iterations that the PASTA algorithm will execute** - maximum number of iterations that the PASTA algorithm will execute.
+
+**Other options**
+
+ * **Remove gappy sequences in alignments**
+
+  - **Select process used for gap trimming** - either nucleotide based or using trimAl's ML heuristic trimming approach
+
+    - **Nucleotide based**
+
+      - **Remove sites in alignments with gaps of**
+      - **Maximum number of iterations** - maximum number of iterations for iterative orthogroups realignment, trimming and filtering
+
+    </help>
+    <citations>
+        <expand macro="citation1" />
+        <expand macro="citations2to4" />
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Fri Apr 07 13:00:29 2017 -0400
@@ -0,0 +1,146 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<macros>
+    <xml name="requirements_assembly_post_processor">
+        <requirements>
+            <requirement type="package" version="0.4">plant_tribes_assembly_post_processor</requirement>
+        </requirements>
+    </xml>
+    <xml name="requirements_gene_family_aligner">
+        <requirements>
+            <requirement type="package" version="0.8">plant_tribes_gene_family_aligner</requirement>
+        </requirements>
+    </xml>
+    <xml name="requirements_gene_family_classifier">
+        <requirements>
+            <requirement type="package" version="0.8">plant_tribes_gene_family_classifier</requirement>
+        </requirements>
+    </xml>
+    <xml name="requirements_gene_family_integrator">
+        <requirements>
+            <requirement type="package" version="0.8">plant_tribes_gene_family_integrator</requirement>
+        </requirements>
+    </xml>
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:"/>
+            <exit_code range=":-1"/>
+            <regex match="Error:"/>
+            <regex match="Exception:"/>
+        </stdio>
+    </xml>
+    <xml name="param_codon_alignments">
+        <param name="codon_alignments" type="select" display="radio" label="Construct orthogroup multiple codon alignments?">
+            <option value="yes" selected="true">Yes</option>
+            <option value="no">No</option>
+        </param>
+    </xml>
+    <xml name="param_method">
+        <param name="method" type="select" label="Protein clustering method">
+            <option value="gfam" selected="true">GFam</option>
+            <option value="orthofinder">OrthoFinder</option>
+            <option value="orthomcl">OrthoMCL</option>
+        </param>
+    </xml>
+    <xml name="param_options_type">
+        <param name="options_type" type="select" label="Options Configuration">
+            <option value="basic" selected="true">Basic</option>
+            <option value="advanced">Advanced</option>
+        </param>
+    </xml>
+    <xml name="param_orthogroup_fna">
+        <param name="orthogroup_fna" type="select" display="radio" label="Process corresponding gene family classification orthogroups CDS fasta files?">
+            <option value="yes" selected="true">Yes</option>
+            <option value="no">No</option>
+        </param>
+    </xml>
+    <xml name="param_scaffold">
+        <param name="scaffold" type="select" label="Orthogroups or gene families proteins scaffold">
+            <options from_data_table="plant_tribes_scaffolds" />
+            <validator type="no_options" message="No PlantTribes scaffolds are available.  Use the PlantTribes Scaffolds Download Data Manager tool in Galaxy to install and populate the PlantTribes scaffolds data table." />
+        </param>
+    </xml>
+    <xml name="cond_alignment_method">
+        <!-- Selects the multiple sequence alignment method.  The select parameter must be named
+             alignment_method because the tool's command block reads
+             $alignment_method_cond.alignment_method. -->
+        <conditional name="alignment_method_cond">
+            <param name="alignment_method" type="select" force_select="true" label="Select method for multiple sequence alignments">
+                <option value="mafft" selected="true">MAFFT algorithm</option>
+                <option value="pasta">PASTA algorithm</option>
+            </param>
+            <when value="mafft" />
+            <when value="pasta">
+                <param name="pasta_iter_limit" type="integer" value="3" min="1" label="Maximum number of iterations that the PASTA algorithm will execute" />
+            </when>
+        </conditional>
+    </xml>
+    <xml name="cond_remove_gappy_sequences">
+        <!-- Advanced trimming/filtering options: an outer yes/no switch, then the trimming
+             process (gap threshold or trimAl automated trimming) and an optional removal of
+             sequences by gap fraction with iterative realignment. -->
+        <conditional name="remove_gappy_sequences_cond">
+            <param name="remove_gappy_sequences" type="select" label="Remove gappy sequences in alignments?">
+                <option value="no" selected="true">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="no" />
+            <when value="yes">
+                <conditional name="trim_type_cond">
+                    <param name="trim_type" type="select" label="Select process used for gap trimming">
+                        <option value="gap_trimming" selected="true">Nucleotide based </option>
+                        <option value="automated_trimming">Trim alignments using trimAl's ML heuristic trimming approach</option>
+                    </param>
+                    <when value="gap_trimming">
+                        <param name="gap_trimming" type="float" value="0" min="0" max="1.0" label="Remove sites in alignments with gaps of" help="Zero value has no effect" />
+                    </when>
+                    <when value="automated_trimming" />
+                </conditional>
+                <conditional name="remove_sequences_with_gaps_cond">
+                    <param name="remove_sequences_with_gaps" type="select" label="Remove sequences with specified gaps?">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no" />
+                    <when value="yes">
+                        <param name="remove_sequences_with_gaps_of" type="float" value="0" min="0" max="1" label="Remove sequences with gaps of" help="Zero value has no effect" />
+                        <param name="iterative_realignment" type="integer" value="0" min="0" label="Maximum number of iterations" help="Zero value has no effect"/>
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+    </xml>
+    <xml name="citation1">
+        <citation type="bibtex">
+            @misc{None,
+            journal = {None},
+            author = {1. Wafula EK},
+            title = {Manuscript in preparation},
+            year = {None},
+            url = {https://github.com/dePamphilis/PlantTribes},}
+        </citation>
+    </xml>
+    <xml name="citations2to4">
+        <!-- BibTeX entries for the three protein clustering methods (GFam, OrthoMCL, OrthoFinder).
+             Every field inside an entry must be comma-separated for the entry to parse. -->
+        <citation type="bibtex">
+            @article{Sasidharan2012,
+            journal = {Nucleic Acids Research},
+            author = {2. Sasidharan R, Nepusz T, Swarbreck D, Huala E, Paccanaro A},
+            title = {GFam: a platform for automatic annotation of gene families},
+            year = {2012},
+            pages = {gks631},}
+        </citation>
+        <citation type="bibtex">
+            @article{Li2003,
+            journal = {Genome Research},
+            author = {3. Li L, Stoeckert CJ, Roos DS},
+            title = {OrthoMCL: identification of ortholog groups for eukaryotic genomes},
+            year = {2003},
+            volume = {13},
+            number = {9},
+            pages = {2178-2189},}
+        </citation>
+        <citation type="bibtex">
+            @article{Emms2015,
+            journal = {Genome Biology},
+            author = {4. Emms DM, Kelly S},
+            title = {OrthoFinder: solving fundamental biases in whole genome comparisons dramatically improves orthogroup inference accuracy},
+            year = {2015},
+            volume = {16},
+            number = {1},
+            pages = {157},}
+        </citation>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plant_tribes_scaffolds.loc.sample	Fri Apr 07 13:00:29 2017 -0400
@@ -0,0 +1,4 @@
+## Plant Tribes scaffolds
+#Value	Name	Path	Description
+#22Gv1.0	22Gv1.0	/plant_tribes/scaffolds/22Gv1.0	22 plant genomes (Angiosperms clusters, version 1.0; 22Gv1.0)
+#22Gv1.1	22Gv1.1	/plant_tribes/scaffolds/22Gv1.1	22 plant genomes (Angiosperms clusters, version 1.1; 22Gv1.1)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/run_pasta.py	Fri Apr 07 13:00:29 2017 -0400
@@ -0,0 +1,63 @@
+#! /usr/bin/env python
+
+"""Main script of PASTA in command-line mode - this simply invokes the main
+    function found in pasta/mainpasta.py
+"""
+
+# This file is part of PASTA which is forked from SATe
+
+# PASTA like SATe is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+# Jiaye Yu and Mark Holder, University of Kansas
+
+if __name__ == "__main__":
+    import os
+    import sys
+    from pasta.mainpasta import pasta_main
+    from pasta import MESSENGER
+    sys.setrecursionlimit(100000)
+    _PASTA_DEBUG = os.environ.get('PASTA_DEBUG')
+    _DEVELOPER = _PASTA_DEBUG and _PASTA_DEBUG != '0'
+
+    if not _DEVELOPER:
+        _PASTA_DEVELOPER = os.environ.get('PASTA_DEVELOPER')
+        _DEVELOPER = _PASTA_DEVELOPER and _PASTA_DEVELOPER != '0'
+    try:
+        rc, temp_dir, temp_fs = pasta_main()
+        if not rc:
+            raise ValueError("Unknown PASTA execution error")
+        if (temp_dir is not None) and (os.path.exists(temp_dir)):
+            MESSENGER.send_info("Note that temporary files from the run have not been deleted, they can be found in:\n   '%s'\n" % temp_dir)
+            if sys.platform.lower().startswith('darwin') and ("'" not in temp_dir):
+                MESSENGER.send_info('''
+If you cannot see this directory in the Finder application, you may want to use
+the 'open' command executed from a Terminal.  You can do this by launching the
+/Applications/Utilities/Terminal program and then typing
+
+open '%s'
+
+followed by a return at the prompt. If the argument to the open command is a
+directory, then it should open a Finder window in the directory (even if that
+directory is hidden by default).
+''' % temp_dir)
+    except Exception, x:
+        if _DEVELOPER:
+            raise
+        message = "PASTA is exiting because of an error:\n%s " % str(x)
+        try:
+            from pasta import MESSENGER
+            MESSENGER.send_error(message)
+        except:
+            sys.stderr.write(message)
+        sys.exit(1)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Fri Apr 07 13:00:29 2017 -0400
@@ -0,0 +1,6 @@
+<tables>
+    <table name="plant_tribes_scaffolds" comment_char="#">
+        <columns>value, name, path, description</columns>
+        <file path="tool-data/plant_tribes_scaffolds.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py	Fri Apr 07 13:00:29 2017 -0400
@@ -0,0 +1,42 @@
+import os
+import shutil
+import sys
+
+
+def check_execution_errors(rc, stderr):
+    if rc != 0:
+        stop_err(stderr.read())
+
+
+def move_directory_files(source_dir, destination_dir):
+    source_directory = os.path.abspath(source_dir)
+    destination_directory = os.path.abspath(destination_dir)
+    if not os.path.isdir(destination_directory):
+        os.makedirs(destination_directory)
+    for dir_entry in os.listdir(source_directory):
+        source_entry = os.path.join(source_directory, dir_entry)
+        shutil.move(source_entry, destination_directory)
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit(1)
+
+
+def write_html_output(output, title, dir):
+    with open(output, 'w') as fh:
+        fh.write('<html><head><h3>%s</h3></head>\n' % title)
+        fh.write('<body><p/><table cellpadding="2">\n')
+        fh.write('<tr><th>Size</th><th>Name</th></tr>\n')
+        for index, fname in enumerate(sorted(os.listdir(dir))):
+            if index % 2 == 0:
+                bgcolor = '#D8D8D8'
+            else:
+                bgcolor = '#FFFFFF'
+            try:
+                size = str(os.path.getsize(os.path.join(dir, fname)))
+            except:
+                size = 'unknown'
+            link = '<a href="%s" type="text/plain">%s</a>\n' % (fname, fname)
+            fh.write('<tr bgcolor="%s"><td>%s</td><td>%s</td></tr>\n' % (bgcolor, size, link))
+        fh.write('</table></body></html>\n')