changeset 0:f4361d941aa3 draft

Uploaded
author greg
date Tue, 11 Apr 2017 11:05:24 -0400
parents
children e5523659847a
files gene_family_phylogeny_builder.py gene_family_phylogeny_builder.xml macros.xml plant_tribes_scaffolds.loc.sample tool_data_table_conf.xml.sample utils.py
diffstat 6 files changed, 421 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_family_phylogeny_builder.py	Tue Apr 11 11:05:24 2017 -0400
@@ -0,0 +1,49 @@
+#!/usr/bin/env python
+import argparse
+import subprocess
+
+import utils
+
+OUTPUT_DIR = 'phylogenomicsAnalysis_dir'
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument('--alignments_type', dest='alignments_type', help='Input alignments type produced by the GeneFamilyAligner')
+parser.add_argument('--bootstrap_replicates', dest='bootstrap_replicates', type=int, default=None, help='Number of replicates for rapid bootstrap analysis')
+parser.add_argument('--config_dir', dest='config_dir', help='Directory containing default configuration files')
+parser.add_argument('--max_orthogroup_size', dest='max_orthogroup_size', type=int, help='Maximum number of sequences in orthogroup alignments')
+parser.add_argument('--method', dest='method', help='Protein clustering method')
+parser.add_argument('--min_orthogroup_size', dest='min_orthogroup_size', type=int, help='Minimum number of sequences in orthogroup alignments')
+parser.add_argument('--num_threads', dest='num_threads', type=int, help='Number of threads to use for execution')
+parser.add_argument('--orthogroup_aln', dest='orthogroup_aln', help="Input dataset files_path")
+parser.add_argument('--output', dest='output', help='Output for phylogenetic trees')
+parser.add_argument('--output_dir', dest='output_dir', help='output.files_path')
+parser.add_argument('--rooting_order', dest='rooting_order', default=None, help='Rooting order configuration for rooting trees')
+parser.add_argument('--scaffold', dest='scaffold', help='Orthogroups or gene families proteins scaffold')
+parser.add_argument('--sequence_type', dest='sequence_type', help="Sequence type used in the phylogenetic inference")
+parser.add_argument('--tree_inference', dest='tree_inference', help='Phylogenetic trees inference method')
+
+args = parser.parse_args()
+
+# Build the command line.
+cmd = 'GeneFamilyPhylogenyBuilder'
+cmd += ' --alignment_type %s' % args.tree_inference
+if args.bootstrap_replicates is not None:
+    cmd += ' --bootstrap_replicates %d' % args.bootstrap_replicates
+cmd += ' --config_dir %s' % args.config_dir
+cmd += ' --max_orthogroup_size %d' % args.max_orthogroup_size
+cmd += ' --method %s' % args.method
+cmd += ' --min_orthogroup_size %d' % args.min_orthogroup_size
+cmd += ' --num_threads %d' % args.num_threads
+cmd += ' --orthogroup_aln %s' % args.orthogroup_aln
+if args.rooting_order is not None:
+    cmd += ' --rooting_order %s' % args.rooting_order
+cmd += ' --scaffold %s' % args.scaffold
+cmd += ' --sequence_type %s' % args.sequence_type
+cmd += ' --tree_inference %s' % args.tree_inference
+# Run the command.
+proc = subprocess.Popen(args=cmd, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
+rc = proc.wait()
+utils.check_execution_errors(rc, proc.stderr)
+utils.move_directory_files(OUTPUT_DIR, args.output_dir)
+utils.write_html_output(args.output, 'Phylogenetic trees', args.output_dir)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gene_family_phylogeny_builder.xml	Tue Apr 11 11:05:24 2017 -0400
@@ -0,0 +1,157 @@
+<tool id="plant_tribes_gene_family_phylogeny_builder" name="GeneFamilyPhylogenyBuilder" version="@WRAPPER_VERSION@.0">
+    <description>builds gene family phylogenetic trees</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements_gene_family_phylogeny_builder" />
+    <expand macro="stdio" />
+    <command>
+        <![CDATA[
+            ## Resolve the selected input format and tree inference method up front.
+            #set input_format = $input_format_cond.input_format
+            ## tree_inference_cond is a top-level conditional in the inputs section,
+            ## so it is referenced directly.
+            #set tree_inference = $tree_inference_cond.tree_inference
+            #if str($tree_inference) == 'raxml':
+                #set rooting_order_file_cond = $tree_inference_cond.rooting_order_file_cond
+                #set rooting_order_file = $rooting_order_file_cond.rooting_order_file
+                #if str($rooting_order_file) == 'yes':
+                    #set rooting_order = $rooting_order_file_cond.rooting_order
+                #end if
+                #set bootstrap_replicates = $tree_inference_cond.bootstrap_replicates
+            #end if
+
+            python $__tool_directory__/gene_family_phylogeny_builder.py
+            --scaffold '$scaffold.fields.path'
+            --config_dir '$scaffold.fields.path'
+            --method $method
+            --tree_inference $tree_inference
+            #if str($tree_inference) == 'raxml':
+                #if str($rooting_order_file) == 'yes':
+                    --rooting_order '$rooting_order'
+                    ## No else block needed here because the default rooting_order
+                    ## configuration will be used if the --rooting_order flag is missing.
+                #end if
+                --bootstrap_replicates $bootstrap_replicates
+            #end if
+            --min_orthogroup_size $min_orthogroup_size
+            --max_orthogroup_size $max_orthogroup_size
+            --num_threads \${GALAXY_SLOTS:-4}
+            ## Each input format declares its own dataset parameter, so the
+            ## alignments directory is taken from the parameter matching the selection.
+            #if str($input_format) == 'ptalign':
+                --orthogroup_aln '$input_format_cond.input_ptalign.extra_files_path'
+                --alignments_type 'aln'
+                --sequence_type 'protein'
+            #else if str($input_format) == 'ptalignca':
+                --orthogroup_aln '$input_format_cond.input_ptalignca.extra_files_path'
+                --alignments_type 'aln'
+                --sequence_type $input_format_cond.sequence_type
+            #else if str($input_format) == 'ptalignfiltered':
+                --orthogroup_aln '$input_format_cond.input_ptalignfiltered.extra_files_path'
+                --alignments_type 'filter'
+                --sequence_type 'protein'
+            #else if str($input_format) == 'ptalignfilteredca':
+                --orthogroup_aln '$input_format_cond.input_ptalignfilteredca.extra_files_path'
+                --alignments_type 'filter'
+                --sequence_type $input_format_cond.sequence_type
+            #else if str($input_format) == 'ptaligntrimmed':
+                --orthogroup_aln '$input_format_cond.input_ptaligntrimmed.extra_files_path'
+                --alignments_type 'trim'
+                --sequence_type 'protein'
+            #else if str($input_format) == 'ptaligntrimmedca':
+                --orthogroup_aln '$input_format_cond.input_ptaligntrimmedca.extra_files_path'
+                --alignments_type 'trim'
+                --sequence_type $input_format_cond.sequence_type
+            #end if
+            --output '$output_pttree'
+            --output_dir '$output_pttree.files_path'
+        ]]>
+    </command>
+    <inputs>
+        <conditional name="input_format_cond">
+            <param name="input_format" type="select" label="Select type of data">
+                <option value="ptalign">Aligned gene family sequences</option>
+                <option value="ptalignca">Aligned gene family sequences with codon alignments</option>
+                <option value="ptalignfiltered">Filtered aligned gene family sequences</option>
+                <option value="ptalignfilteredca">Filtered aligned gene family sequences with codon alignments</option>
+                <option value="ptaligntrimmed">Trimmed aligned gene family sequences</option>
+                <option value="ptaligntrimmedca">Trimmed aligned gene family sequences with codon alignments</option>
+            </param>
+            <when value="ptalign">
+                <param name="input_ptalign" format="ptalign" type="data" label="Aligned gene family sequences">
+                    <validator type="empty_files_path" />
+                </param>
+            </when>
+            <when value="ptalignca">
+                <param name="input_ptalignca" format="ptalignca" type="data" label="Aligned gene family sequences with codon alignments">
+                    <validator type="empty_files_path" />
+                </param>
+                <expand macro="param_sequence_type" />
+            </when>
+            <when value="ptalignfiltered">
+                <param name="input_ptalignfiltered" format="ptalignfiltered" type="data" label="Filtered aligned gene family sequences">
+                    <validator type="empty_files_path" />
+                </param>
+            </when>
+            <when value="ptalignfilteredca">
+                <param name="input_ptalignfilteredca" format="ptalignfilteredca" type="data" label="Filtered aligned gene family sequences with codon alignments">
+                    <validator type="empty_files_path" />
+                </param>
+                <expand macro="param_sequence_type" />
+            </when>
+            <when value="ptaligntrimmed">
+                <param name="input_ptaligntrimmed" format="ptaligntrimmed" type="data" label="Trimmed aligned gene family sequences">
+                    <validator type="empty_files_path" />
+                </param>
+            </when>
+            <when value="ptaligntrimmedca">
+                <param name="input_ptaligntrimmedca" format="ptaligntrimmedca" type="data" label="Trimmed aligned gene family sequences with codon alignments">
+                    <validator type="empty_files_path" />
+                </param>
+                <expand macro="param_sequence_type" />
+            </when>
+        </conditional>
+        <expand macro="param_scaffold" />
+        <expand macro="param_method" />
+        <conditional name="tree_inference_cond">
+            <param name="tree_inference" type="select" label="Phylogenetic trees inference method">
+                <option value="raxml" selected="true">RAxML</option>
+                <option value="fasttree">FastTree</option>
+            </param>
+            <when value="raxml">
+                <conditional name="rooting_order_file_cond">
+                    <param name="rooting_order_file" type="select" label="Select rooting order configuration for rooting trees?" help="Select No for the default rooting order configuration which uses the most distant taxon present in the orthogroup">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no" />
+                    <when value="yes">
+                        <param name="rooting_order" format="txt" type="data" label="Rooting order configuration for rooting trees" />
+                    </when>
+                </conditional>
+                <param name="bootstrap_replicates" type="integer" value="100" min="0" label="Number of replicates for rapid bootstrap analysis and search for the best-scoring ML tree" />
+            </when>
+            <when value="fasttree" />
+        </conditional>
+        <param name="max_orthogroup_size" type="integer" value="100" min="0" label="Maximum number of sequences in orthogroup alignments" />
+        <param name="min_orthogroup_size" type="integer" value="4" min="0" label="Minimum number of sequences in orthogroup alignments" />
+    </inputs>
+    <outputs>
+        <data name="output_pttree" format="pttree" label="Gene family phylogenetic trees on ${on_string}" />
+    </outputs>
+    <tests>
+        <!-- Test framework does not currently support inputs whose associated files_path contains files to be analyzed.
+        <test>
+        </test>
+        -->
+    </tests>
+    <help>
+This tool is one of the PlantTribes collection of automated modular analysis pipelines that utilize objective classifications of
+complete protein sequences from sequenced plant genomes to perform comparative evolutionary studies. It performs phylogenomic
+analyses by creating multiple sequence alignments and inferred maximum likelihood phylogenies for orthogroups produced by the
+**GeneFamilyAligner** tool.
+
+-----
+
+**Required options**
+
+**Other options**
+
+    </help>
+    <citations>
+        <expand macro="citation1" />
+        <expand macro="citations2to4" />
+    </citations>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Tue Apr 11 11:05:24 2017 -0400
@@ -0,0 +1,163 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<macros>
+    <token name="@WRAPPER_VERSION@">0.8</token>
+    <xml name="requirements_assembly_post_processor">
+        <requirements>
+            <requirement type="package" version="0.4">plant_tribes_assembly_post_processor</requirement>
+        </requirements>
+    </xml>
+    <xml name="requirements_gene_family_aligner">
+        <requirements>
+            <requirement type="package" version="0.8">plant_tribes_gene_family_aligner</requirement>
+        </requirements>
+    </xml>
+    <xml name="requirements_gene_family_classifier">
+        <requirements>
+            <requirement type="package" version="0.8">plant_tribes_gene_family_classifier</requirement>
+        </requirements>
+    </xml>
+    <xml name="requirements_gene_family_integrator">
+        <requirements>
+            <requirement type="package" version="0.8">plant_tribes_gene_family_integrator</requirement>
+        </requirements>
+    </xml>
+    <xml name="requirements_kaks_analysis">
+        <requirements>
+            <requirement type="package" version="0.8">plant_tribes_kaks_analysis</requirement>
+        </requirements>
+    </xml>
+    <xml name="requirements_gene_family_phylogeny_builder">
+        <requirements>
+            <requirement type="package" version="0.8">plant_tribes_gene_family_phylogeny_builder</requirement>
+        </requirements>
+    </xml>
+    <xml name="stdio">
+        <stdio>
+            <exit_code range="1:"/>
+            <exit_code range=":-1"/>
+            <regex match="Error:"/>
+            <regex match="Exception:"/>
+        </stdio>
+    </xml>
+    <xml name="param_codon_alignments">
+        <param name="codon_alignments" type="select" label="Construct orthogroup multiple codon alignments?">
+            <option value="yes" selected="true">Yes</option>
+            <option value="no">No</option>
+        </param>
+    </xml>
+    <xml name="param_method">
+        <param name="method" type="select" label="Protein clustering method">
+            <option value="gfam" selected="true">GFam</option>
+            <option value="orthofinder">OrthoFinder</option>
+            <option value="orthomcl">OrthoMCL</option>
+        </param>
+    </xml>
+    <xml name="param_options_type">
+        <param name="options_type" type="select" label="Options Configuration">
+            <option value="basic" selected="true">Basic</option>
+            <option value="advanced">Advanced</option>
+        </param>
+    </xml>
+    <xml name="param_orthogroup_fna">
+        <param name="orthogroup_fna" type="select" label="Process corresponding gene family classification orthogroups CDS fasta files?">
+            <option value="yes" selected="true">Yes</option>
+            <option value="no">No</option>
+        </param>
+    </xml>
+    <xml name="param_scaffold">
+        <param name="scaffold" type="select" label="Orthogroups or gene families proteins scaffold">
+            <options from_data_table="plant_tribes_scaffolds" />
+            <validator type="no_options" message="No PlantTribes scaffolds are available.  Use the PlantTribes Scaffolds Download Data Manager tool in Galaxy to install and populate the PlantTribes scaffolds data table." />
+        </param>
+    </xml>
+    <xml name="param_sequence_type">
+        <param name="sequence_type" type="select" label="Sequence type used in the phylogenetic inference (dna)">
+            <option value="protein" selected="true">Amino acid based</option>
+            <option value="dna">Nucleotide based</option>
+        </param>
+    </xml>
+    <xml name="cond_alignment_method">
+        <conditional name="alignment_method_cond">
+            <param name="alignment_method" type="select" force_select="true" label="Select method for multiple sequence alignments">
+                <option value="mafft" selected="true">MAFFT algorithm</option>
+                <option value="pasta">PASTA algorithm</option>
+            </param>
+            <when value="mafft" />
+            <when value="pasta">
+                <param name="pasta_iter_limit" type="integer" value="3" min="1" label="Maximum number of iterations that the PASTA algorithm will execute" />
+            </when>
+        </conditional>
+    </xml>
+    <xml name="cond_remove_gappy_sequences">
+        <conditional name="remove_gappy_sequences_cond">
+            <param name="remove_gappy_sequences" type="select" label="Remove gappy sequences in alignments?">
+                <option value="no" selected="true">No</option>
+                <option value="yes">Yes</option>
+            </param>
+            <when value="no" />
+            <when value="yes">
+                <conditional name="trim_type_cond">
+                    <param name="trim_type" type="select" label="Select process used for trimming">
+                        <option value="gap_trimming" selected="true">Remove gappy sites in alignments (gap trimming)</option>
+                        <option value="automated_trimming">Trim alignments using trimAl's ML heuristic trimming approach (automated trimming)</option>
+                    </param>
+                    <when value="gap_trimming">
+                        <param name="gap_trimming" type="float" value="0" min="0" max="1.0" label="Remove sites in alignments with gaps of" help="Zero value has no effect" />
+                    </when>
+                    <when value="automated_trimming" />
+                </conditional>
+                <conditional name="remove_sequences_with_gaps_cond">
+                    <param name="remove_sequences_with_gaps" type="select" label="Remove sequences with specified gaps?">
+                        <option value="no" selected="true">No</option>
+                        <option value="yes">Yes</option>
+                    </param>
+                    <when value="no" />
+                    <when value="yes">
+                        <param name="remove_sequences_with_gaps_of" type="float" value="0" min="0" max="1" label="Remove sequences with gaps of" help="Zero value has no effect" />
+                        <param name="iterative_realignment" type="integer" value="0" min="0" label="Maximum number of iterations" help="Zero value has no effect"/>
+                    </when>
+                </conditional>
+            </when>
+        </conditional>
+    </xml>
+    <xml name="citation1">
+        <citation type="bibtex">
+            @misc{None,
+            journal = {None},
+            author = {1. Wafula EK},
+            title = {Manuscript in preparation},
+            year = {None},
+            url = {https://github.com/dePamphilis/PlantTribes},}
+        </citation>
+    </xml>
+    <xml name="citations2to4">
+        <citation type="bibtex">
+            @article{Sasidharan2012,
+            journal = {Nucleic Acids Research},
+            author = {2. Sasidharan R, Nepusz T, Swarbreck D, Huala E, Paccanaro A},
+            title = {GFam: a platform for automatic annotation of gene families},
+            year = {2012},
+            pages = {gks631},}
+        </citation>
+        <citation type="bibtex">
+            @article{Li2003,
+            journal = {Genome Research},
+            author = {3. Li L, Stoeckert CJ, Roos DS},
+            title = {OrthoMCL: identification of ortholog groups for eukaryotic genomes},
+            year = {2003},
+            volume = {13},
+            number = {9},
+            pages = {2178-2189},}
+        </citation>
+        <citation type="bibtex">
+            @article{Emms2015,
+            journal = {Genome Biology},
+            author = {4. Emms DM, Kelly S},
+            title = {OrthoFinder: solving fundamental biases in whole genome comparisons dramatically improves orthogroup inference accuracy},
+            year = {2015},
+            volume = {16},
+            number = {1},
+            pages = {157},}
+        </citation>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/plant_tribes_scaffolds.loc.sample	Tue Apr 11 11:05:24 2017 -0400
@@ -0,0 +1,4 @@
+## Plant Tribes scaffolds
+#Value	Name	Path	Description
+#22Gv1.0	22Gv1.0	/plant_tribes/scaffolds/22Gv1.0	22 plant genomes (Angiosperms clusters, version 1.0; 22Gv1.0)
+#22Gv1.1	22Gv1.1	/plant_tribes/scaffolds/22Gv1.1	22 plant genomes (Angiosperms clusters, version 1.1; 22Gv1.1)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool_data_table_conf.xml.sample	Tue Apr 11 11:05:24 2017 -0400
@@ -0,0 +1,6 @@
+<tables>
+    <table name="plant_tribes_scaffolds" comment_char="#">
+        <columns>value, name, path, description</columns>
+        <file path="tool-data/plant_tribes_scaffolds.loc" />
+    </table>
+</tables>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.py	Tue Apr 11 11:05:24 2017 -0400
@@ -0,0 +1,42 @@
+import os
+import shutil
+import sys
+
+
+def check_execution_errors(rc, stderr):
+    if rc != 0:
+        stop_err(stderr.read())
+
+
+def move_directory_files(source_dir, destination_dir):
+    source_directory = os.path.abspath(source_dir)
+    destination_directory = os.path.abspath(destination_dir)
+    if not os.path.isdir(destination_directory):
+        os.makedirs(destination_directory)
+    for dir_entry in os.listdir(source_directory):
+        source_entry = os.path.join(source_directory, dir_entry)
+        shutil.move(source_entry, destination_directory)
+
+
+def stop_err(msg):
+    sys.stderr.write(msg)
+    sys.exit(1)
+
+
+def write_html_output(output, title, dir):
+    with open(output, 'w') as fh:
+        fh.write('<html><head><h3>%s</h3></head>\n' % title)
+        fh.write('<body><p/><table cellpadding="2">\n')
+        fh.write('<tr><th>Size</th><th>Name</th></tr>\n')
+        for index, fname in enumerate(sorted(os.listdir(dir))):
+            if index % 2 == 0:
+                bgcolor = '#D8D8D8'
+            else:
+                bgcolor = '#FFFFFF'
+            try:
+                size = str(os.path.getsize(os.path.join(dir, fname)))
+            except:
+                size = 'unknown'
+            link = '<a href="%s" type="text/plain">%s</a>\n' % (fname, fname)
+            fh.write('<tr bgcolor="%s"><td>%s</td><td>%s</td></tr>\n' % (bgcolor, size, link))
+        fh.write('</table></body></html>\n')