diff beast.xml @ 0:d67268158946 draft

planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
author bcclaywell
date Mon, 12 Oct 2015 17:43:33 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/beast.xml	Mon Oct 12 17:43:33 2015 -0400
@@ -0,0 +1,486 @@
+<tool id="ARGO_beast_run" name="Run BEAST" version="1.0.0">
+  <description>Perform a discrete trait ancestral reconstruction analysis using BEAST</description>
+  <requirements>
+    <requirement type="package" version="0.1">argo_env</requirement>
+  </requirements> <!-- Declares a dependency on version 0.1 of the "argo_env" package. -->
+  <macros>
+    <import>macros.xml</import>
+  </macros> <!-- macros.xml is imported for the macro expanded in stdio below; its contents are not visible in this file. -->
+  <command interpreter="bash">
+    beast.sh ${config}
+  </command> <!-- Runs beast.sh under bash with a single argument: the "config" configfile declared in the configfiles section. -->
+  <stdio>
+    <expand macro="basic_errors"/>
+  </stdio> <!-- Error detection comes from the basic_errors macro (presumably defined in macros.xml; verify there). -->
+  <inputs>
+
+
+    <!-- BASIC DATA SPECIFICATION -->
+    <!-- ======================== -->
+    <conditional name="alignment"> <!-- Source of the sequence alignment; the configfile reads $alignment.specification and, for "file", $alignment.file. -->
+      <param name="specification" type="select" label="Alignment specification"
+        help="You can either specify your own alignment data, or use the alignment data in the BEAST config file below.">
+        <option value="file" selected="true">Alignment file</option>
+        <option value="in_beastfile">Use alignment in BEAST config file</option>
+      </param>
+      <when value="file">
+        <param name="file" type="data" format="fasta" label="Sequence alignment"
+          help="This is optional if you want to use the alignment in the BEAST config file."/>
+      </when>
+      <when value="in_beastfile"/> <!-- No inputs: the configfile writes ALIGNMENT="" for this choice. -->
+    </conditional>
+
+    <conditional name="metadata"> <!-- Where deme/date information comes from: a CSV file, regexes applied to sequence names, or the BEAST config file itself. -->
+      <param name="specification" type="select" label="Metadata specification"
+        help="Argo Navis needs to know how sequences split into demes, and whether there is date information.
+        In this menu, you can specify whether you want to use a CSV metadata file, or regular expressions for extracting this data from sequence names.">
+        <option value="file" selected="true">Metadata file</option>
+        <option value="regex">From sequence names</option>
+        <option value="in_beastfile">Use mappings in BEAST config file</option>
+      </param>
+      <when value="file">
+        <param name="file" type="data" format="csv" label="Metadata file"
+          help="CSV file specifying deme and (optionally) date data."/>
+        <param name="deme_column" type="text" label="Deme column" value="deme"
+          help="Column in CSV file to treat as the deme specification"/>
+        <param name="date_column" type="text" label="Date column" value="" optional="true"
+          help="Column in CSV file to treat as the date data. If left blank, all tips are set to time 0 by BEAST."/>
+      </when>
+      <!--XXX - Have to tune these regular expressions. Both sanitizers below replace single quotes with __sq__ because the configfile wraps these values in single quotes.-->
+      <when value="regex">
+        <param name="deme_regex" type="text" value="^[^\s\|]+\|([^\|\s]+)" label="Deme name regular expression"
+          help="Defaults to second value in a 2-tuple or triple of '|' separated values.
+          For help working with regular expressions, see rubular.com.
+          Use () to capture the part of the match you want to extract as the deme name.">
+          <sanitizer>
+            <valid initial="string.printable">
+             <remove value="&apos;"/>
+            </valid>
+            <mapping initial="none">
+              <add source="&apos;" target="__sq__"/>
+            </mapping>
+          </sanitizer>
+        </param>
+        <param name="date_regex" type="text" value="^[^\s\|]+\|(?:[^\|\s]+)\|([\d]+\.?[\d]*)"
+          label="Date regular expression" optional="true"
+          help="This regular expression should extract a numeric value (integer or decimal), which can be interpreted as a time value.
+          Whether it is years, months, days (etc.) is up to you.">
+          <sanitizer>
+            <valid initial="string.printable">
+             <remove value="&apos;"/>
+            </valid>
+            <mapping initial="none">
+              <add source="&apos;" target="__sq__"/>
+            </mapping>
+          </sanitizer>
+        </param>
+      </when>
+      <when value="in_beastfile">
+        <!--No inputs: the configfile sets every metadata variable to an empty string for this choice-->
+      </when>
+    </conditional>
+
+    <param name="samples" type="integer" value="10000" label="Number of samples"
+      help="Number of samples to take."/> <!-- Written to the configfile as SAMPLES_FLAG="-s N". -->
+    <!--XXX Will have to check to make sure logging frequency is the same between -resume runs-->
+    <param name="sampling_interval" type="integer" value="1000" label="Sampling interval"
+      help="Number of states explored in chain between samples."/> <!-- Written to the configfile as SAMPLING_INTERVAL_FLAG="-i N". -->
+    <param name="random_seed" type="integer" value="" label="Random seed" optional="true"
+      help="Random seed to be used in BEAST MCMC. Specifying and recording this value can aid in reproducibility."/> <!-- Written as RANDOM_SEED_FLAG="-seed N", or as an empty string when left blank. -->
+
+
+    <!-- MANUAL BEASTFILE SPECIFICATION -->
+    <!-- ============================== -->
+    <conditional name="beastfile"> <!-- Default BEAST template versus a user-supplied one; the configfile only defines BEASTFILE_TEMPLATE for "custom". -->
+      <param name="specification" type="select" label="BEAST file specification"
+        help="If you'd like to run a more custom analysis, you can specify your own BEAST file (see detailed help below).
+        Additionally, if you are performing a resume run, you will need to specify the BEAST file output by the previous run.">
+        <option value="default" selected="true">Default</option>
+        <option value="custom">Custom or resume</option>
+      </param>
+      <when value="default"/> <!-- No inputs for the default template. -->
+      <when value="custom">
+        <param name="template" type="data" format="beastfile" label="BEAST config file"
+          help="Note that any alignment or deme/date data specified above will override whatever is present in this file.
+          However, having this data specified in a custom file means you can forgo the inputs above."/>
+      </when>
+    </conditional>
+
+
+    <!-- HERE THERE BE DOWNSAMPLING OPTIONS -->
+    <!-- ================================== -->
+    <conditional name="downsampling"> <!-- Optional per-deme downsampling; the configfile reads $downsampling.method, $downsampling.k and $downsampling.random_seed. -->
+      <param name="method" type="select" display="radio" label="Downsampling method"
+        help="Downsampling sets a maximum number of sequences per deme.
+        This can help you investigate issues of sampling bias and sampling depth.
+        Random downsampling is recommended for concerns relating to sampling depth, while K-means is recommended for
+        addressing sampling bias.
+        Please see the help below for more detailed information.">
+        <option value="none" selected="true">None</option>
+        <option value="random">Random</option>
+        <option value="kmeans">K-means</option>
+      </param>
+      <when value="none">
+        <!--No inputs: the configfile skips DOWNSAMPLING_K and the seed entirely for "none"-->
+      </when>
+      <when value="random"> <!-- Same parameter names as the kmeans branch, so the configfile can read them without checking which method was picked. -->
+        <param name="k" type="integer" value="" label="N sequences"
+          help="Number of sequences to be taken per deme."/>
+        <param name="random_seed" type="integer" value="" label="Random seed" optional="true"
+          help="Random seed to be used for sequence selections."/>
+      </when>
+      <when value="kmeans">
+        <param name="k" type="integer" value="" label="N sequences"
+          help="Number of sequences to be taken per deme."/>
+        <param name="random_seed" type="integer" value="" label="Random seed" optional="true"
+          help="Random seed to be used for sequence selections."/>
+      </when>
+    </conditional>
+
+
+    <!-- RESUME RUN OPTIONS -->
+    <!-- ================== -->
+    <conditional name="resume"> <!-- selector is compared as the strings "true"/"false" by the output filters and by the configfile. -->
+      <param name="selector" type="select" label="Resume from a previous run?"
+        help="If you ran BEAST but found it didn't run long enough, you can resume your run from where you left off.
+        Note you will need to specify some of the previous run's outputs here and in the 'BEAST file specification' above,
+        and the only other parameters that will have any effect on the run will be the number of samples.
+        Note also this probably isn't worth doing if the previous run didn't take very long.">
+        <option value="false" selected="true">No thanks</option>
+        <option value="true">Yes please</option>
+      </param>
+      <when value="false">
+        <!--No inputs for a fresh (non-resume) run-->
+      </when>
+      <when value="true"> <!-- These four values are written to the configfile as RESUME_LOGFILE, RESUME_TREEFILE, RESUME_STATEFILE and RESUME_SAMPLES. -->
+        <param name="logfile" type="data" format="full_logfile" label="Logfile from last run"
+          help="Cannot be the 'trimmed' output from previous run, but must be the full output."/>
+        <param name="treefile" type="data" format="full_treefile" label="Treefile from last run"
+          help="As with the Logfile, cannot be the `trimmed` output."/>
+        <param name="statefile" type="data" format="statefile" label="State file from previous run."/>
+        <param name="samples" type="integer" value="10000" label="Number of samples to keep"
+          help="With resume runs, the log and tree files can grow quite large, bogging down PACT and other analysis tools.
+          Therefore, after a resume run, 'trimmed' output files are created with this number of samples."/>
+      </when>
+    </conditional>
+
+  </inputs>
+
+  <outputs>
+    <data name="logfile" format="full_logfile" label="BEAST logfile"/>
+    <data name="treefile" format="full_treefile" label="BEAST treefile"/>
+    <data name="statefile" format="statefile" label="BEAST state file"/>
+    <data name="ess" format="html" label="BEAST effective sample size stats"/>
+    <!-- The four outputs above carry no filter; each output below is only created when its filter expression holds. -->
+    <data name="formatted_beastfile" format="beastfile" label="BEAST config file">
+      <filter>resume['selector'] != "true"</filter>
+    </data> <!-- Omitted on resume runs; the configfile then substitutes a plain local file name. -->
+    <data name="downsampled_alignment" format="fasta" label="Downsampled alignment">
+      <filter>downsampling['method'] != "none"</filter>
+    </data>
+    <data name="downsampled_metadata" format="csv" label="Downsampled metadata">
+      <filter>downsampling['method'] != "none" and metadata['specification'] == "file"</filter>
+    </data> <!-- Needs both a downsampling method and a CSV metadata file. -->
+    <data name="trimmed_logfile" format="trimmed_logfile" label="Trimmed BEAST logfile">
+      <filter>resume['selector'] == "true"</filter>
+    </data> <!-- Resume runs only. -->
+    <data name="trimmed_treefile" format="trimmed_treefile" label="Trimmed BEAST treefile">
+      <filter>resume['selector'] == "true"</filter>
+    </data> <!-- Resume runs only. -->
+  </outputs>
+
+  <configfiles>
+    <!-- Rendered into a file of NAME="value" assignments whose path is handed to beast.sh. XXX Not sure if these deme specifications will actually work or not ??? -->
+    <configfile name="config">
+
+## Alignment source: the dataset path when a file was chosen, otherwise empty.
+#if $alignment.specification == "file"
+ALIGNMENT="${alignment.file}"
+#else
+ALIGNMENT=""
+#end if
+
+## Metadata source: CSV columns, sequence-name regexes, or nothing (use the mappings already in the BEAST file).
+METADATA_SPECIFICATION="${metadata.specification}"
+#if $metadata.specification == "file"
+METADATA_FILE="${metadata.file}"
+DEME_COLUMN="${metadata.deme_column}"
+#if str($metadata.date_column) not in ["None", ""]
+DATE_COLUMN="${metadata.date_column}"
+#else
+DATE_COLUMN=""
+#end if
+#elif $metadata.specification == "regex"
+DEME_REGEX='${metadata.deme_regex}'
+#if str($metadata.date_regex) != 'None'
+DATE_REGEX='${metadata.date_regex}'
+#else
+DATE_REGEX=''
+#end if
+#else
+METADATA_FILE=""
+DEME_REGEX=""
+DEME_COLUMN=""
+DATE_COLUMN=""
+DATE_REGEX=""
+#end if
+## MCMC settings; the seed flag is left empty when no seed was entered.
+SAMPLES_FLAG="-s ${samples}"
+SAMPLING_INTERVAL_FLAG="-i ${sampling_interval}"
+#if str($random_seed) not in ["None", ""]
+RANDOM_SEED_FLAG="-seed ${random_seed}"
+#else
+RANDOM_SEED_FLAG=""
+#end if
+## BEAST file choice; BEASTFILE_TEMPLATE is only defined for "custom".
+BEASTFILE_SPECIFICATION="${beastfile.specification}"
+#if $beastfile.specification == "custom"
+BEASTFILE_TEMPLATE="${beastfile.template}"
+#end if
+## Resume inputs; only defined when resuming from a previous run.
+RESUME_SELECTOR="${resume.selector}"
+#if $resume.selector == "true"
+RESUME_LOGFILE="${resume.logfile}"
+RESUME_TREEFILE="${resume.treefile}"
+RESUME_STATEFILE="${resume.statefile}"
+RESUME_SAMPLES="${resume.samples}"
+#end if
+## Downsampling; a blank seed is treated like the BEAST seed above, so the variable is always defined and empty means no seed.
+DOWNSAMPLING_METHOD="${downsampling.method}"
+#if $downsampling.method != "none"
+DOWNSAMPLING_K="${downsampling.k}"
+#if str($downsampling.random_seed) not in ["None", ""]
+DOWNSAMPLING_RANDOM_SEED="${downsampling.random_seed}"
+#else
+DOWNSAMPLING_RANDOM_SEED=""
+#end if
+#end if
+## Paths of the output datasets that are always produced.
+LOGFILE="${logfile}"
+TREEFILE="${treefile}"
+ESS="${ess}"
+## Resume runs have no formatted_beastfile output dataset, so a plain local file name is substituted.
+#if $resume.selector != "true"
+FORMATTED_BEASTFILE="${formatted_beastfile}"
+#else
+FORMATTED_BEASTFILE="formatted_beastfile.xml"
+#end if
+STATEFILE="${statefile}"
+
+#if $downsampling.method != "none"
+DOWNSAMPLED_ALIGNMENT="${downsampled_alignment}"
+#if $metadata.specification == "file"
+DOWNSAMPLED_METADATA="${downsampled_metadata}"
+#end if
+#end if
+
+#if $resume.selector == "true"
+TRIMMED_LOGFILE="${trimmed_logfile}"
+TRIMMED_TREEFILE="${trimmed_treefile}"
+#end if
+
+    </configfile>
+  </configfiles>
+
+  <!-- The contents of the help tag is parsed as reStructuredText. Please see
+       help-template.rst for examples of commonly-used sections in other Galaxy
+       tools. -->
+  <help>
+
+.. class:: infomark
+
+
+About this tool
+---------------
+
+This tool is your starting point for running an Argo Navis analysis.
+It runs BEAST for you, and produces a treefile suitable for running through the Argo Navis PACT tool, which facilitates
+visualization and analyses of your data.
+
+
+Input files
+-----------
+
+To run this tool, you'll need some sequence data, and a way to specify which sequences belong to which demes.
+The recommended way of doing this is to specify a FASTA file of aligned sequences, and a separate metadata CSV file with
+deme specifications.
+This metadata file may also be used to specify temporal information for your sequences.
+While the column names pointing to the deme and date information are configurable, there must be a "sequence" column
+corresponding to the names of the sequences in the FASTA file.
+Additionally, there must be no sequences represented in one file not represented in the other.
+
+It is also possible to specify deme and date information directly in the FASTA file sequence names using regular expressions.
+To do this, select "From sequence names" from the "Metadata specification" dropdown.
+This will open up options for deme and date regular expressions which will be used to extract the information from
+the sequence names.
+The default behaviour is to parse the sequence names as "|" separated values like this: "name|deme|date".
+If you'd like help customizing these regular expressions, please see the `regex tutorial`_ and `rubular`_.
+
+For more custom analyses, you can also specify your own BEAST config file.
+This lets you customize model details, priors, and parameters.
+If you wish to do this, take a look at the **Customizing your BEAST config file** section below.
+
+
+Output files
+------------
+
+The following files are created as the result of an Argo Navis BEAST run:
+
+**BEAST logfile**: A standard BEAST logfile of sampled parameters, suitable for analysis with Tracer.
+
+**BEAST treefile**: A nexus file of trees from the posterior, with ancestral states labeled, suitable for
+analysis with PACT.
+
+**BEAST effective sample size stats**: Statistics about whether BEAST has run long enough.
+You can click on the eye icon next to this file to see this data in-browser.
+
+Once you've carried out a run, you should always check the ESS statistics before doing anything else.
+This file will contain a "Recommendation", indicating whether the data is ready to use, or whether you
+should run BEAST longer, and if so, how much longer.
+
+While the ESS statistic is an easy way to get a sense for whether you should run BEAST more or not, it's also
+a good idea to manually review the logfile using `Tracer`_, before accepting the results as final and
+passing them along through the rest of the analysis.
+Please see the tutorial in the Introduction for a description of how to do this.
+
+
+Resume runs
+-----------
+
+If you run BEAST and realize it needs to run longer, you can save time by doing a "Resume" run.
+This lets you pick up from where you left off, keeping you from having to start over.
+
+Note that in addition to the output files listed above, an initial run of BEAST will also produce the following:
+
+**BEAST config file**: A copy of the BEAST file created for the run.
+This has been modified to include sequence, deme, and date information, as well as various other run settings.
+
+**BEAST state file**: This is a file that BEAST maintains as it runs so Resume runs can pick up where they left off.
+
+In order to start a Resume run, start by entering the number of new samples you'd like to add to the data you've already collected.
+Next, in the "BEAST file specification" input, select "Custom or resume".
+This will open up a file input from which you can select the BEAST config file from the previous run.
+Next, select "Yes please" in the "Resume from a previous run?" form input.
+This will open up several new input fields where you can specify the logfile, treefile and statefile from your previous run.
+
+You can also specify a total number of samples you want to keep.
+When running a Resume run, your log and tree files can grow large enough to bog down analysis tools.
+This problem is dealt with by evenly trimming these files to the specified number of samples during Resume runs.
+It's important to note, however, that if you do further Resume runs, you must make sure to specify the **untrimmed** log and
+tree files for the following runs.
+If you don't, bad things will happen (someone will kill a puppy; seriously).
+
+It's also important to note that Resume runs ignore all inputs not explicitly mentioned above.
+In particular, you can't change the sequences, the date information, deme specifications or sampling interval for a Resume run.
+If you *do* want to change any of this, you must start a new run.
+
+
+Random seeds
+------------
+
+Both the actual BEAST run and the downsampling methods support specification of random seeds.
+Taking advantage of this aids in reproducibility of your analyses, as someone else running with the same data, settings and
+random seeds should be able to reproduce the results exactly.
+If you choose not to specify your own random seeds, random seeds will be chosen for you and printed out in the logs, so
+if you wish to go back and find these values, you can.
+
+
+Downsampling
+------------
+
+Differences in sampling between demes you study can be a challenge when doing these analyses.
+How do you know that one deme having been sampled more deeply isn't artificially biasing the results towards increased
+diversity for that deme?
+It's also frequently the case that researchers studying viruses don't submit every viral sequence they obtain, but decide
+which look "different enough" from the others (and from those already observed) to submit to online repositories.
+And typically, the strategy used for deciding which sequences to submit and which not to is not made clear in publications.
+As such, treating sequences found in these repositories as an unbiased sample is problematic, and has even greater
+potential for introducing diversity bias into your results.
+
+Issues such as these frequently come up in reviewer critiques.
+One way to deal with this is to show evidence that the sampling issues aren't significantly affecting the results,
+or at the very least, to explore the effects of different sampling strategies on the results.
+Argo Navis provides downsampling tools which help you towards this end.
+
+There are two downsampling methods you can use: Random and K-means.
+Random downsampling is meant to deal with issues of only sampling depth, while K-means downsampling is meant to
+address issues of sampling bias.
+While Random downsampling is likely fairly straight-forward, K-means downsampling may not be.
+
+K-means downsampling starts by clustering sequences together into K clusters.
+From each of these clusters a single sequence is chosen to represent the entire cluster.
+This is done for each deme.
+This method introduces intentional, but controlled sampling bias, as for any given number of K samples per
+deme, the sequences chosen are going to reflect the *most* diversity possible among those sequences.
+While this doesn't *solve* the problem of sampling bias, it does attempt to put all demes on equal footing, by
+introducing similar diversity bias in all sequences.
+
+A decent strategy for protecting yourself from reviewers is to use *both* of these methods at various K values,
+and compare the results.
+Assuming the results look similar in all cases, then great!
+It would appear you have a very robust set of conclusions.
+If there are differences, there will be a bit more work involved in explaining them, but at least you'll be able to say
+you've thoroughly explored the issue.
+
+On a final note, if you *do* decide to use BEAST's downsampling functionality, you will obtain a couple of additional
+outputs:
+
+**Downsampled alignment**: The sequences kept in the downsampling process, in FASTA format.
+
+**Downsampled metadata**: If you supplied metadata for the analysis, you will also get a copy of that file containing
+metadata for only the sequences kept in the downsampling procedure.
+
+These are not required for any other part of Argo Navis, but are offered for your convenience so you can see which
+sequences were chosen and use the same data subsets in other analyses you might perform outside of Argo Navis.
+
+
+Customizing your BEAST config file
+----------------------------------
+
+Customizing your BEAST config file opens up the full power of BEAST, letting you specify model details, priors, and
+other parameters.
+If you wish to do this, it's recommended you start by working from this tool's `default BEAST config file`_.
+From there, you can specify customizations using the BEAUti program, which is installed alongside BEAST2 (see the
+`BEAST2 homepage`_ for downloads).
+
+Before you can load the config file, you'll have to install the BEAST Classic module from within BEAUti.
+Simply go to File > Manage Packages, click on "BEAST_CLASSIC", then click "Install".
+You can now go to File > Load to select the default BEAST config file.
+Once you've made your edits, save your new config file by going to File > Save As.
+You can then upload the file into Galaxy by going to Get Data > Upload File within the Galaxy Tool menu, making sure to
+specify "beastfile" as the file type.
+
+Once you have your file loaded, select "Custom or resume" from "BEAST file specification", and point the file selector that pops
+up to the new config file you just uploaded.
+If you wish, you can also tell Argo Navis to use the alignment and deme data in the BEAST config file by selecting the
+appropriate options in the "Alignment specification" and "Metadata specification" dropdowns.
+If you don't do this, whatever files/options are specified for Sequence alignment and Metadata specification will replace the
+data specified in your BEAST config file.
+
+Some notes to keep in mind if you specify your own BEAST config:
+
+1. If you choose to specify the deme/community information directly in your BEAST file and not apply the CSV or regular
+   expression data to set this, you must name your discrete trait "deme", even if it refers to something else, like
+   species name or tissue type.
+2. Note that the chain length and sampling interval settings in your BEAST config file will get overridden by whatever is
+   specified here in this tool, and can not be assumed to remain as specified in your BEAST config file.
+3. If you don't have tip dates, you can speed up your run by specifying a custom beast file where your fixed clock's rate isn't
+   estimated but left constant. If you don't have any temporal data, these estimates won't mean anything anyway.
+
+Note that the default template closely follows the setup in the `Ancestral State Reconstruction tutorial`_ by Remco Bouckaert,
+which you can use as a guide for setting up your own custom BEAST analysis.
+The `BEAST homepage`_ is also a useful resource if you're looking for help setting things up.
+
+
+
+.. _BEAST2 homepage: http://www.beast2.org/
+.. _default BEAST config file: http://xxx.doesntexist.yet
+.. _regex tutorial: http://regexone.com/lesson/0
+.. _rubular: http://rubular.com
+.. _Tracer: http://tree.bio.ed.ac.uk/software/tracer/
+.. _Ancestral State Reconstruction tutorial: http://beast-classic.googlecode.com/files/ARv2.0.1.pdf
+.. _BEAST homepage: http://beast.bio.ed.ac.uk/
+
+
+  </help>
+</tool>