Mercurial > repos > bcclaywell > argo_navis

<tool id="ARGO_beast_run" name="Run BEAST" version="1.0.0">
  <description>Perform a discrete trait ancestral reconstruction analysis using BEAST</description>
  <!-- Runtime dependency: the "argo_env" package, version 0.1. -->
  <requirements>
    <requirement type="package" version="0.1">argo_env</requirement>
  </requirements>
  <!-- Macros expanded in this file (e.g. "basic_errors" under stdio) are imported from macros.xml. -->
  <macros>
    <import>macros.xml</import>
  </macros>
  <!-- The job runs beast.sh through bash. Its single argument is the path of the
       file rendered from the configfile named "config" further down; every tool
       setting reaches the script through that file rather than through flags. -->
  <command interpreter="bash">
    beast.sh ${config}
  </command>
  <!-- Error detection rules come from the "basic_errors" macro.
       NOTE(review): presumably defined in macros.xml; confirm there. -->
  <stdio>
    <expand macro="basic_errors"/>
  </stdio>
  <inputs>


    <!-- BASIC DATA SPECIFICATION -->
    <!-- ======================== -->
    <!-- Source of the sequence alignment: an uploaded FASTA dataset, or the
         alignment already embedded in a user-supplied BEAST config file. -->
    <conditional name="alignment">
      <param name="specification" type="select" label="Alignment specification"
        help="You can either specify your own alignment data, or use the alignment data in the BEAST config file below.">
        <option value="file" selected="true">Alignment file</option>
        <option value="in_beastfile">Use alignment in BEAST config file</option>
      </param>
      <when value="file">
        <!-- This dataset is required whenever the "file" branch is selected (the
             param is not marked optional). The way to skip it is the
             "in_beastfile" option above, so the help text points the user there
             instead of calling this input optional. -->
        <param name="file" type="data" format="fasta" label="Sequence alignment"
          help="FASTA file of aligned sequences. To use the alignment already present in a custom BEAST config file instead, change the 'Alignment specification' above."/>
      </when>
      <!-- No extra inputs: the config template emits an empty ALIGNMENT value for this choice. -->
      <when value="in_beastfile"/>
    </conditional>

    <!-- How sequences are assigned to demes (and optionally dated): a CSV dataset,
         regular expressions applied to the sequence names, or whatever is already
         present in a user-supplied BEAST config file. -->
    <conditional name="metadata">
      <param name="specification" type="select" label="Metadata specification"
        help="Argo Navis needs to know how sequences split into demes, and whether there is date information.
        In this menu, you can specify whether you want to use a CSV metadata file, or regular expressions for extracting this data from sequence names.">
        <option value="file" selected="true">Metadata file</option>
        <option value="regex">From sequence names</option>
        <option value="in_beastfile">Use mappings in BEAST config file</option>
      </param>
      <when value="file">
        <param name="file" type="data" format="csv" label="Metadata file"
          help="CSV file specifying deme and (optionally) date data."/>
        <param name="deme_column" type="text" label="Deme column" value="deme"
          help="Column in CSV file to treat as the deme specification"/>
        <!-- Optional: the config template writes an empty DATE_COLUMN when this is left blank. -->
        <param name="date_column" type="text" label="Date column" value="" optional="true"
          help="Column in CSV file to treat as the date data. If left blank, all tips are set to time 0 by BEAST."/>
      </when>
      <!--XXX - Have to tune these regular expressions-->
      <when value="regex">
        <!-- Both regexes are written into the config file between single quotes.
             The sanitizers therefore allow every printable character except the
             single quote, which is mapped to __sq__ so it cannot terminate the
             quoted value early. -->
        <param name="deme_regex" type="text" value="^[^\s\|]+\|([^\|\s]+)" label="Deme name regular expression"
          help="Defaults to second value in a 2-tuple or triple of '|' separated values.
          For help working with regular expressions, see rubular.com.
          Use () to capture the part of the match you want to extract as the deme name.">
          <sanitizer>
            <valid initial="string.printable">
              <remove value="&apos;"/>
            </valid>
            <mapping initial="none">
              <add source="&apos;" target="__sq__"/>
            </mapping>
          </sanitizer>
        </param>
        <!-- The default pattern captures digits with an optional decimal part, so
             the help describes the captured value as numeric rather than integer. -->
        <param name="date_regex" type="text" value="^[^\s\|]+\|(?:[^\|\s]+)\|([\d]+\.?[\d]*)"
          label="Date regular expression" optional="true"
          help="This regular expression should extract a numeric value (whole or decimal), which can be interpreted as a time value.
          Whether it is years, months, days (etc.) is up to you.">
          <sanitizer>
            <valid initial="string.printable">
              <remove value="&apos;"/>
            </valid>
            <mapping initial="none">
              <add source="&apos;" target="__sq__"/>
            </mapping>
          </sanitizer>
        </param>
      </when>
      <when value="in_beastfile">
        <!--Nothing-->
      </when>
    </conditional>

    <!-- Run-length settings. Both counts carry min="1" so that zero or negative
         values are rejected by the form instead of reaching the run. -->
    <param name="samples" type="integer" value="10000" min="1" label="Number of samples"
      help="Number of samples to take."/>
    <!--XXX Will have to check to make sure logging frequency is the same between -resume runs-->
    <param name="sampling_interval" type="integer" value="1000" min="1" label="Sampling interval"
      help="Number of states explored in chain between samples."/>
    <!-- Optional: when left blank the config template emits an empty RANDOM_SEED_FLAG. -->
    <param name="random_seed" type="integer" value="" label="Random seed" optional="true"
      help="Random seed to be used in BEAST MCMC. Specifying and recording this value can aid in reproducibility."/>


    <!-- MANUAL BEASTFILE SPECIFICATION -->
    <!-- ============================== -->
    <!-- Chooses between the default analysis and a user-supplied BEAST config
         file. The config template always emits BEASTFILE_SPECIFICATION and adds
         BEASTFILE_TEMPLATE only for the "custom" choice. -->
    <conditional name="beastfile">
      <param name="specification" type="select" label="BEAST file specification"
        help="If you'd like to run a more custom analysis, you can specify your own BEAST file (see detailed help below).
        Additionally, if you are performing a resume run, you will need to specify the BEAST file output by the previous run.">
        <option value="default" selected="true">Default</option>
        <option value="custom">Custom or resume</option>
      </param>
      <when value="default"/>
      <when value="custom">
        <!-- The "beastfile" format matches the format of this tool's own
             formatted_beastfile output, so a previous run's config can be fed back in.
             NOTE(review): presumably a datatype registered elsewhere in this repository; confirm. -->
        <param name="template" type="data" format="beastfile" label="BEAST config file"
          help="Note that any alignment or deme/date data specified above will override whatever is present in this file.
          However, having this data specified in a custom file means you can forgo the inputs above."/>
      </when>
    </conditional>


    <!-- HERE THERE BE DOWNSAMPLING OPTIONS -->
    <!-- ================================== -->
    <!-- Optional cap on the number of sequences kept per deme. The "random" and
         "kmeans" branches expose the same two parameters; the config template
         reads them as downsampling.k and downsampling.random_seed for either method. -->
    <conditional name="downsampling">
      <param name="method" type="select" display="radio" label="Downsampling method"
        help="Downsampling sets a maximum number of sequences per deme.
        This can help you investigate issues of sampling bias and sampling depth.
        Random downsampling is recommended for concerns relating to sampling depth, while K-means is recommended for
        addressing sampling bias.
        Please see the help below for more detailed information.">
        <option value="none" selected="true">None</option>
        <option value="random">Random</option>
        <option value="kmeans">K-means</option>
      </param>
      <when value="none">
        <!--Do nothing-->
      </when>
      <when value="random">
        <!-- No default is offered, so the user must enter a value; min="1" rejects
             zero and negative sequence counts. -->
        <param name="k" type="integer" value="" min="1" label="N sequences"
          help="Number of sequences to be taken per deme."/>
        <param name="random_seed" type="integer" value="" label="Random seed" optional="true"
          help="Random seed to be used for sequence selections."/>
      </when>
      <when value="kmeans">
        <!-- Same constraints as in the "random" branch. -->
        <param name="k" type="integer" value="" min="1" label="N sequences"
          help="Number of sequences to be taken per deme."/>
        <param name="random_seed" type="integer" value="" label="Random seed" optional="true"
          help="Random seed to be used for sequence selections."/>
      </when>
    </conditional>


    <!-- RESUME RUN OPTIONS -->
    <!-- ================== -->
    <!-- Inputs needed to continue a previous run. When enabled, the outputs
         section additionally produces trimmed log and tree files and no longer
         produces the formatted BEAST config file. -->
    <conditional name="resume">
      <param name="selector" type="select" label="Resume from a previous run?"
        help="If you ran BEAST but found it didn't run long enough, you can resume your run from where you left off.
        Note you will need to specify some of the previous run's outputs here and in the 'BEAST file specification' above,
        and the only other parameter that will have any effect on the run is the number of samples.
        Note also this probably isn't worth doing if the previous run didn't take very long.">
        <option value="false" selected="true">No thanks</option>
        <option value="true">Yes please</option>
      </param>
      <when value="false">
        <!--Do nothing-->
      </when>
      <when value="true">
        <!-- The formats below match this tool's own full (untrimmed) outputs, so
             the trimmed files, which carry different formats, cannot be selected here. -->
        <param name="logfile" type="data" format="full_logfile" label="Logfile from last run"
          help="Cannot be the 'trimmed' output from previous run, but must be the full output."/>
        <param name="treefile" type="data" format="full_treefile" label="Treefile from last run"
          help="As with the Logfile, cannot be the 'trimmed' output."/>
        <param name="statefile" type="data" format="statefile" label="State file from previous run"/>
        <!-- Number of samples kept in the trimmed outputs; min="1" rejects zero
             and negative values. -->
        <param name="samples" type="integer" value="10000" min="1" label="Number of samples to keep"
          help="With resume runs, the log and tree files can grow quite large, bogging down PACT and other analysis tools.
          Therefore, after a resume run, 'trimmed' output files are created with this number of samples."/>
      </when>
    </conditional>

  </inputs>

  <outputs>
    <!-- Produced by every run. The log and tree outputs use the "full_*" formats
         that the resume inputs above require. -->
    <data name="logfile" format="full_logfile" label="BEAST logfile"/>
    <data name="treefile" format="full_treefile" label="BEAST treefile"/>
    <data name="statefile" format="statefile" label="BEAST state file"/>
    <data name="ess" format="html" label="BEAST effective sample size stats"/>

    <!-- Only for non-resume runs. On a resume run the config template points
         FORMATTED_BEASTFILE at a plain file name instead of a dataset. -->
    <data name="formatted_beastfile" format="beastfile" label="BEAST config file">
      <filter>resume['selector'] != "true"</filter>
    </data>
    <!-- Only when a downsampling method other than "none" is selected. -->
    <data name="downsampled_alignment" format="fasta" label="Downsampled alignment">
      <filter>downsampling['method'] != "none"</filter>
    </data>
    <!-- Additionally requires that the metadata came from a CSV file. -->
    <data name="downsampled_metadata" format="csv" label="Downsampled metadata">
      <filter>downsampling['method'] != "none" and metadata['specification'] == "file"</filter>
    </data>
    <!-- Only for resume runs: the trimmed copies of the log and tree files. -->
    <data name="trimmed_logfile" format="trimmed_logfile" label="Trimmed BEAST logfile">
      <filter>resume['selector'] == "true"</filter>
    </data>
    <data name="trimmed_treefile" format="trimmed_treefile" label="Trimmed BEAST treefile">
      <filter>resume['selector'] == "true"</filter>
    </data>
  </outputs>

  <configfiles>
    <!-- XXX Not sure if these deme specifications will actually work or not ??? -->
    <!-- Cheetah template rendered into a file of NAME="value" assignments. The
         path of the rendered file is the single argument passed to beast.sh.
         NOTE(review): presumably sourced by beast.sh as shell; confirm in the script.
         Optional fields are compared against both "None" and "" throughout, so a
         blank form field is handled the same way as an unset one. -->
    <configfile name="config">

## Specification of alignment
#if $alignment.specification == "file"
ALIGNMENT="${alignment.file}"
#else
ALIGNMENT=""
#end if


METADATA_SPECIFICATION="${metadata.specification}"
#if $metadata.specification == "file"
METADATA_FILE="${metadata.file}"
DEME_COLUMN="${metadata.deme_column}"
#if str($metadata.date_column) not in ["None", ""]
DATE_COLUMN="${metadata.date_column}"
#else
DATE_COLUMN=""
#end if
#elif $metadata.specification == "regex"
## Regexes are single-quoted; the param sanitizers map a literal single quote to __sq__.
DEME_REGEX='${metadata.deme_regex}'
#if str($metadata.date_regex) not in ["None", ""]
DATE_REGEX='${metadata.date_regex}'
#else
DATE_REGEX=''
#end if
#else
METADATA_FILE=""
DEME_REGEX=""
DEME_COLUMN=""
DATE_COLUMN=""
DATE_REGEX=""
#end if

SAMPLES_FLAG="-s ${samples}"
SAMPLING_INTERVAL_FLAG="-i ${sampling_interval}"
#if str($random_seed) not in ["None", ""]
RANDOM_SEED_FLAG="-seed ${random_seed}"
#else
RANDOM_SEED_FLAG=""
#end if

BEASTFILE_SPECIFICATION="${beastfile.specification}"
#if $beastfile.specification == "custom"
BEASTFILE_TEMPLATE="${beastfile.template}"
#end if

RESUME_SELECTOR="${resume.selector}"
#if $resume.selector == "true"
RESUME_LOGFILE="${resume.logfile}"
RESUME_TREEFILE="${resume.treefile}"
RESUME_STATEFILE="${resume.statefile}"
RESUME_SAMPLES="${resume.samples}"
#end if

DOWNSAMPLING_METHOD="${downsampling.method}"
#if $downsampling.method != "none"
DOWNSAMPLING_K="${downsampling.k}"
## Emit a seed only when one was actually entered: a blank optional field is
## treated like an unset one, the same test used for RANDOM_SEED_FLAG above.
#if str($downsampling.random_seed) not in ["None", ""]
DOWNSAMPLING_RANDOM_SEED="${downsampling.random_seed}"
#end if
#end if


LOGFILE="${logfile}"
TREEFILE="${treefile}"
ESS="${ess}"

## The formatted_beastfile output is filtered out on resume runs, so a plain
## file name is used in that case.
#if $resume.selector != "true"
FORMATTED_BEASTFILE="${formatted_beastfile}"
#else
FORMATTED_BEASTFILE="formatted_beastfile.xml"
#end if
STATEFILE="${statefile}"

#if $downsampling.method != "none"
DOWNSAMPLED_ALIGNMENT="${downsampled_alignment}"
#if $metadata.specification == "file"
DOWNSAMPLED_METADATA="${downsampled_metadata}"
#end if
#end if

#if $resume.selector == "true"
TRIMMED_LOGFILE="${trimmed_logfile}"
TRIMMED_TREEFILE="${trimmed_treefile}"
#end if

    </configfile>
  </configfiles>

  <!-- The contents of the help tag is parsed as reStructuredText. Please see
       help-template.rst for examples of commonly-used sections in other Galaxy
       tools. -->
  <help>

.. class:: infomark


About this tool
---------------

This tool is your starting point for running an Argo Navis analysis.
It runs BEAST for you, and produces a treefile suitable for running through the Argo Navis PACT tool, which facilitates
visualization and analyses of your data.


Input files
-----------

To run this tool, you'll need some sequence data, and a way to specify which sequences belong to which demes.
The recommended way of doing this is to specify a FASTA file of aligned sequences, and a separate metadata CSV file with
deme specifications.
This metadata file may also be used to specify temporal information for your sequences.
While the column names pointing to the deme and date information are configurable, there must be a "sequence" column
corresponding to the names of the sequences in the FASTA file.
Additionally, there must be no sequences represented in one file not represented in the other.

It is also possible to specify deme and date information directly in the FASTA file sequence names using regular expressions.
To do this, select "From sequence names" from the "Metadata specification" dropdown.
This will open up options for deme and date regular expressions which will be used to extract the information from
the sequence names.
The default behaviour is to parse the sequence names as "|" separated values like this: "name|deme|date".
If you'd like help customizing these regular expressions, please see the `regex tutorial`_ and `rubular`_.

For more custom analyses, you can also specify your own BEAST config file.
This lets you customize model details, priors, and parameters.
If you wish to do this, take a look at the **Customizing your BEAST config file** section below.


Output files
------------

The following files are created as the result of an Argo Navis BEAST run:

**BEAST logfile**: A standard BEAST logfile of sampled parameters, suitable for analysis with Tracer.

**BEAST treefile**: A nexus file of trees from the posterior, with ancestral states labeled, suitable for
analysis with PACT.

**BEAST effective sample size stats**: Statistics about whether BEAST has run long enough.
You can click on the eye icon next to this file to see this data in-browser.

Once you've carried out a run, you should always check the ESS statistics before doing anything else.
This file will contain a "Recommendation", indicating whether the data is ready to use, or whether you
should run BEAST longer, and if so, how much longer.

While the ESS statistic is an easy way to get a sense for whether you should run BEAST more or not, it's also
a good idea to manually review the logfile using `Tracer`_, before accepting the results as final and
passing them along through the rest of the analysis.
Please see the tutorial in the Introduction for a description of how to do this.


Resume runs
-----------

If you run BEAST and realize it needs to run longer, you can save time by doing a "Resume" run.
This lets you pick up from where you left off, keeping you from having to start over.

Note that in addition to the output files listed above, an initial run of BEAST will also produce the following:

**BEAST config file**: A copy of the BEAST file created for the run.
This has been modified to include sequence, deme, and date information, as well as various other run settings.

**BEAST state file**: This is a file that BEAST maintains as it runs so Resume runs can pick up where they left off.

In order to start a Resume run, start by entering the number of new samples you'd like to add to the data you've already collected.
Next, in the "BEAST file specification" input, select "Custom or resume".
This will open up a file input from which you can select the BEAST config file from the previous run.
Next, select "Yes please" in the "Resume from a previous run?" form input.
This will open up several new input fields where you can specify the logfile, treefile and statefile from your previous run.

You can also specify a total number of samples you want to keep.
When running a Resume run, your log and tree files can grow large enough to bog down analysis tools.
This problem is dealt with by evenly trimming these files to the specified number of samples during Resume runs.
It's important to note, however, that if you do further Resume runs, you must make sure to specify the **untrimmed** log and
tree files for the following runs.
If you don't, bad things will happen (someone will kill a puppy; seriously).

It's also important to note that Resume runs ignore all inputs not explicitly mentioned above.
In particular, you can't change the sequences, the date information, deme specifications or sampling interval for a Resume run.
If you *do* want to change any of this, you must start a new run.


Random seeds
------------

Both the actual BEAST run and the downsampling methods support specification of random seeds.
Taking advantage of this aids in reproducibility of your analyses, as someone else running with the same data, settings and
random seeds should be able to reproduce the results exactly.
If you choose not to specify your own random seeds, random seeds will be chosen for you and printed out in the logs, so
if you wish to go back and find these values, you can.


Downsampling
------------

Differences in sampling between demes you study can be a challenge when doing these analyses.
How do you know that one deme having been sampled more deeply isn't artificially biasing the results towards increased
diversity for that deme?
It's also frequently the case that researchers studying viruses don't submit every viral sequence they obtain, but decide
which look "different enough" from the others (and from those already observed) to submit to online repositories.
And typically, the strategy used for deciding which sequences to submit and which not to is not made clear in publications.
As such, treating sequences found in these repositories as an unbiased sample is problematic, and has even greater
potential for introducing diversity bias into your results.

Issues such as these frequently come up in reviewer critiques.
One way to deal with this is to show evidence that the sampling issues aren't significantly affecting the results,
or at the very least, to explore the effects of different sampling strategies on the results.
Argo Navis provides downsampling tools which help you towards this end.

There are two downsampling methods you can use: Random and K-means.
Random downsampling is meant to deal with issues of only sampling depth, while K-means downsampling is meant to
address issues of sampling bias.
While Random downsampling is likely fairly straight-forward, K-means downsampling may not be.

K-means downsampling starts by clustering sequences together into K clusters.
From each of these clusters a single sequence is chosen to represent the entire cluster.
This is done for each deme.
This method introduces intentional, but controlled sampling bias, as for any given number of K samples per
deme, the sequences chosen are going to reflect the *most* diversity possible among those sequences.
While this doesn't *solve* the problem of sampling bias, it does attempt to put all demes on equal footing, by
introducing similar diversity bias in all sequences.

A decent strategy for protecting yourself from reviewers is to use *both* of these methods at various K values,
and compare the results.
Assuming the results look similar in all cases, then great!
It would appear you have a very robust set of conclusions.
If there are differences, there will be a bit more work involved in explaining them, but at least you'll be able to say
you've thoroughly explored the issue.

On a final note, if you *do* decide to use BEAST's downsampling functionality, you will obtain a couple of additional
outputs:

**Downsampled alignment**: The sequences kept in the downsampling process, in FASTA format.

**Downsampled metadata**: If you supplied metadata for the analysis, you will also get a copy of that file containing
metadata for only the sequences kept in the downsampling procedure.

These are not required for any other part of Argo Navis, but are offered for your convenience so you can see which
sequences were chosen and use the same data subsets in other analyses you might perform outside of Argo Navis.


Customizing your BEAST config file
----------------------------------

Customizing your BEAST config file opens up the full power of BEAST, letting you specify model details, priors, and
other parameters.
If you wish to do this, it's recommended you start by working from this tool's `default BEAST config file`_.
From there, you can specify customizations using the BEAUti program, which is installed alongside BEAST2 (see the
`BEAST2 homepage`_ for downloads).

Before you can load the config file, you'll have to install the BEAST Classic module from within BEAUti.
Simply go to File > Manage Packages, click on "BEAST_CLASSIC", then click "Install".
You can now go to File > Load to select the default BEAST config file.
Once you've made your edits, save your new config file by going to File > Save As.
You can then upload the file into Galaxy by going to Get Data > Upload File within the Galaxy Tool menu, making sure to
specify "beastfile" as the file type.

Once you have your file loaded, select "Custom or resume" from "BEAST file specification", and point the file selector that pops
up to the new config file you just uploaded.
If you wish, you can also tell Argo Navis to use the alignment and deme data in the BEAST config file by selecting the
appropriate options in the "Alignment specification" and "Metadata specification" dropdowns.
If you don't do this, whatever files/options are specified for Sequence alignment and Metadata specification will replace the
data specified in your BEAST config file.

Some notes to keep in mind if you specify your own BEAST config:

1. If you choose to specify the deme/community information directly in your BEAST file and not apply the CSV or regular
   expression data to set this, you must name your discrete trait "deme", even if it refers to something else, like
   species name or tissue type.
2. Note that the chain length and sampling interval settings in your BEAST config file will get overridden by whatever is
   specified here in this tool, and can not be assumed to remain as specified in your BEAST config file.
3. If you don't have tip dates, you can speed up your run by specifying a custom beast file where your fixed clock's rate isn't
   estimated but left constant. If you don't have any temporal data, these estimates won't mean anything anyway.

Note that the default template closely follows the setup in the `Ancestral State Reconstruction tutorial`_ by Remco Bouckaert,
which you can use as a guide for setting up your own custom BEAST analysis.
The `BEAST homepage`_ is also a useful resource if you're looking for help setting things up.


.. _BEAST2 homepage: http://www.beast2.org/
.. _default BEAST config file: http://xxx.doesntexist.yet
.. _regex tutorial: http://regexone.com/lesson/0
.. _rubular: http://rubular.com
.. _Tracer: http://tree.bio.ed.ac.uk/software/tracer/
.. _Ancestral State Reconstruction tutorial: http://beast-classic.googlecode.com/files/ARv2.0.1.pdf
.. _BEAST homepage: http://beast.bio.ed.ac.uk/


  </help>
</tool>
author	bcclaywell
date	Mon, 12 Oct 2015 17:57:38 -0400
parents	d67268158946
children