Mercurial > repos > bcclaywell > argo_navis
diff beast.xml @ 0:d67268158946 draft
planemo upload commit a3f181f5f126803c654b3a66dd4e83a48f7e203b
| author | bcclaywell |
|---|---|
| date | Mon, 12 Oct 2015 17:43:33 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/beast.xml Mon Oct 12 17:43:33 2015 -0400 @@ -0,0 +1,486 @@ +<tool id="ARGO_beast_run" name="Run BEAST" version="1.0.0"> + <description>Perform a discrete trait ancestral reconstruction analysis using BEAST</description> + <requirements> + <requirement type="package" version="0.1">argo_env</requirement> + </requirements> + <macros> + <import>macros.xml</import> + </macros> + <command interpreter="bash"> + beast.sh ${config} + </command> + <stdio> + <expand macro="basic_errors"/> + </stdio> + <inputs> + + + <!-- BASIC DATA SPECIFICATION --> + <!-- ======================== --> + <conditional name="alignment"> + <param name="specification" type="select" label="Alignment specification" + help="You can either specify your own alignment data, or use the alignment data in the BEAST config file below."> + <option value="file" selected="true">Alignment file</option> + <option value="in_beastfile">Use alignment in BEAST config file</option> + </param> + <when value="file"> + <param name="file" type="data" format="fasta" label="Sequence alignment" + help="This is optional if you want to use the alignment in the BEAST config file."/> + </when> + <when value="in_beastfile"/> + </conditional> + + <conditional name="metadata"> + <param name="specification" type="select" label="Metadata specification" + help="Argo Navis needs to know how sequences split into demes, and whether there is date information. + In this menu, you can specify whether you want to use a CSV metadata file, or regular expressions for extracting this data from sequence names."> + <option value="file" selected="true">Metadata file</option> + <option value="regex">From sequence names</option> + <option value="in_beastfile">Use mappings in BEAST config file</option> + </param> + <when value="file"> + <param name="file" type="data" format="csv" label="Metadata file" + help="CSV file specifying deme and (optionally) date data."/> + <param name="deme_column" type="text" label="Deme column" value="deme" + help="Column in CSV file to treat as the deme specification"/> + <param name="date_column" type="text" label="Date column" value="" optional="true" + help="Column in CSV file to treat as the date data. If left blank, all tips are set to time 0 by BEAST."/> + </when> + <!--XXX - Have to tune these regular expressions--> + <when value="regex"> + <param name="deme_regex" type="text" value="^[^\s\|]+\|([^\|\s]+)" label="Deme name regular expression" + help="Defaults to second value in a 2-tuple or triple of '|' separated values. + For help working with regular expressions, see rubular.com. + Use () to capture the part of the match you want to extract as the deme name."> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + <param name="date_regex" type="text" value="^[^\s\|]+\|(?:[^\|\s]+)\|([\d]+\.?[\d]*)" + label="Date regular expression" optional="true" + help="This regular expression should extract an integer value, which can be interpreted as a time value. + Whether it is years, months, days (etc.) is up to you."> + <sanitizer> + <valid initial="string.printable"> + <remove value="'"/> + </valid> + <mapping initial="none"> + <add source="'" target="__sq__"/> + </mapping> + </sanitizer> + </param> + </when> + <when value="in_beastfile"> + <!--Nothing--> + </when> + </conditional> + + <param name="samples" type="integer" value="10000" label="Number of samples" + help="Number of samples to take."/> + <!--XXX Will have to check to make sure logging frequency is the same between -resume runs--> + <param name="sampling_interval" type="integer" value="1000" label="Sampling interval" + help="Number of states explored in chain between samples."/> + <param name="random_seed" type="integer" value="" label="Random seed" optional="true" + help="Random seed to be used in BEAST MCMC. Specifying and recording this value can aid in reproducibility."/> + + + <!-- MANUAL BEASTFILE SPECIFICATION --> + <!-- ============================== --> + <conditional name="beastfile"> + <param name="specification" type="select" label="BEAST file specification" + help="If you'd like to run a more custom analysis, you can specify your own BEAST file (see detailed help below). + Additionally, if you are performing a resume run, you will need to specify the BEAST file output by the previous run."> + <option value="default" selected="true">Default</option> + <option value="custom">Custom or resume</option> + </param> + <when value="default"/> + <when value="custom"> + <param name="template" type="data" format="beastfile" label="BEAST config file" + help="Note that any alignment or deme/date data specified above will override whatever is present in this file. + However, having this data specified in a custom file means you can forgo the inputs above."/> + </when> + </conditional> + + + <!-- HERE THERE BE DOWNSAMPLING OPTIONS --> + <!-- ================================== --> + <conditional name="downsampling"> + <param name="method" type="select" display="radio" label="Downsampling method" + help="Downsampling sets a maximum number of sequences per deme. + This can help you investigate issues of sampling bias and sampling depth. + Random downsampling is recommended for concerns relating to sampling depth, while K-means is recommended for + addressing sampling bias. + Please see the help below for a more detailed information."> + <option value="none" selected="true">None</option> + <option value="random">Random</option> + <option value="kmeans">K-means</option> + </param> + <when value="none"> + <!--Do nothing--> + </when> + <when value="random"> + <param name="k" type="integer" value="" label="N sequences" + help="Number of sequences to be taken per deme."/> + <param name="random_seed" type="integer" value="" label="Random seed" optional="true" + help="Random seed to be used for sequence selections."/> + </when> + <when value="kmeans"> + <param name="k" type="integer" value="" label="N sequences" + help="Number of sequences to be taken per deme."/> + <param name="random_seed" type="integer" value="" label="Random seed" optional="true" + help="Random seed to be used for sequence selections."/> + </when> + </conditional> + + + <!-- RESUME RUN OPTIONS --> + <!-- ================== --> + <conditional name="resume"> + <param name="selector" type="select" label="Resume from a previous run?" + help="If you ran BEAST but found it didn't run long enough, you can resume your run from where you left off. + Note you will need to specify some of the previous run's outputs here and in the 'BEAST file specification' above, + and the only other parameters that will have any effect on the run will be the number of samples. + Note also this probably isn't worth doing if the previous run didn't take very long."> + <option value="false" selected="true">No thanks</option> + <option value="true">Yes please</option> + </param> + <when value="false"> + <!--Do nothing--> + </when> + <when value="true"> + <param name="logfile" type="data" format="full_logfile" label="Logfile from last run" + help="Cannot be the 'trimmed' output from previous run, but must be the full output."/> + <param name="treefile" type="data" format="full_treefile" label="Treefile from last run" + help="As with the Logfile, cannot be the `trimmed` output."/> + <param name="statefile" type="data" format="statefile" label="State file from previous run."/> + <param name="samples" type="integer" value="10000" label="Number of samples to keep" + help="With resume runs, the log and tree files can grow quite large, bogging down PACT and other analysis tools. + Therefore, after a resume run, 'trimmed' output files are created with this number of samples."/> + </when> + </conditional> + + </inputs> + + <outputs> + <data name="logfile" format="full_logfile" label="BEAST logfile"/> + <data name="treefile" format="full_treefile" label="BEAST treefile"/> + <data name="statefile" format="statefile" label="BEAST state file"/> + <data name="ess" format="html" label="BEAST effective sample size stats"/> + + <data name="formatted_beastfile" format="beastfile" label="BEAST config file"> + <filter>resume['selector'] != "true"</filter> + </data> + <data name="downsampled_alignment" format="fasta" label="Downsampled alignment"> + <filter>downsampling['method'] != "none"</filter> + </data> + <data name="downsampled_metadata" format="csv" label="Downsampled metadata"> + <filter>downsampling['method'] != "none" and metadata['specification'] == "file"</filter> + </data> + <data name="trimmed_logfile" format="trimmed_logfile" label="Trimmed BEAST logfile"> + <filter>resume['selector'] == "true"</filter> + </data> + <data name="trimmed_treefile" format="trimmed_treefile" label="Trimmed BEAST treefile"> + <filter>resume['selector'] == "true"</filter> + </data> + </outputs> + + <configfiles> + <!-- XXX Not sure if these deme specifications will actually work or not ??? --> + <configfile name="config"> + +## Specification of alignment +#if $alignment.specification == "file" +ALIGNMENT="${alignment.file}" +#else +ALIGNMENT="" +#end if + + +METADATA_SPECIFICATION="${metadata.specification}" +#if $metadata.specification == "file" +METADATA_FILE="${metadata.file}" +DEME_COLUMN="${metadata.deme_column}" +#if str($metadata.date_column) not in ["None", ""] +DATE_COLUMN="${metadata.date_column}" +#else +DATE_COLUMN="" +#end if +#elif $metadata.specification == "regex" +DEME_REGEX='${metadata.deme_regex}' +#if str($metadata.date_regex) != 'None' +DATE_REGEX='${metadata.date_regex}' +#else +DATE_REGEX='' +#end if +#else +METADATA_FILE="" +DEME_REGEX="" +DEME_COLUMN="" +DATE_COLUMN="" +DATE_REGEX="" +#end if + +SAMPLES_FLAG="-s ${samples}" +SAMPLING_INTERVAL_FLAG="-i ${sampling_interval}" +#if str($random_seed) not in ["None", ""] +RANDOM_SEED_FLAG="-seed ${random_seed}" +#else +RANDOM_SEED_FLAG="" +#end if + +BEASTFILE_SPECIFICATION="${beastfile.specification}" +#if $beastfile.specification == "custom" +BEASTFILE_TEMPLATE="${beastfile.template}" +#end if + +RESUME_SELECTOR="${resume.selector}" +#if $resume.selector == "true" +RESUME_LOGFILE="${resume.logfile}" +RESUME_TREEFILE="${resume.treefile}" +RESUME_STATEFILE="${resume.statefile}" +RESUME_SAMPLES="${resume.samples}" +#end if + +DOWNSAMPLING_METHOD="${downsampling.method}" +#if $downsampling.method != "none" +DOWNSAMPLING_K="${downsampling.k}" +#if str($downsampling.random_seed) != "None" +DOWNSAMPLING_RANDOM_SEED="${downsampling.random_seed}" +#end if +#end if + + + +LOGFILE="${logfile}" +TREEFILE="${treefile}" +ESS="${ess}" + +#if $resume.selector != "true" +FORMATTED_BEASTFILE="${formatted_beastfile}" +#else +FORMATTED_BEASTFILE="formatted_beastfile.xml" +#end if +STATEFILE="${statefile}" + +#if $downsampling.method != "none" +DOWNSAMPLED_ALIGNMENT="${downsampled_alignment}" +#if $metadata.specification == "file" +DOWNSAMPLED_METADATA="${downsampled_metadata}" +#end if +#end if + +#if $resume.selector == "true" +TRIMMED_LOGFILE="${trimmed_logfile}" +TRIMMED_TREEFILE="${trimmed_treefile}" +#end if + + </configfile> + </configfiles> + + <!-- The contents of the help tag is parsed as reStructuredText. Please see + help-template.rst for examples of commonly-used sections in other Galaxy + tools. --> + <help> + +.. class:: infomark + + +About this tool +--------------- + +This tool is your starting point for running an Argo Navis analysis. +It runs BEAST for you, and produces a treefile suitable for running through the Argo Navis PACT tool, which facilitates +visualization and analyses of your data. + + +Input files +----------- + +To run this tool, you'll need some sequence data, and a way to specify which sequences belong to which demes. +The recommended way of doing this is to specify a FASTA file of aligned sequences, and a separate metadata CSV file with +deme specifications. +This metadata file may also be used to specify temporal information for your sequences. +While the column names pointing to the deme and date information are configurable, there must be a "sequence" column +corresponding to the names of the sequences in the FASTA file. +Additionally, there must be no sequences represented in one file not represented in the other. + +It is also possible to specify deme and date information directly in the FASTA file sequence names using regular expressions. +To do this, select "From sequence names" from the "Metadata specification" dropdown. +This will open up options for deme and date regular expressions which will be used to extract the information from +the sequence names. +The default behaviour is to parse the sequence names as "|" separated values like this: "name|deme|date". +If you'd like help customizing these regular expressions, please see the `regex tutorial`_ and `rubular`_. + +For more custom analyses, you can also specify your own BEAST config file. +This lets you customize model details, priors, and parameters. +If you wish to do this, take a look at the **Customizing your BEAST config file** section below. + + +Output files +------------ + +The following files are created as the result of an Argo Navis BEAST run: + +**BEAST logfile**: A standard BEAST logfile of sampled parameters, suitable for analysis with Tracer. + +**BEAST treefile**: A nexus file of trees from the posterior, with ancestral states labeled, suitable for +analysis with PACT. + +**BEAST effective sample size stats**: Statistics about whether BEAST has run long enough. +You can click on the eye icon next to this file to see this data in-browser. + +Once you've carried out a run, you should always check the ESS statistics before doing anything else. +This file will contain a "Recommendation", indicating whether the data is ready to use, or whether you +should run BEAST longer, and if so, how much longer. + +While the ESS statistic is an easy way to get a sense for whether your should run BEAST more or not, it's also +a good idea to manually review the logfile using `Tracer`_, before accepting the results as final and +passing them along through the rest of the analysis. +Please see the tutorial in the Introduction for a description of how to do this. + + +Resume runs +----------- + +If you run BEAST and realize it needs to run longer, you can save time by doing a "Resume" run. +This let's you pick up from where you left off, keeping you from having to start over. + +Note that in addition to the output files listed above, an initial run of BEAST will also produce the following: + +**BEAST config file**: A copy of the BEAST file created for the run. +This has been modified to include sequence, deme, and date information, as well as various other run settings. + +**BEAST state file**: This is a file that BEAST maintains as it runs so Resume runs can pick up where they left off. + +In order to start a Resume run, start by entering the number of new samples you'd like to add to the data you've already collected. +Next, in the "BEAST file specification" input, select "Custom or Resume". +This will open up a file input from which you can select the BEAST config file from the previous run. +Next, select "Yes" in the "Resume from a previous run?" form input. +This will open up several new input fields where you can specify the logfile, treefile and statefile from your previous run. + +You can also specify a total number of samples you want to keep. +When running a Resume run, your log and tree files can grow large enough to bog down analysis tools. +This problem is dealt with by evenly trimming these files to the specified number of samples during Resume runs. +It's important to note, however, that if you do further Resume runs, you must make sure to specify the **untrimmed** log and +tree files for the following runs. +If you don't, bad things will happen (someone will kill a puppy; seriously). + +It's also important to note that Resume runs ignore all inputs not explicitly mentioned above. +In particular, you can't change the sequences, the date information, deme specifications or sampling interval for a Resume run. +If you *do* want to change any of this, you must start a new run. + + +Random seeds +------------ + +Both the actual BEAST run and the downsampling methods support specification of random seeds. +Taking advantage of this aids in reproducibility of your analyses, as someone else running with the same data, settings and +random seeds should be able to reproduce the results exactly. +If you choose not to specify your own random seeds, random seeds will be chosen for you and printed out in the logs, so +if you wish to go back and find these values, you can. + + +Downsampling +------------ + +Differences in sampling between demes you study can be a challenge when doing these analyses. +How do you know that one deme having been sampled more deeply isn't artificially biasing the results towards increased +diversity for that deme? +It's also frequently the case that researchers studying viruses don't submit every viral sequence they obtain, but decide +which look "different enough" from the others (and from those already observed) to submit to online repositories. +And typically, the strategy used for deciding which sequences to submit and which not to is not made clear in publications. +As such, treating sequences found in these repositories as an unbiased sample is problematic, and has even greater +potential for introducing diversity bias into your results. + +Issues such as these frequently come up in reviewer critiques. +One way to deal with this is to show evidence that the sampling issues aren't significantly affecting the results, +or at the very least, to explore the effects of different sampling strategies on the results. +Argo Navis provides downsampling tools which help you towards this end. + +There are two downsampling methods you can use: Random and K-means. +Random downsampling is meant to deal with issues of only sampling depth, while K-means downsampling is meant to +address issues of sampling bias. +While Random downsampling is likely fairly straight-forward, K-means downsampling may not be. + +K-means downsampling starts by clustering sequences together into K clusters. +From each of these clusters a single sequence is chosen to represent the entire cluster. +This is done for each deme. +This method introduces intentional, but controlled sampling bias, as for any given number of K samples per +deme, the sequences chosen are going to reflect the *most* diversity possible among those sequences. +While this doesn't *solve* the problem of sampling bias, it does attempt to put all demes on equal footing, by +introducing similar diversity bias in all sequences. + +A decent strategy for protecting yourself from reviewers is to use *both* of these methods at various K values, +and compare the results. +Assuming the results look similar in all cases, then great! +It would appear you have a very robust set of conclusions. +If there are differences, there will be a bit more work involved in explaining them, but at least you'll be able to say +you've thoroughly explored the issue. + +On a final note, if you *do* decide to use BEAST's downsampling functionality, you will obtain a couple of additional +outputs: + +**Downsampled alignment**: The sequences kept in the downsampling process, in FASTA format. + +**Downsampled metadata**: If you supplied metadata for the analysis, you will also get a copy of that file containing +metadata for only the sequences kept in the downsampling procedure. + +These are not required for any other part of Argo Navis, but are offered for your convenience so you can see which +sequences were chosen and use the same data subsets in other analyses you might perform outside of Argo Navis. + + +Customizing your BEAST config file +---------------------------------- + +Customizing your BEAST config file opens up the full power of BEAST, letting you specify model details, priors, and +other parameters. +If you wish to do this, it's recommended you start by working from this tool's `default BEAST config file`_. +From there, you can specify customizations using the BEAUti program, which is installed alongside BEAST2 (see the +`BEAST2 homepage`_ for downloads). + +Before you can load the config file, you'll have to install the BEAST Classic module from within BEAUti. +Simply go to File > Manage Packages, click on "BEAST_CLASSIC", then click "Install". +You can now go to File > Load to select the default BEAST config file. +Once you've made your edits, save your new config file by going to File > Save As. +You can then upload the file into Galaxy by going to Get Data > Upload File within the Galaxy Tool menu, making sure to +specify "beastfile" as the file type. + +Once you have your file loaded, select "Custom or resume" from "BEAST file specification", and point the file selector that pops +up to the new config file you just uploaded. +If you wish, you can also tell Argo Navis to use the alignment and deme data in the BEAST config file by selecting the +appropriate options in the "Alignment specification" and "Metadata specification" dropdowns. +If you don't do this, whatever files/options are specified for Sequence alignment and Metadata specification will replace the +data specified in your BEAST config file. + +Some notes to keep in mind if you specify your own BEAST config: + +1. If you choose to specify the deme/community information directly in the your BEAST file and not apply the CSV or regular + expression data to set this, you must name your discrete trait "deme", even if it refers to something else, like + species name or tissue type. +2. Note that the chain length and sampling interval settings in your BEAST config file will get overridden by whatever is + specified here in this tool, and can not be assumed to remain as specified in your BEAST config file. +3. If you don't have tip dates, you can speed up your run by specifying a custom beast file where your fixed clock's rate isn't + estimated but left constant. If you don't have any temporal data, these estimates won't mean anything anyway. + +Note that the default template closely follows the setup in the `Ancestral State Reconstruction tutorial`_ by Remco Bouckaert, +which you can use as a guide for setting up your own custom BEAST analysis. +The `BEAST homepage`_ is also a useful resource if you're looking for help settings things up. + + + +.. _BEAST2 homepage: http://www.beast2.org/ +.. _default BEAST config file: http://xxx.doesntexist.yet +.. _regex tutorial: http://regexone.com/lesson/0 +.. _rubular: http:rubular.com +.. _Tracer: http://tree.bio.ed.ac.uk/software/tracer/ +.. _Ancestral State Reconstruction tutorial: http://beast-classic.googlecode.com/files/ARv2.0.1.pdf +.. _BEAST homepage: http://beast.bio.ed.ac.uk/ + + + </help> +</tool>
