Mercurial > repos > iuc > hyphy_bgm
comparison hyphy_bgm.xml @ 36:e45ec7931bfd draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hyphy/ commit d97b1b98a3a621c93a7ed9e7db16bda47eefcb92
| author | iuc |
|---|---|
| date | Tue, 07 Oct 2025 20:38:49 +0000 |
| parents | d162a1f8e495 |
| children |
comparison
equal
deleted
inserted
replaced
| 35:d162a1f8e495 | 36:e45ec7931bfd |
|---|---|
| 5 </macros> | 5 </macros> |
| 6 <expand macro="bio_tools"/> | 6 <expand macro="bio_tools"/> |
| 7 <expand macro="requirements"/> | 7 <expand macro="requirements"/> |
| 8 <command detect_errors="exit_code"><![CDATA[ | 8 <command detect_errors="exit_code"><![CDATA[ |
| 9 @SYMLINK_FILES@ | 9 @SYMLINK_FILES@ |
| 10 ln -s '$bgm_output' ${input_file}.BGM.json && | 10 ENV="TOLERATE_NUMERICAL_ERRORS=1;" @HYPHYMP@ bgm |
| 11 hyphy bgm | |
| 12 --alignment $input_file | 11 --alignment $input_file |
| 13 @INPUT_TREE@ | 12 @INPUT_TREE@ |
| 14 --run_type $datatype.value | 13 --type $datatype.value |
| 15 #if $datatype.value == "codon": | 14 #if $datatype.value == "codon": |
| 16 --code '$datatype.gencodeid' | 15 --code '$datatype.gencodeid' |
| 17 #end if | 16 #end if |
| 18 #if $datatype.value == "protein": | 17 #if $datatype.value == "amino-acid": |
| 19 --baseline_model '$datatype.baseline_model' | 18 --baseline_model '$datatype.baseline_model' |
| 20 #end if | 19 #end if |
| 21 @branch_options@ | 20 @branch_options@ |
| 22 --chain '$chain_length' | 21 --steps '$chain_length' |
| 23 --burn_in '$burn_in' | 22 --burn-in '$burn_in' |
| 24 --samples '$samples' | 23 --samples '$samples' |
| 25 --parents '$parents' | 24 --max-parents '$parents' |
| 26 --min_subs '$min_subs' | 25 --min-subs '$min_subs' |
| 26 --output '$bgm_output' | |
| 27 > bgm_stdout.md | |
| 27 @ERRORS@ | 28 @ERRORS@ |
| 28 ]]></command> | 29 ]]></command> |
| 29 <inputs> | 30 <inputs> |
| 30 <expand macro="inputs" /> | 31 <expand macro="inputs" /> |
| 31 <conditional name="datatype"> | 32 <conditional name="datatype"> |
| 41 <when value="codon"> | 42 <when value="codon"> |
| 42 <expand macro="gencode" /> | 43 <expand macro="gencode" /> |
| 43 </when> | 44 </when> |
| 44 </conditional> | 45 </conditional> |
| 45 <expand macro="branches"/> | 46 <expand macro="branches"/> |
| 46 <param argument="--chain" name="chain_length" type="integer" value="100000" min="0" max="1000000000" label="Length of MCMC chain"/> | 47 <section name="advanced_options" title="Advanced Options" expanded="false"> |
| 47 <param argument="--burn_in" type="integer" value="10000" min="0" max="1000000000" label="Number of samples to discard for burn-in"/> | 48 <param argument="--steps" name="chain_length" type="integer" value="100000" min="0" max="1000000000" label="Length of MCMC chain" help="The total number of steps in the MCMC simulation."/> |
| 48 <param argument="--samples" type="integer" value="100" min="0" max="100" label="Number of steps to extract from chain sample"/> | 49 <param argument="--burn-in" type="integer" value="10000" min="0" max="1000000000" label="Number of samples to discard for burn-in" help="The initial portion of the MCMC chain is often discarded to allow the simulation to converge to the posterior distribution."/> |
| 49 <param argument="--max-parents" name="parents" type="integer" value="1" min="1" max="3" label="Maximum number of parents allowed per node" /> | 50 <param argument="--samples" type="integer" value="100" min="0" max="100000" label="Number of steps to extract from chain sample" help="The number of samples to draw from the MCMC chain after the burn-in period."/> |
| 50 <param argument="--min-subs" type="integer" value="1" min="1" max="100000" label="Minimum number of ubstitutions per site to be included in the analysis" /> | 51 <param argument="--max-parents" name="parents" type="integer" value="1" min="1" max="3" label="Maximum number of parents allowed per node" help="This parameter controls the complexity of the graphical model. It sets the maximum number of other sites that can directly influence a given site."/> |
| 52 <param argument="--min-subs" type="integer" value="1" min="1" max="1000" label="Minimum number of substitutions per site to be included in the analysis" help="Sites with very few substitutions provide little information for detecting co-evolution. This parameter allows you to filter out such low-complexity sites from the analysis."/> | |
| 53 </section> | |
| 51 </inputs> | 54 </inputs> |
| 52 <outputs> | 55 <outputs> |
| 53 <data name="bgm_output" format="hyphy_results.json" /> | 56 <data name="bgm_output" format="hyphy_results.json" /> |
| 57 <data name="bgm_md_report" format="markdown" from_work_dir="bgm_stdout.md" label="BGM Report (Markdown) for ${tool.name} on ${on_string}" /> | |
| 54 </outputs> | 58 </outputs> |
| 55 <tests> | 59 <tests> |
| 56 <test> | 60 <test expect_num_outputs="2"> |
| 57 <param name="input_file" ftype="fasta" value="bgm-in1.fa"/> | 61 <param name="input_file" ftype="fasta" value="bgm-in1.fa"/> |
| 58 <param name="input_nhx" ftype="nhx" value="bgm-in1.nhx"/> | 62 <param name="input_nhx" ftype="nhx" value="bgm-in1.nhx"/> |
| 59 <output name="bgm_output" file="bgm-out1.json" compare="sim_size"/> | 63 <conditional name="datatype"> |
| 64 <param name="value" value="codon"/> | |
| 65 <param name="gencodeid" value="Universal"/> | |
| 66 </conditional> | |
| 67 <output name="bgm_output"> | |
| 68 <assert_contents> | |
| 69 <has_text text="Probability that site 2 is conditionally dependent on site 1"/> | |
| 70 <has_text text="analysis"/> | |
| 71 </assert_contents> | |
| 72 </output> | |
| 73 <output name="bgm_md_report"> | |
| 74 <assert_contents> | |
| 75 <has_text text="Analysis Description"/> | |
| 76 <has_text text="BGM analysis summary on 9 sites each with at least 1 substitutions"/> | |
| 77 </assert_contents> | |
| 78 </output> | |
| 79 </test> | |
| 80 <test expect_num_outputs="2"> | |
| 81 <param name="input_file" ftype="fasta" value="fade-in1.fa"/> | |
| 82 <param name="input_nhx" ftype="nhx" value="fade-in1.nhx"/> | |
| 83 <conditional name="datatype"> | |
| 84 <param name="value" value="amino-acid"/> | |
| 85 <param name="baseline_model" value="LG"/> | |
| 86 </conditional> | |
| 87 <output name="bgm_output"> | |
| 88 <assert_contents> | |
| 89 <has_text text='"samples":100'/> | |
| 90 <has_text text="Probability that sites 1 and 2 are not conditionally independent"/> | |
| 91 </assert_contents> | |
| 92 </output> | |
| 93 <output name="bgm_md_report"> | |
| 94 <assert_contents> | |
| 95 <has_text text="pairs of conditionally dependent sites found"/> | |
| 96 <has_text text="## BGM analysis summary on 149 sites each with at least 1 substitutions."/> | |
| 97 </assert_contents> | |
| 98 </output> | |
| 99 </test> | |
| 100 <test expect_num_outputs="2"> | |
| 101 <param name="input_file" ftype="fasta" value="bgm-in1.fa"/> | |
| 102 <param name="input_nhx" ftype="nhx" value="bgm-in1.nhx"/> | |
| 103 <conditional name="datatype"> | |
| 104 <param name="value" value="nucleotide"/> | |
| 105 </conditional> | |
| 106 <output name="bgm_output"> | |
| 107 <assert_contents> | |
| 108 <has_text text='"samples":100'/> | |
| 109 <has_text text="Probability that sites 1 and 2 are not conditionally independent"/> | |
| 110 </assert_contents> | |
| 111 </output> | |
| 112 <output name="bgm_md_report"> | |
| 113 <assert_contents> | |
| 114 <has_text text=">type => nucleotide"/> | |
| 115 <has_text text="BGM analysis summary on 24 sites each with at least 1 substitutions"/> | |
| 116 </assert_contents> | |
| 117 </output> | |
| 60 </test> | 118 </test> |
| 61 </tests> | 119 </tests> |
| 62 <help><![CDATA[ | 120 <help><![CDATA[ |
| 63 | 121 |
| 64 BGM : Bayesian Graphical Models | 122 BGM : Bayesian Graphical Models for Co-evolving Sites |
| 65 =============================== | 123 ===================================================== |
| 66 | 124 |
| 67 What does this do? | 125 **What does this do?** |
| 68 ------------------ | |
| 69 | 126 |
| 70 This tool identifies groups of sites in the alignments that experience substitutions along the same branches, | 127 This tool identifies groups of sites in a sequence alignment that appear to be co-evolving. Co-evolving sites are those that experience substitutions along the same branches of a phylogenetic tree more often than expected by chance. This pattern of correlated substitutions can imply a functional or structural relationship between the sites. For example, a destabilizing mutation at one site might be compensated for by a mutation at another site to preserve the protein's structure or function. |
| 71 i.e. *co-evolve*. | |
| 72 | 128 |
| 73 Brief description | 129 **How does it work?** |
| 74 ----------------- | |
| 75 | 130 |
| 76 BGM (Bayesian Graphical Model) uses a maximum likelihood ancestral state | 131 BGM employs a Bayesian Graphical Model to uncover these dependencies. The core idea is to represent each site in the alignment as a node in a graph. The algorithm then seeks to find the edges (links) between these nodes that represent statistically significant correlations in substitution patterns. |
| 77 reconstruction to map substitution (non-synonymous only for coding data) | |
| 78 events to branches in the phylogeny and then analyzes the joint | |
| 79 distribution of the substitution map using a Bayesian graphical model | |
| 80 (network). Next, a Markov chain Monte Carlo analysis is used to generate | |
| 81 a random sample of network structures from the posterior distribution | |
| 82 given the data. Each node in the network represents a site in the | |
| 83 alignment, and links (edges) between nodes indicate high posterior | |
| 84 support for correlated substitutions at the two sites over time, which | |
| 85 implies coevolution. | |
| 86 | 132 |
| 133 The process involves several steps: | |
| 87 | 134 |
| 88 Input | 135 1. **Ancestral State Reconstruction:** First, the method reconstructs the evolutionary history of the sequences using maximum likelihood. This allows the tool to map substitution events to specific branches of the phylogenetic tree. For coding data, only non-synonymous substitutions (those that change the amino acid) are considered. |
| 89 ----- | |
| 90 | 136 |
| 91 1. A *FASTA* sequence alignment. | 137 2. **Bayesian Graphical Model:** The joint distribution of these substitution maps is then analyzed using a Bayesian graphical model. This model represents the probability of the observed substitution patterns given a particular network of dependencies between sites. |
| 92 2. A phylogenetic tree in the *Newick* format | |
| 93 | 138 |
| 94 Note: the names of sequences in the alignment must match the names of the sequences in the tree. | 139 3. **MCMC Sampling:** To explore the vast space of possible network structures, BGM uses a Markov Chain Monte Carlo (MCMC) analysis. This method generates a random sample of network structures from the posterior distribution, meaning it finds the networks that are most likely given the data. |
| 95 | 140 |
| 96 Output | 141 4. **Identifying Co-evolving Sites:** The links (edges) that appear most frequently in the sampled networks are the ones with the highest posterior support. These links connect the sites that are most likely to be co-evolving. |
| 97 ------ | |
| 98 | 142 |
| 99 A JSON file with analysis results (http://hyphy.org/resources/json-fields.pdf). | 143 **Input** |
| 100 | 144 |
| 101 A custom visualization module for viewing these results is available (see http://vision.hyphy.org/BGM for an example) | 145 * A multiple sequence alignment in FASTA or NEXUS format. |
| 146 * A phylogenetic tree in Newick format. The names of the sequences in the alignment must match the names of the tips in the tree. | |
| 102 | 147 |
| 103 Further reading | 148 **Output** |
| 104 --------------- | |
| 105 | 149 |
| 106 http://hyphy.org/methods/selection-methods/#BGM | 150 * **JSON file:** A JSON file containing the detailed results of the analysis, including the posterior probabilities of the links between sites. (See http://hyphy.org/resources/json-fields.pdf for a description of the fields). |
| 151 * **Markdown report:** A summary report in Markdown format. | |
| 107 | 152 |
| 153 A custom visualization module for viewing BGM results is available at http://vision.hyphy.org/BGM. | |
| 108 | 154 |
| 109 Tool options | 155 **Tool Options** |
| 110 ------------ | |
| 111 :: | |
| 112 | 156 |
| 113 --branches Which branches should be tested for selection? | 157 * **Type of data:** The type of sequence data in the alignment file. |
| 114 All [default] : test all branches | 158 * `nucleotide`: For DNA or RNA sequences. |
| 159 * `amino-acid`: For protein sequences. | |
| 160 * `codon`: For coding DNA sequences. This is the default. | |
| 115 | 161 |
| 116 Internal : test only internal branches (suitable for | 162 * **Genetic code:** If using codon data, the genetic code to use for translation. |
| 117 intra-host pathogen evolution for example, where terminal branches | |
| 118 may contain polymorphism data) | |
| 119 | 163 |
| 120 Leaves: test only terminal (leaf) branches | 164 * **Substitution model:** If using amino-acid data, the substitution model to use. |
| 121 | 165 |
| 122 Unlabeled: if the Newick string is labeled using the {} notation, | 166 * **Set of branches to test:** The set of branches in the phylogeny to consider for the analysis. |
| 123 test only branches without explicit labels | 167 * `All branches`: (Default) Use all branches in the tree. |
| 124 (see http://hyphy.org/tutorials/phylotree/) | 168 * `Internal branches`: Use only the internal branches. This can be useful for studying pathogen evolution within a host, for example, where terminal branches might represent polymorphism rather than fixed differences. |
| 169 * `Leaf branches`: Use only the terminal (leaf) branches. | |
| 170 * `Unlabeled branches`: If the Newick tree is annotated with labels, use only the branches that do not have a label. | |
| 171 * `Custom`: Specify a custom set of branches by providing a label. | |
| 125 | 172 |
| 126 --max-parents The maximum number of parents allowed per node, i.e. how many sites | 173 * **Length of MCMC chain:** The total number of steps in the MCMC simulation. A longer chain will explore the space of possible networks more thoroughly but will take longer to run. |
| 127 can directly influence substitution patterns at another site | |
| 128 Increasing this number scales complexity nonlinearly | |
| 129 default value: 1 | |
| 130 | 174 |
| 131 --min-subs The minimum number of substitutions per site to include it in the analysis | 175 * **Number of samples to discard for burn-in:** The initial portion of the MCMC chain is often discarded to allow the simulation to converge to the posterior distribution. This parameter specifies the number of initial samples to discard. |
| 132 Filter low complexity (too few substitution) sites | |
| 133 default value: 1 | |
| 134 | 176 |
| 135 --chains How many MCMC chains to run (does not apply to Variational-Bayes) | 177 * **Number of steps to extract from chain sample:** The number of samples to draw from the MCMC chain after the burn-in period. These samples are used to estimate the posterior probabilities of the links. |
| 136 default value: 5 | |
| 137 | 178 |
| 138 --steps MCMC chain length (does not apply to Variational-Bayes) | 179 * **Maximum number of parents allowed per node:** This parameter controls the complexity of the graphical model. It sets the maximum number of other sites that can directly influence a given site. Increasing this number can reveal more complex dependency networks but also significantly increases the computational complexity. |
| 139 default value: 100,000 | |
| 140 | 180 |
| 141 --burn-in MCMC chain burn in (does not apply to Variational-Bayes) | 181 * **Minimum number of substitutions per site:** Sites with very few substitutions provide little information for detecting co-evolution. This parameter allows you to filter out such low-complexity sites from the analysis. |
| 142 default value: 10,000 | |
| 143 | 182 |
| 144 --samples MCMC samples to draw (does not apply to Variational-Bayes) | 183 **Further Reading** |
| 145 default value: 100 | |
| 146 | 184 |
| 185 For more information, please see the HyPhy documentation: http://hyphy.org/methods/selection-methods/#BGM | |
| 147 | 186 |
| 148 ]]></help> | 187 ]]></help> |
| 149 <expand macro="citations"> | 188 <expand macro="citations"> |
| 150 <citation type="doi">10.1371/journal.pcbi.0030231</citation> | 189 <citation type="doi">10.1371/journal.pcbi.0030231</citation> |
| 151 </expand> | 190 </expand> |
