comparison hyphy_bgm.xml @ 36:e45ec7931bfd draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/hyphy/ commit d97b1b98a3a621c93a7ed9e7db16bda47eefcb92
author iuc
date Tue, 07 Oct 2025 20:38:49 +0000
parents d162a1f8e495
children
comparison
equal deleted inserted replaced
35:d162a1f8e495 36:e45ec7931bfd
5 </macros> 5 </macros>
6 <expand macro="bio_tools"/> 6 <expand macro="bio_tools"/>
7 <expand macro="requirements"/> 7 <expand macro="requirements"/>
8 <command detect_errors="exit_code"><![CDATA[ 8 <command detect_errors="exit_code"><![CDATA[
9 @SYMLINK_FILES@ 9 @SYMLINK_FILES@
10 ln -s '$bgm_output' ${input_file}.BGM.json && 10 ENV="TOLERATE_NUMERICAL_ERRORS=1;" @HYPHYMP@ bgm
11 hyphy bgm
12 --alignment $input_file 11 --alignment $input_file
13 @INPUT_TREE@ 12 @INPUT_TREE@
14 --run_type $datatype.value 13 --type $datatype.value
15 #if $datatype.value == "codon": 14 #if $datatype.value == "codon":
16 --code '$datatype.gencodeid' 15 --code '$datatype.gencodeid'
17 #end if 16 #end if
18 #if $datatype.value == "protein": 17 #if $datatype.value == "amino-acid":
19 --baseline_model '$datatype.baseline_model' 18 --baseline_model '$datatype.baseline_model'
20 #end if 19 #end if
21 @branch_options@ 20 @branch_options@
22 --chain '$chain_length' 21 --steps '$chain_length'
23 --burn_in '$burn_in' 22 --burn-in '$burn_in'
24 --samples '$samples' 23 --samples '$samples'
25 --parents '$parents' 24 --max-parents '$parents'
26 --min_subs '$min_subs' 25 --min-subs '$min_subs'
26 --output '$bgm_output'
27 > bgm_stdout.md
27 @ERRORS@ 28 @ERRORS@
28 ]]></command> 29 ]]></command>
29 <inputs> 30 <inputs>
30 <expand macro="inputs" /> 31 <expand macro="inputs" />
31 <conditional name="datatype"> 32 <conditional name="datatype">
41 <when value="codon"> 42 <when value="codon">
42 <expand macro="gencode" /> 43 <expand macro="gencode" />
43 </when> 44 </when>
44 </conditional> 45 </conditional>
45 <expand macro="branches"/> 46 <expand macro="branches"/>
46 <param argument="--chain" name="chain_length" type="integer" value="100000" min="0" max="1000000000" label="Length of MCMC chain"/> 47 <section name="advanced_options" title="Advanced Options" expanded="false">
47 <param argument="--burn_in" type="integer" value="10000" min="0" max="1000000000" label="Number of samples to discard for burn-in"/> 48 <param argument="--steps" name="chain_length" type="integer" value="100000" min="0" max="1000000000" label="Length of MCMC chain" help="The total number of steps in the MCMC simulation."/>
48 <param argument="--samples" type="integer" value="100" min="0" max="100" label="Number of steps to extract from chain sample"/> 49 <param argument="--burn-in" type="integer" value="10000" min="0" max="1000000000" label="Number of samples to discard for burn-in" help="The initial portion of the MCMC chain is often discarded to allow the simulation to converge to the posterior distribution."/>
49 <param argument="--max-parents" name="parents" type="integer" value="1" min="1" max="3" label="Maximum number of parents allowed per node" /> 50 <param argument="--samples" type="integer" value="100" min="0" max="100000" label="Number of steps to extract from chain sample" help="The number of samples to draw from the MCMC chain after the burn-in period."/>
50 <param argument="--min-subs" type="integer" value="1" min="1" max="100000" label="Minimum number of ubstitutions per site to be included in the analysis" /> 51 <param argument="--max-parents" name="parents" type="integer" value="1" min="1" max="3" label="Maximum number of parents allowed per node" help="This parameter controls the complexity of the graphical model. It sets the maximum number of other sites that can directly influence a given site."/>
52 <param argument="--min-subs" type="integer" value="1" min="1" max="1000" label="Minimum number of substitutions per site to be included in the analysis" help="Sites with very few substitutions provide little information for detecting co-evolution. This parameter allows you to filter out such low-complexity sites from the analysis."/>
53 </section>
51 </inputs> 54 </inputs>
52 <outputs> 55 <outputs>
53 <data name="bgm_output" format="hyphy_results.json" /> 56 <data name="bgm_output" format="hyphy_results.json" />
57 <data name="bgm_md_report" format="markdown" from_work_dir="bgm_stdout.md" label="BGM Report (Markdown) for ${tool.name} on ${on_string}" />
54 </outputs> 58 </outputs>
55 <tests> 59 <tests>
56 <test> 60 <test expect_num_outputs="2">
57 <param name="input_file" ftype="fasta" value="bgm-in1.fa"/> 61 <param name="input_file" ftype="fasta" value="bgm-in1.fa"/>
58 <param name="input_nhx" ftype="nhx" value="bgm-in1.nhx"/> 62 <param name="input_nhx" ftype="nhx" value="bgm-in1.nhx"/>
59 <output name="bgm_output" file="bgm-out1.json" compare="sim_size"/> 63 <conditional name="datatype">
64 <param name="value" value="codon"/>
65 <param name="gencodeid" value="Universal"/>
66 </conditional>
67 <output name="bgm_output">
68 <assert_contents>
69 <has_text text="Probability that site 2 is conditionally dependent on site 1"/>
70 <has_text text="analysis"/>
71 </assert_contents>
72 </output>
73 <output name="bgm_md_report">
74 <assert_contents>
75 <has_text text="Analysis Description"/>
76 <has_text text="BGM analysis summary on 9 sites each with at least 1 substitutions"/>
77 </assert_contents>
78 </output>
79 </test>
80 <test expect_num_outputs="2">
81 <param name="input_file" ftype="fasta" value="fade-in1.fa"/>
82 <param name="input_nhx" ftype="nhx" value="fade-in1.nhx"/>
83 <conditional name="datatype">
84 <param name="value" value="amino-acid"/>
85 <param name="baseline_model" value="LG"/>
86 </conditional>
87 <output name="bgm_output">
88 <assert_contents>
89 <has_text text='"samples":100'/>
90 <has_text text="Probability that sites 1 and 2 are not conditionally independent"/>
91 </assert_contents>
92 </output>
93 <output name="bgm_md_report">
94 <assert_contents>
95 <has_text text="pairs of conditionally dependent sites found"/>
96 <has_text text="## BGM analysis summary on 149 sites each with at least 1 substitutions."/>
97 </assert_contents>
98 </output>
99 </test>
100 <test expect_num_outputs="2">
101 <param name="input_file" ftype="fasta" value="bgm-in1.fa"/>
102 <param name="input_nhx" ftype="nhx" value="bgm-in1.nhx"/>
103 <conditional name="datatype">
104 <param name="value" value="nucleotide"/>
105 </conditional>
106 <output name="bgm_output">
107 <assert_contents>
108 <has_text text='"samples":100'/>
109 <has_text text="Probability that sites 1 and 2 are not conditionally independent"/>
110 </assert_contents>
111 </output>
112 <output name="bgm_md_report">
113 <assert_contents>
114 <has_text text=">type => nucleotide"/>
115 <has_text text="BGM analysis summary on 24 sites each with at least 1 substitutions"/>
116 </assert_contents>
117 </output>
60 </test> 118 </test>
61 </tests> 119 </tests>
62 <help><![CDATA[ 120 <help><![CDATA[
63 121
64 BGM : Bayesian Graphical Models 122 BGM : Bayesian Graphical Models for Co-evolving Sites
65 =============================== 123 =====================================================
66 124
67 What does this do? 125 **What does this do?**
68 ------------------
69 126
70 This tools identifies groups of sites in the alignments that experience substitutions along the same branches, 127 This tool identifies groups of sites in a sequence alignment that appear to be co-evolving. Co-evolving sites are those that experience substitutions along the same branches of a phylogenetic tree more often than expected by chance. This pattern of correlated substitutions can imply a functional or structural relationship between the sites. For example, a destabilizing mutation at one site might be compensated for by a mutation at another site to preserve the protein's structure or function.
71 i.g. *co-evolve*.
72 128
73 Brief description 129 **How does it work?**
74 -----------------
75 130
76 GM (Bayesian Graphical Model) uses a maximum likelihood ancestral state 131 BGM employs a Bayesian Graphical Model to uncover these dependencies. The core idea is to represent each site in the alignment as a node in a graph. The algorithm then seeks to find the edges (links) between these nodes that represent statistically significant correlations in substitution patterns.
77 reconstruction to map substitution (non-synonymous only for coding data)
78 events to branches in the phylogeny and then analyzes the joint
79 distribution of the substitution map using a Bayesian graphical model
80 (network). Next, a Markov chain Monte Carlo analysis is used to generate
81 a random sample of network structures from the posterior distribution
82 given the data. Each node in the network represents a site in the
83 alignment, and links (edges) between nodes indicate high posterior
84 support for correlated substitutions at the two sites over time, which
85 implies coevolution.
86 132
133 The process involves several steps:
87 134
88 Input 135 1. **Ancestral State Reconstruction:** First, the method reconstructs the evolutionary history of the sequences using maximum likelihood. This allows the tool to map substitution events to specific branches of the phylogenetic tree. For coding data, only non-synonymous substitutions (those that change the amino acid) are considered.
89 -----
90 136
91 1. A *FASTA* sequence alignment. 137 2. **Bayesian Graphical Model:** The joint distribution of these substitution maps is then analyzed using a Bayesian graphical model. This model represents the probability of the observed substitution patterns given a particular network of dependencies between sites.
92 2. A phylogenetic tree in the *Newick* format
93 138
94 Note: the names of sequences in the alignment must match the names of the sequences in the tree. 139 3. **MCMC Sampling:** To explore the vast space of possible network structures, BGM uses a Markov Chain Monte Carlo (MCMC) analysis. This method generates a random sample of network structures from the posterior distribution, meaning it finds the networks that are most likely given the data.
95 140
96 Output 141 4. **Identifying Co-evolving Sites:** The links (edges) that appear most frequently in the sampled networks are the ones with the highest posterior support. These links connect the sites that are most likely to be co-evolving.
97 ------
98 142
99 A JSON file with analysis results (http://hyphy.org/resources/json-fields.pdf). 143 **Input**
100 144
101 A custom visualization module for viewing these results is available (see http://vision.hyphy.org/BGM for an example) 145 * A multiple sequence alignment in FASTA or NEXUS format.
146 * A phylogenetic tree in Newick format. The names of the sequences in the alignment must match the names of the tips in the tree.
102 147
103 Further reading 148 **Output**
104 ---------------
105 149
106 http://hyphy.org/methods/selection-methods/#BGM 150 * **JSON file:** A JSON file containing the detailed results of the analysis, including the posterior probabilities of the links between sites. (See http://hyphy.org/resources/json-fields.pdf for a description of the fields).
151 * **Markdown report:** A summary report in Markdown format.
107 152
153 A custom visualization module for viewing BGM results is available at http://vision.hyphy.org/BGM.
108 154
109 Tool options 155 **Tool Options**
110 ------------
111 ::
112 156
113 --branches Which branches should be tested for selection? 157 * **Type of data:** The type of sequence data in the alignment file.
114 All [default] : test all branches 158 * `nucleotide`: For DNA or RNA sequences.
159 * `amino-acid`: For protein sequences.
160 * `codon`: For coding DNA sequences. This is the default.
115 161
116 Internal : test only internal branches (suitable for 162 * **Genetic code:** If using codon data, the genetic code to use for translation.
117 intra-host pathogen evolution for example, where terminal branches
118 may contain polymorphism data)
119 163
120 Leaves: test only terminal (leaf) branches 164 * **Substitution model:** If using amino-acid data, the substitution model to use.
121 165
122 Unlabeled: if the Newick string is labeled using the {} notation, 166 * **Set of branches to test:** The set of branches in the phylogeny to consider for the analysis.
123 test only branches without explicit labels 167 * `All branches`: (Default) Use all branches in the tree.
124 (see http://hyphy.org/tutorials/phylotree/) 168 * `Internal branches`: Use only the internal branches. This can be useful for studying pathogen evolution within a host, for example, where terminal branches might represent polymorphism rather than fixed differences.
169 * `Leaf branches`: Use only the terminal (leaf) branches.
170 * `Unlabeled branches`: If the Newick tree is annotated with labels, use only the branches that do not have a label.
171 * `Custom`: Specify a custom set of branches by providing a label.
125 172
126 --max-parents The maximum number of parents allowed per node, i.e. how many sites 173 * **Length of MCMC chain:** The total number of steps in the MCMC simulation. A longer chain will explore the space of possible networks more thoroughly but will take longer to run.
127 can directly influence substitution patterns at another site
128 Increasing this number scales complexity nonlinearly
129 default value: 1
130 174
131 --min-subs The minium number of substitutions per site to include it in the analysis 175 * **Number of samples to discard for burn-in:** The initial portion of the MCMC chain is often discarded to allow the simulation to converge to the posterior distribution. This parameter specifies the number of initial samples to discard.
132 Filter low complexity (too few substitution) sites
133 default value: 1
134 176
135 --chains How many MCMC chains to run (does not apply to Variational-Bayes) 177 * **Number of steps to extract from chain sample:** The number of samples to draw from the MCMC chain after the burn-in period. These samples are used to estimate the posterior probabilities of the links.
136 default value: 5
137 178
138 --steps MCMC chain length (does not apply to Variational-Bayes) 179 * **Maximum number of parents allowed per node:** This parameter controls the complexity of the graphical model. It sets the maximum number of other sites that can directly influence a given site. Increasing this number can reveal more complex dependency networks but also significantly increases the computational complexity.
139 default value: 100,000
140 180
141 --burn-in MCMC chain burn in (does not apply to Variational-Bayes) 181 * **Minimum number of substitutions per site:** Sites with very few substitutions provide little information for detecting co-evolution. This parameter allows you to filter out such low-complexity sites from the analysis.
142 default value: 10,000
143 182
144 --samples MCMC samples to draw (does not apply to Variational-Bayes) 183 **Further Reading**
145 default value: 100
146 184
185 For more information, please see the HyPhy documentation: http://hyphy.org/methods/selection-methods/#BGM
147 186
148 ]]></help> 187 ]]></help>
149 <expand macro="citations"> 188 <expand macro="citations">
150 <citation type="doi">10.1371/journal.pcbi.0030231</citation> 189 <citation type="doi">10.1371/journal.pcbi.0030231</citation>
151 </expand> 190 </expand>