Mercurial > repos > iuc > beagle
comparison beagle.xml @ 0:117d42db0a30 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/beagle commit ccb3f8eaa99490f8513200e45fc59e5011fb41e8"
| author | iuc |
|---|---|
| date | Sat, 03 Jul 2021 23:33:08 +0000 |
| parents | |
| children | d0a6954d0a0a |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:117d42db0a30 |
|---|---|
| 1 <tool id='beagle' name='Beagle' version='@TOOL_VERSION@+galaxy@SUFFIX_VERSION@' profile='20.01'> | |
| 2 <description>phasing genotypes and imputing ungenotyped markers</description> | |
| 3 <macros> | |
| 4 <import>macros.xml</import> | |
| 5 </macros> | |
| 6 <expand macro='edam_ontology' /> | |
| 7 <expand macro='requirements' /> | |
| 8 <command detect_errors='exit_code'><![CDATA[ | |
| 9 #set out_prefix='out' | |
| 10 #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3' | |
| 11 ln -s '${optional_inputs.ref}' ref.bref3 && ## give the dataset a .bref3 name before passing it as ref | |
| 12 #end if | |
| 13 beagle | |
| 14 gt='${gt}' | |
| 15 #if $optional_inputs.ref and $optional_inputs.ref.ext == 'bref3' | |
| 16 ref=ref.bref3 | |
| 17 #else if $optional_inputs.ref | |
| 18 ref='${optional_inputs.ref}' | |
| 19 #end if | |
| 20 #if $optional_inputs.map | |
| 21 map='${optional_inputs.map}' | |
| 22 #end if | |
| 23 #if $chrom | |
| 24 chrom='${chrom}' | |
| 25 #end if | |
| 26 #if $optional_inputs.excludesamples | |
| 27 excludesamples='${optional_inputs.excludesamples}' | |
| 28 #end if | |
| 29 #if $optional_inputs.excludemarkers | |
| 30 excludemarkers='${optional_inputs.excludemarkers}' | |
| 31 #end if | |
| 32 ne=$ne | |
| 33 window=$window | |
| 34 overlap=$overlap | |
| 35 #if str($seed) != '' | |
| 36 seed=$seed ## emptiness test above keeps an explicit seed of 0 | |
| 37 #end if | |
| 38 #if str($err) != '' | |
| 39 err=$err ## emptiness test above keeps an explicit err of 0 | |
| 40 #end if | |
| 41 burnin=$phasing_parameters.burnin | |
| 42 iterations=$phasing_parameters.iterations | |
| 43 phase-states=$phasing_parameters.phase_states | |
| 44 impute=$imputation_parameters.impute | |
| 45 imp-states=$imputation_parameters.imp_states | |
| 46 imp-segment=$imputation_parameters.imp_segment | |
| 47 imp-step=$imputation_parameters.imp_step | |
| 48 cluster=$imputation_parameters.cluster | |
| 49 ap=$imputation_parameters.ap | |
| 50 gp=$imputation_parameters.gp | |
| 51 out=$out_prefix | |
| 52 nthreads=\${GALAXY_SLOTS:-1} | |
| 53 && gunzip 'out.vcf.gz' | |
| 54 ]]> </command> | |
| 55 <inputs> | |
| 56 <param argument="gt" type="data" format="vcf" label="VCF file" | |
| 57 help="It specifies a VCF file containing genotypes for the study samples. | |
| 58 Each VCF record must contain a GT (genotype) format field"/> | |
| 59 <section name="optional_inputs" title="Optional input files" expanded="true"> | |
| 60 <param argument="ref" type="data" format="vcf,bref3" optional="true" label="Bref3 or VCF file with phased genotypes" | |
| 61 help="Each genotype must have two phased, non-missing alleles. If a VCF file is specified, the | |
| 62 phased allele separator '|' must be used"/> | |
| 63 <param argument="map" type="data" format="txt" optional="true" label="PLINK map file with cM units" | |
| 64 help="Beagle uses linear interpolation to estimate genetic positions between map positions. If | |
| 65 no genetic map is specified, Beagle assumes a constant recombination rate of 1 cM per Mb"/> | |
| 66 <param argument="excludesamples" type="data" format="txt" optional="true" label="Samples to exclude" | |
| 67 help="It specifies a file containing samples (one sample identifier per line) to be excluded | |
| 68 from the analysis" /> | |
| 69 <param argument="excludemarkers" type="data" format="txt" optional="true" label="Markers to exclude" | |
| 70 help="It specifies a file containing markers (one marker per line) to be excluded from the | |
| 71 analysis. Each line of the file can be either an identifier from a VCF record's ID field | |
| 72 or a genomic coordinate in the format: CHROM:POS" /> | |
| 73 </section> | |
| 74 <param argument="chrom" type="text" optional="true" label="Specify a chromosome interval" | |
| 75 help="Input format: [chrom]:[start]-[end]. The entire chromosome, the beginning, or the end may be | |
| 76 specified by chrom=[chrom], chrom=[chrom]:-[end], and chrom=[chrom]:[start]-, respectively"> | |
| 77 <sanitizer invalid_char=""> | |
| 78 <valid initial="string.letters,string.digits"> | |
| 79 <add value=":" /> | |
| 80 <add value="-" /> | |
| 81 </valid> | |
| 82 </sanitizer> | |
| 83 <validator type="regex">[0-9a-zA-Z:-]+</validator> | |
| 84 </param> | |
| 85 <param argument="ne" type="integer" min="0" value="1000000" label="Effective population size" | |
| 86 help="The default value is suitable for a large, outbred population. It is needed to specify an | |
| 87 appropriate effective population size if you are imputing ungenotyped markers in a small | |
| 88 or inbred population"/> | |
| 89 <param argument="window" type="float" min="0" value="40.0" label="Window length in cM" | |
| 90 help="The window parameter must be at least 1.1 times as large as the overlap parameter. | |
| 91 The window parameter controls the amount of memory required for the analysis"/> | |
| 92 <param argument="overlap" type="float" min="0" value="2.0" label="Window overlap in cM" | |
| 93 help="It specifies the cM length of overlap between adjacent sliding windows"/> | |
| 94 <param argument="err" type="float" min="0" max="1" optional="true" | |
| 95 label="Allele mismatch probability for the hidden Markov model" | |
| 96 help="If no err parameter is specified, the err parameter will be set equal to θ/(2(θ + H)) | |
| 97 where θ = 1/(0.5 + ln H) and H is the number of haplotypes"/> | |
| 98 <param argument="seed" type="integer" value="" optional="true" label="Random seed" | |
| 99 help="A random seed is a number used to initialize a pseudorandom number generator" /> | |
| 100 <param name="output_log" type="boolean" checked="false" label="Output a log file"/> | |
| 101 <section name="phasing_parameters" title="Phasing parameters"> | |
| 102 <param argument="burnin" type="integer" min="0" value="3" label="Max burnin iterations" | |
| 103 help="It is the maximum number of burnin iterations used to estimate an initial haplotype | |
| 104 frequency model for inferring genotype phase" /> | |
| 105 <param argument="iterations" type="integer" min="0" value="12" label="Phasing iterations" | |
| 106 help="It is the number of iterations used to estimate genotype phase. Increasing this | |
| 107 parameter will trade increased computation time for increased phasing accuracy" /> | |
| 108 <param argument="phase-states" type="integer" min="0" value="280" label="Model states for phasing" | |
| 109 help="It is the number of model states used to estimate genotype phase" /> | |
| 110 </section> | |
| 111 <section name="imputation_parameters" title="Imputation parameters"> | |
| 112 <param argument="impute" type="boolean" truevalue="true" falsevalue="false" | |
| 113 checked="true" label="Impute ungenotyped markers" | |
| 114 help="It specifies whether markers that are present in the reference panel but absent in | |
| 115 the target will be imputed. This option has no effect if no reference panel is specified"/> | |
| 116 <param argument="imp-states" type="integer" min="0" value="1600" label="Model states for imputation" | |
| 117 help="It is the number of model states used to impute ungenotyped markers" /> | |
| 118 <param argument="imp-segment" type="float" min="0" value="6.0" label="Minimum cM length of haplotype segments" | |
| 119 help="It is the minimum cM length of haplotype segments that will be incorporated in the HMM state | |
| 120 space for a target haplotype." /> | |
| 121 <param argument="imp-step" type="float" min="0" value="0.1" label="Length in cM for detecting short IBS segments" | |
| 122 help="It is the length in cM of the step used for detecting short IBS segments" /> | |
| 123 <param argument="cluster" type="float" min="0" value="0.005" label="Max cM in a marker cluster" | |
| 124 help="It specifies the maximum cM distance between individual markers that are combined | |
| 125 into an aggregate marker when imputing ungenotyped markers" /> | |
| 126 <param argument="ap" type="boolean" truevalue="true" falsevalue="false" | |
| 127 checked="false" label="Include posterior allele probabilities" | |
| 128 help="It specifies whether AP1 and AP2 (allele probability) fields will be included in the output | |
| 129 VCF file when imputing ungenotyped markers" /> | |
| 130 <param argument="gp" type="boolean" truevalue="true" falsevalue="false" | |
| 131 checked="false" label="Include posterior genotype probabilities" | |
| 132 help="It specifies whether a GP (genotype probability) format field will be included in the output | |
| 133 VCF file when imputing ungenotyped markers. Genotype probabilities are calculated from allele | |
| 134 probabilities assuming Hardy-Weinberg Equilibrium. Consequently, the alleles in the genotype | |
| 135 with highest genotype probability may occasionally be different than the genotype obtained by | |
| 136 taking the allele with highest probability on each haplotype, which is the genotype reported | |
| 137 in the GT format field" /> | |
| 138 </section> | |
| 139 </inputs> | |
| 140 <outputs> | |
| 141 <data name="vcf_file" format="vcf" from_work_dir="out.vcf" label="${tool.name} on ${on_string}: VCF file"/> | |
| 142 <data name="log_file" format="txt" from_work_dir="out.log" label="${tool.name} on ${on_string}: log file"> | |
| 143 <filter>output_log</filter> | |
| 144 </data> | |
| 145 </outputs> | |
| 146 <tests> | |
| 147 <!-- Test default values --> | |
| 148 <test expect_num_outputs="2"> | |
| 149 <param name="gt" value="test.vcf.gz"/> | |
| 150 <param name="chrom" value="22:100-"/> | |
| 151 <param name="ne" value="1000000"/> | |
| 152 <param name="window" value="40.0"/> | |
| 153 <param name="overlap" value="2.0"/> | |
| 154 <param name="err" value="0.02"/> | |
| 155 <param name="seed" value="1"/> | |
| 156 <param name="output_log" value="true"/> | |
| 157 <section name="phasing_parameters"> | |
| 158 <param name="burnin" value="3"/> | |
| 159 <param name="iterations" value="12"/> | |
| 160 <param name="phase_states" value="280"/> | |
| 161 </section> | |
| 162 <output name="vcf_file" file="test_output.vcf" ftype="vcf" lines_diff="3"/> | |
| 163 <output name="log_file" file="test_output.log" ftype="txt" lines_diff="16"/> | |
| 164 </test> | |
| 165 <!-- Test plink file--> | |
| 166 <test expect_num_outputs="2"> | |
| 167 <param name="gt" value="test.vcf.gz"/> | |
| 168 <param name="ne" value="1000000"/> | |
| 169 <param name="window" value="30.0"/> | |
| 170 <param name="overlap" value="3.0"/> | |
| 171 <param name="output_log" value="true"/> | |
| 172 <section name="optional_inputs"> | |
| 173 <param name="map" value="plink.map"/> | |
| 174 </section> | |
| 175 <section name="phasing_parameters"> | |
| 176 <param name="burnin" value="4"/> | |
| 177 <param name="iterations" value="10"/> | |
| 178 <param name="phase_states" value="250"/> | |
| 179 </section> | |
| 180 <output name="vcf_file" ftype="vcf"> | |
| 181 <assert_contents> | |
| 182 <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/> | |
| 183 <has_size value="181272"/> | |
| 184 </assert_contents> | |
| 185 </output> | |
| 186 <output name="log_file" ftype="txt"> | |
| 187 <assert_contents> | |
| 188 <has_text text="Reference markers: 223"/> | |
| 189 <has_size value="1586" delta="10"/> | |
| 190 </assert_contents> | |
| 191 </output> | |
| 192 </test> | |
| 193 <!-- Test ref VCF input --> | |
| 194 <test expect_num_outputs="2"> | |
| 195 <param name="gt" value="target.vcf.gz"/> | |
| 196 <param name="ne" value="1000000"/> | |
| 197 <param name="window" value="40.0"/> | |
| 198 <param name="overlap" value="2.0"/> | |
| 199 <param name="output_log" value="true"/> | |
| 200 <section name="optional_inputs"> | |
| 201 <param name="ref" value="ref.vcf.gz"/> | |
| 202 </section> | |
| 203 <section name="imputation_parameters"> | |
| 204 <param name="impute" value="true"/> | |
| 205 <param name="imp_states" value="1600"/> | |
| 206 <param name="imp_segment" value="6.0"/> | |
| 207 <param name="imp_step" value="0.1"/> | |
| 208 <param name="cluster" value="0.005"/> | |
| 209 <param name="ap" value="true"/> | |
| 210 <param name="gp" value="true"/> | |
| 211 </section> | |
| 212 <output name="vcf_file" ftype="vcf"> | |
| 213 <assert_contents> | |
| 214 <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/> | |
| 215 <has_size value="18635"/> | |
| 216 </assert_contents> | |
| 217 </output> | |
| 218 <output name="log_file" ftype="txt"> | |
| 219 <assert_contents> | |
| 220 <has_text text="Reference markers: 223"/> | |
| 221 <has_size value="1801" delta="10"/> | |
| 222 </assert_contents> | |
| 223 </output> | |
| 224 </test> | |
| 225 <!-- Test ref bref3 input --> | |
| 226 <test expect_num_outputs="1"> | |
| 227 <param name="gt" value="target.vcf.gz"/> | |
| 228 <param name="ne" value="1000000"/> | |
| 229 <param name="window" value="40.0"/> | |
| 230 <param name="overlap" value="2.0"/> | |
| 231 <section name="optional_inputs"> | |
| 232 <param name="ref" value="ref.bref3"/> | |
| 233 </section> | |
| 234 <section name="imputation_parameters"> | |
| 235 <param name="impute" value="true"/> | |
| 236 <param name="imp_states" value="1600"/> | |
| 237 <param name="imp_segment" value="6.0"/> | |
| 238 <param name="imp_step" value="0.1"/> | |
| 239 <param name="cluster" value="0.005"/> | |
| 240 <param name="ap" value="true"/> | |
| 241 <param name="gp" value="true"/> | |
| 242 </section> | |
| 243 <output name="vcf_file" ftype="vcf"> | |
| 244 <assert_contents> | |
| 245 <has_text text='ID=GT,Number=1,Type=String,Description="Genotype"'/> | |
| 246 <has_size value="18635"/> | |
| 247 </assert_contents> | |
| 248 </output> | |
| 249 </test> | |
| 250 </tests> | |
| 251 <help><![CDATA[ | |
| 252 .. class:: infomark | |
| 253 | |
| 254 **Purpose** | |
| 255 | |
| 256 Beagle is a program for phasing and imputing missing genotypes. Sporadic missing | |
| 257 genotypes are imputed during phasing. If a reference panel of phased genotypes is specified | |
| 258 with the ref argument, ungenotyped markers that are present in the reference panel can also | |
| 259 be imputed. | |
| 260 | |
| 261 Beagle version 5.2 provides significantly faster genotype phasing than version 5.1. | |
| 262 Recent versions of Beagle do not infer genotypes from genotype likelihood input data, but | |
| 263 Beagle versions 4.0 and 4.1 have this capability. | |
| 264 | |
| 265 ---- | |
| 266 | |
| 267 .. class:: infomark | |
| 268 | |
| 269 **HapMap genetic maps** | |
| 270 | |
| 271 HapMap genetic maps in PLINK format for GRCh36, GRCh37, and GRCh38 are available | |
| 272 in `this link <http://bochet.gcc.biostat.washington.edu/beagle/genetic_maps/>`_ | |
| 273 | |
| 274 ---- | |
| 275 | |
| 276 .. class:: infomark | |
| 277 | |
| 278 **Input files** | |
| 279 | |
| 280 Beagle uses `Variant Call Format <http://faculty.washington.edu/browning/beagle/intro-to-vcf.html>`_ | |
| 281 (VCF) 4.3 for input and output genotype data. Pseudoautosomal and non-pseudoautosomal | |
| 282 X-chromosome genotypes must be in separate input files and analysed separately unless male | |
| 283 haploid genotypes are coded as homozygous diploid genotypes. | |
| 284 | |
| 285 In the VCF file, if any heterozygote genotype is unphased (with "/" allele separator) in a marker window, | |
| 286 it will consider all heterozygote genotypes to be unphased, regardless of the allele separator used ("|" or "/"). | |
| 287 Beagle assumes that any VCF file whose name ends in ".gz" is compressed with gzip or bgzip, | |
| 288 and that a reference file whose name ends in ".bref3" is compressed with bref version 3. | |
| 289 | |
| 290 ---- | |
| 291 | |
| 292 .. class:: infomark | |
| 293 | |
| 294 **Output files** | |
| 295 | |
| 296 There are two output files. The log file gives a summary of the analysis that includes the | |
| 297 Beagle version, the command line arguments, and compute time. | |
| 298 | |
| 299 Beagle writes a bgzip-compressed VCF file (vcf.gz) that contains phased, non-missing | |
| 300 genotypes for all non-reference samples. This tool uncompresses that file with gunzip, | |
| 301 so the VCF output is returned as plain text. | |
| 302 | |
| 303 If a reference panel is specified and ungenotyped markers are imputed, the VCF INFO | |
| 304 field will contain: | |
| 305 | |
| 306 :: | |
| 307 | |
| 308 - A "DR2" subfield with the estimated squared correlation between the estimated allele dose and the true allele dose. | |
| 309 - An "AF" subfield with the estimated alternate allele frequencies in the target samples. | |
| 310 - The "IMP" flag if the marker is imputed. | |
| 311 | |
| 312 ]]> </help> | |
| 313 <expand macro="citations" /> | |
| 314 </tool> |
