1
|
1 <tool id="ideas" name="IDEAS" version="1.2.0">
|
0
|
2 <description>accounts for position dependent epigenetic events and detects local cell type relationships</description>
|
|
<requirements>
    <!-- Packages resolved for the job environment; versions are pinned. -->
    <requirement type="package" version="2.26.0">bedtools</requirement>
    <requirement type="package" version="332">ucsc-bedgraphtobigwig</requirement>
    <requirement type="package" version="332">ucsc-bedsort</requirement>
    <requirement type="package" version="332">ucsc-bigwigaverageoverbed</requirement>
    <requirement type="package" version="1.2.0">ideas</requirement>
    <requirement type="package" version="1.3.2">r-optparse</requirement>
</requirements>
|
30
|
<command detect_errors="exit_code"><![CDATA[
## The command chains three steps with '&&': prepMat, a rewrite of the prepMat
## config, and ideas.  Parameters that live inside a <conditional> or <repeat>
## are always referenced through their full path; they are not visible at the
## top level of the template namespace.
#import os
#set tmp_dir = 'tmp'
#set prep_output_config = 'prep_output_config.txt'
##set ideas_input_dir = 'ideas_input'
##set ideas_matrix_input_file = $os.path.join($ideas_input_dir, 'r_matrix.txt')
##mkdir -p $ideas_input_dir &&
##############################################
## Create the config file and prepare the data
##############################################
#set input_type = $input_type_cond.input_type
cp '$prep_input_config' 'prep_input_config.txt' &&
prepMat
## NOTE(review): nothing is passed to prepMat for the 'data_matrix' input type;
## that branch looks unimplemented - confirm before exposing it to users.
#if str($input_type) == 'datasets':
    '$prep_input_config'
    #set specify_genomic_window_cond = $input_type_cond.specify_genomic_window_cond
    #set specify_genomic_window = $specify_genomic_window_cond.specify_genomic_window
    #if str($specify_genomic_window) == 'yes':
        -bed '$specify_genomic_window_cond.bed_input'
    #else:
        -gsz '$chromInfo'
        -wsz $specify_genomic_window_cond.window_size
        #set restrict_chromosomes = $specify_genomic_window_cond.restrict_chromosomes_cond.restrict_chromosomes
        #if str($restrict_chromosomes) == 'yes':
            #set chroms = []
            ## Iterate over the repeat entries and read each entry's 'chrom'
            ## field; '#silent' keeps the None returned by append() out of the
            ## rendered command line.
            #for $chrom_entry in $specify_genomic_window_cond.restrict_chromosomes_cond.chrom_repeat:
                #silent $chroms.append(str($chrom_entry.chrom))
            #end for
            #set chroms_arg = ','.join($chroms)
            -chr '$chroms_arg'
        #end if
    #end if
#end if
#set outputs_by_chr = $outputs_by_chr_cond.outputs_by_chr
#if str($outputs_by_chr) == 'yes':
    -bychr
#end if
-c $reads_per_bp
#set blacklist_regions = $blacklist_regions_cond.blacklist_regions
#if str($blacklist_regions) == 'yes':
    -exclude '$blacklist_regions_cond.blacklist_input'
#end if
#set standardize_datasets = $standardize_datasets_cond.standardize_datasets
#if str($standardize_datasets) == 'yes':
    -norm
#end if
##############################################
## Coerce the prepMat config output to the
## format expected by the R matrix builder.
##############################################
## Columns 1-2 of the input config are pasted next to the list of
## files found in the tmp directory.
&& cut -d' ' -f1,2 '$prep_input_config' > file1.txt
&& ls $tmp_dir/*.bed.gz > file2.txt
&& paste file1.txt file2.txt > '$prep_output_config'
##############################################
## Build the R matrix from the prepMat output
##############################################
##&& Rscript '$__tool_directory__/build_matrix.R'
##-i $tmp_dir/*.bed.gz
##-o $ideas_matrix_input_file
##-w $ideas_input_dir
##############################################
## Run IDEAS on the R matrix
##############################################
&& ideas
'$prep_output_config'
$tmp_dir/*.bed
#set smoother_annotation = $smoother_annotation_cond.smoother_annotation
#if str($smoother_annotation) == 'yes':
    -hp
#end if
#set in_windows = $in_windows_cond.in_windows
#if str($in_windows) == 'yes':
    -inv $in_windows_cond.window_start $in_windows_cond.window_end
#end if
#set log2_transformation = $log2_transformation_cond.log2_transformation
#if str($log2_transformation) == 'yes':
    #set log2_num = $log2_transformation_cond.log2_num
    -log2
    ## Compare numerically so that '0', '0.0' and '0.00' all mean "not set".
    #if float(str($log2_num)) != 0.0:
        $log2_num
    #end if
#end if
#set max_states_inferred = $max_states_inferred_cond.max_states_inferred
#if str($max_states_inferred) == 'yes':
    -G $max_states_inferred_cond.max_states
#end if
#set num_initial_states = $num_initial_states_cond.num_initial_states
#if str($num_initial_states) == 'yes':
    -C $num_initial_states_cond.initial_states
#end if
#if str($max_position_classes) != '0':
    -P $max_position_classes
#end if
#if str($max_cell_type_clusters) != '0':
    -K $max_cell_type_clusters
#end if
## prior_concentration is a float, so compare numerically as well.
#if float(str($prior_concentration)) != 0.0:
    -A $prior_concentration
#end if
#set burnin_max_steps = $burnin_max_steps_cond.burnin_max_steps
#if str($burnin_max_steps) == 'yes':
    -sample $burnin_max_steps_cond.burnin_num $burnin_max_steps_cond.mcmc_num
#end if
#set set_min_standard_dev = $set_min_standard_dev_cond.set_min_standard_dev
#if str($set_min_standard_dev) == 'yes':
    -minerr $set_min_standard_dev_cond.min_standard_dev
#end if
#set set_max_standard_dev = $set_max_standard_dev_cond.set_max_standard_dev
#if str($set_max_standard_dev) == 'yes':
    -maxerr $set_max_standard_dev_cond.max_standard_dev
#end if
-thread \${GALAXY_SLOTS:-4}
> '$output_log'
&& mv *.cluster '$output_cluster'
&& mv *.para '$output_para'
&& mv *.profile '$output_profile'
&& mv *.state '$output_state'
]]></command>
|
|
<configfiles>
    <!-- Rendered once per job: one space-separated line per input_repeat entry
         (cell type name, epigenetic factor name, input dataset). -->
    <configfile name="prep_input_config"><![CDATA[#for $entry in $input_type_cond.input_repeat
${entry.cell_type_name} ${entry.epigenetic_factor_name} ${entry.input}
#end for ]]></configfile>
</configfiles>
|
|
<inputs>
    <!-- Input selection: either a list of (cell type, factor, dataset) entries or a data matrix. -->
    <conditional name="input_type_cond">
        <param name="input_type" type="select" label="Select input type">
            <option value="datasets" selected="true">Bam, BigWig files</option>
            <option value="data_matrix">Data matrix</option>
        </param>
        <when value="datasets">
            <repeat name="input_repeat" title="Cell type, Epigenetic factor and Input" min="1">
                <param name="cell_type_name" type="text" value="" label="Cell type name"/>
                <param name="epigenetic_factor_name" type="text" value="" label="Epigenetic factor name"/>
                <param name="input" type="data" format="bigwig,bam" label="BAM or BigWig file">
                    <validator type="unspecified_build"/>
                </param>
            </repeat>
            <conditional name="specify_genomic_window_cond">
                <param name="specify_genomic_window" type="select" label="Select Bed file that defines genomic windows on which to process the data">
                    <option value="no" selected="true">No</option>
                    <option value="yes">Yes</option>
                </param>
                <when value="no">
                    <param name="window_size" type="integer" value="200" label="Window size in base pairs"/>
                    <conditional name="restrict_chromosomes_cond">
                        <param name="restrict_chromosomes" type="select" label="Restrict processing to specified chromosomes">
                            <option value="no" selected="true">No</option>
                            <option value="yes">Yes</option>
                        </param>
                        <when value="no"/>
                        <when value="yes">
                            <repeat name="chrom_repeat" title="Chromosomes" min="1">
                                <param name="chrom" type="text" value="" label="Chromosome"/>
                            </repeat>
                        </when>
                    </conditional>
                </when>
                <when value="yes">
                    <param name="bed_input" type="data" format="bed" label="Bed file specifying the genomic windows"/>
                </when>
            </conditional>
        </when>
        <when value="data_matrix"/>
    </conditional>
    <!-- Options passed to prepMat by the command template. -->
    <conditional name="outputs_by_chr_cond">
        <param name="outputs_by_chr" type="select" display="radio" label="Output chromosomes in separate files">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes"/>
    </conditional>
    <param name="reads_per_bp" type="integer" value="1" min="1" max="8" label="Number of reads per base pair for calculating the average signal in each genomic window"/>
    <conditional name="blacklist_regions_cond">
        <param name="blacklist_regions" type="select" label="Select Bed file containing blacklist regions for exclusion">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes">
            <param name="blacklist_input" type="data" format="bed" label="Bed file containing regions to exclude"/>
        </when>
    </conditional>
    <conditional name="standardize_datasets_cond">
        <param name="standardize_datasets" type="select" display="radio" label="Standardize all datasets">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes"/>
    </conditional>
    <!-- Options passed to ideas by the command template. -->
    <conditional name="smoother_annotation_cond">
        <param name="smoother_annotation" type="select" display="radio" label="Discourage state transition across chromosomes">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes"/>
    </conditional>
    <conditional name="in_windows_cond">
        <param name="in_windows" type="select" display="radio" label="Run IDEAS only within defined windows in the input data">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes">
            <param name="window_start" type="integer" value="0" min="0" label="Window start" help="Zero-based"/>
            <param name="window_end" type="integer" value="0" min="0" label="Window end" help="Zero-based"/>
        </when>
    </conditional>
    <conditional name="log2_transformation_cond">
        <param name="log2_transformation" type="select" label="Perform Log2-transformation of the input data">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes">
            <param name="log2_num" type="float" value="0" min="0" max="1" label="Enter a number to use log2(x+num) transformation" help="Zero value has no effect"/>
        </when>
    </conditional>
    <conditional name="max_states_inferred_cond">
        <param name="max_states_inferred" type="select" label="Set the maximum number of states to be inferred">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes">
            <!-- A count of states, so an integer like initial_states below. -->
            <param name="max_states" type="integer" value="1" min="1" label="Maximum number of states to be inferred"/>
        </when>
    </conditional>
    <conditional name="num_initial_states_cond">
        <param name="num_initial_states" type="select" label="Set the initial number of states">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes">
            <param name="initial_states" type="integer" value="20" min="1" label="Initial number of states"/>
        </when>
    </conditional>
    <!-- For the next three parameters the command template omits the flag when the value is zero. -->
    <param name="max_position_classes" type="integer" value="0" min="0" label="Maximum number of position classes to be inferred" help="Zero value has no effect"/>
    <param name="max_cell_type_clusters" type="integer" value="0" min="0" label="Maximum number of cell type clusters allowed" help="Zero value has no effect"/>
    <param name="prior_concentration" type="float" value="0" min="0" label="Prior concentration" help="Zero value results in the default value: sqrt(number of cell types)"/>
    <conditional name="burnin_max_steps_cond">
        <param name="burnin_max_steps" type="select" label="Set the number of burnin and maximization steps">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes">
            <param name="burnin_num" type="integer" value="50" min="1" label="Number of burnin steps"/>
            <param name="mcmc_num" type="integer" value="50" min="1" label="Number of maximization steps"/>
        </when>
    </conditional>
    <conditional name="set_min_standard_dev_cond">
        <param name="set_min_standard_dev" type="select" label="Set the minimum standard deviation for the emission Gaussian distribution?">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes">
            <param name="min_standard_dev" type="float" value="0.5" label="Minimum standard deviation for the emission Gaussian distribution"/>
        </when>
    </conditional>
    <conditional name="set_max_standard_dev_cond">
        <param name="set_max_standard_dev" type="select" label="Set the maximum standard deviation for the emission Gaussian distribution?">
            <option value="no" selected="true">No</option>
            <option value="yes">Yes</option>
        </param>
        <when value="no"/>
        <when value="yes">
            <param name="max_standard_dev" type="float" value="100000000" label="Maximum standard deviation for the emission Gaussian distribution"/>
        </when>
    </conditional>
</inputs>
|
|
<outputs>
    <!-- output_log receives the redirected stdout of ideas; the other four are
         filled by the command's final 'mv' steps (*.cluster, *.para, *.profile, *.state). -->
    <data name="output_log" format="txt" label="${tool.name} (ideas output log) on ${on_string}"/>
    <data name="output_cluster" format="txt" label="${tool.name} (local cell type clustering) on ${on_string}"/>
    <data name="output_para" format="tabular" label="${tool.name} (epigenetic state frequency, mean and variance parameters) on ${on_string}"/>
    <data name="output_profile" format="txt" label="${tool.name} (profile) on ${on_string}"/>
    <data name="output_state" format="txt" label="${tool.name} (epigenetic states and position classes) on ${on_string}"/>
</outputs>
|
|
<tests>
    <!-- No functional tests are defined for this tool yet. -->
</tests>
|
|
299 <help>
|
|
300 **What it does**
|
|
301
|
|
302 Employs the IDEAS (Integrative and Discriminative Epigenome Annotation System) method for jointly and quantitatively characterizing
|
|
303 multivariate epigenetic landscapes in many cell types, tissues or conditions. The method accounts for position dependent epigenetic
|
|
304 events and detects local cell type relationships, which not only help to improve the accuracy of annotating functional classes of DNA
|
|
305 sequences, but also reveal cell type constitutive and specific loci. The method utilizes Bayesian non-parametric techniques to automatically
|
|
306 identify the best model size fitting to the data so users do not have to specify the number of states. On the other hand, users can
|
|
307 still specify the number of states if desired.
|
|
308
|
|
309 -----
|
|
310
|
|
311 **Required options**
|
|
312
|
|
313 * **Cell type, Epigenetic factor and Input** - specify any number of inputs with currently supported formats, either bam or bigwig. The cell name + factor name must be unique for each input. For example, if you have replicate data you may want to specify the cell name as "cell_rep1", "cell_rep2", etc and the factor name as "factor_rep1", "factor_rep2", etc.
|
|
314
|
|
315 * **Cell type name** - cell type name
|
|
316 * **Epigenetic factor name** - epigenetic factor name
|
|
317 * **BAM or BigWig file** - BAM or BigWig file
|
|
318
|
|
319 * **Set genomic windows on which to process the data** - if "No" is selected, IDEAS will run whole genome segmentation. If "Yes" is selected, IDEAS will segment genomes in the unit of the windows defined by the bed file. This file can be in BED3, BED4 or BED5 format, but only the first three columns (chr posst posed) will be used.
|
|
320
|
|
321 * **Window size in base pairs** - Window size in base pairs (if "No" is selected)
|
|
322 * **Restrict processing to specified chromosomes** - If "Yes" is selected, processing will be restricted to specified chromosomes
|
|
323
|
|
324 * **Chromosomes** - processing will be restricted to specified chromosomes (if "Yes" is selected)
|
|
325
|
|
326 * **Chromosome** - specified chromosome
|
|
327
|
|
328 * **Bed file specifying the genomic windows** - bed file specifying the genomic windows (if "Yes" is selected)
|
|
329
|
|
330 **Other options**
|
|
331
|
|
* **Output chromosomes in separate files** - select "Yes" to produce separate files for each chromosome, allowing you to run IDEAS on different chromosomes separately.
|
|
333 * **Select Bed file containing blacklist regions for exclusion** - select a Bed file that contains regions you'd like excluded from your datasets.
|
|
334 * **Standardize all datasets** - select "Yes" to standardize all datasets (e.g., reads / total_reads * 20 million) so that the signals from different cell types become comparable - your datasets can be read counts, logp-values or fold change.
|
|
335
|
|
336 * **Discourage state transition across chromosomes** - select "Yes" to produce similar states in adjacent windows, making the annotation smoother, but at risk of reducing precision.
|
|
337 * **Run IDEAS only within defined windows in the input data** - select "Yes" to Run IDEAS only in windows between zero-based start and end indexes of windows in the input data.
|
|
338 * **Perform Log2-transformation of the input data** - select "Yes" to perform Log2-transformation of the input data by log2(x+1) (recommended for read count data to reduce skewness). You can optionally enter a number less than 1 to direct IDEAS to produce log2(x+num) transformation. For example, if your input data is mean read count per window, then 1 may be too large, but using 0.1 may be more reasonable.
|
|
339 * **Set the maximum number of states to be inferred** - select "Yes" to restrict the maximum number of states to be generated by IDEAS; the final number of inferred states may be smaller than the number you specified
|
|
340 * **Set the initial number of states** - select "Yes" if the number of states you expect to generate is greater than 20. While IDEAS may infer 30 states or more by starting from just 20 states, it may not do so if it is trapped in a local mode. We recommend setting the initial number of states slightly larger than the number of states you expect.
|
|
341 * **Maximum number of position classes to be inferred** - Set this value only if:
|
|
342
|
|
* you do not want position classes (e.g., for testing purposes), in this case set the value to 1
|
|
344 * IDEAS runs slow because there are too many position classes, generally less than 100 position classes will run fine
|
|
345
|
|
346 * **Maximum number of cell type clusters allowed** - Set this value only for testing. If you set the value to 1, then all cell types will be clustered in one group.
|
|
347 * **Prior concentration** - specify the prior concentration parameter; default is A=sqrt(number of cell types). A smaller concentration parameter (e.g., 1 or less) will emphasize more on position specificity and a larger concentration parameter (e.g., 10 * number of cell types) will emphasize more on global homogeneity.
|
|
* **Set the number of burnin and maximization steps** - specify the number of burnin and maximization steps; the default is 50 50. Increasing these two numbers will increase computing and only slightly increase accuracy. Decreasing these two numbers will reduce computing but may also reduce accuracy. We recommend to run IDEAS with at least 20 burnins and 20 maximizations. IDEAS will not stop even if it reaches a maximum mode.
|
|
349 * **Set the minimum standard deviation for the emission Gaussian distribution** - specify the minimum standard deviation for the emission Gaussian distribution.
|
|
350
|
|
351 * **Minimum standard deviation for the emission Gaussian distribution** - you should change the default minerr value of 0.5 if the standard deviation of your data is much smaller or much larger than 1. The first line of the output produced by IDEAS is **ysd=xxx**, which is the total standard deviation of your data. If that value is less than 0.5, you may set the minimum standard deviation to an even smaller number (e.g., xxx/2). If the standard deviation of your data is much greater than 1, (e.g., 20), you may set the minimum standard deviation to a larger value, (e.g., 5). Modifying the minimum standard deviation in the former case is more necessary than in the latter case because otherwise you may end up finding no interesting segmentations. We do not recommend setting the minimum standard deviation to be 0 or smaller, as doing so may capture some artificial and uninteresting states due to tightly clustered data, such as 0 in read counts.
|
|
352
|
|
* **Set the maximum standard deviation for the emission Gaussian distribution** - specify the maximum standard deviation for the emission Gaussian distribution.
|
|
354
|
|
* **Maximum standard deviation for the emission Gaussian distribution** - if you want to find fine-grained states you may use this option (if not used, IDEAS uses infinity), but it is rarely used unless you need more states to be inferred.
|
|
356
|
|
357 </help>
|
|
<citations>
    <!-- Reference for the tool, given as a DOI. -->
    <citation type="doi">10.1093/nar/gkw278</citation>
</citations>
|
|
361 </tool>
|