comparison ideas.xml @ 83:3e214a2fcec9 draft

Uploaded
author greg
date Wed, 15 Nov 2017 07:36:12 -0500
parents
children
comparison
equal deleted inserted replaced
82:0c807f99a816 83:3e214a2fcec9
1 <tool id="ideas" name="IDEAS" version="1.2.0">
2 <description>accounts for position dependent epigenetic events and detects local cell type relationships</description>
3 <requirements>
4 <requirement type="package" version="2.26.0">bedtools</requirement>
5 <requirement type="package" version="332">ucsc-bedgraphtobigwig</requirement>
6 <requirement type="package" version="332">ucsc-bedsort</requirement>
7 <requirement type="package" version="332">ucsc-bigwigaverageoverbed</requirement>
8 <requirement type="package" version="1.20">ideas</requirement>
9 <requirement type="package" version="1.4.4">r-optparse</requirement>
10 </requirements>
11 <command detect_errors="exit_code"><![CDATA[
12 #set output_dir = "output_dir"
13 #set tmp_dir = "tmp"
14 #set prep_input_config = "prep_input_config.txt"
15 #set prep_output_config = '$project_name'
16 ##############################################
17 ## Create the config file and prepare the data
18 ##############################################
19 #set specify_genomic_window = $specify_genomic_window_cond.specify_genomic_window
20 mkdir '$output_dir' &&
21 cp '$gen_prep_input_config' $prep_input_config &&
22 prepMat
23 $prep_input_config
24 #if str($specify_genomic_window) == "yes":
25 -bed '$specify_genomic_window_cond.bed_input'
26 #else:
27 -gsz '$chromInfo'
28 -wsz $specify_genomic_window_cond.window_size
29 #set restrict_chromosomes = $specify_genomic_window_cond.restrict_chromosomes_cond.restrict_chromosomes
## Collect the chromosome names entered in the "Chromosomes" repeat and pass them to prepMat as one comma-separated -chr value.
## #silent keeps list.append() from writing "None" into the command line; #echo evaluates the join instead of emitting it literally.
30 #if str($restrict_chromosomes) == "yes":
31 #set chroms = []
32 #set chrom_repeat = $specify_genomic_window_cond.restrict_chromosomes_cond.chrom_repeat
33 #for $i in $chrom_repeat:
34 #silent $chroms.append(str($i.chrom))
35 #end for
36 -chr '#echo ",".join($chroms)#'
37 #end if
38 #end if
39 $bychr
40 -c $reads_per_bp
41 #if str($blacklist_input) not in ["None", ""]:
42 -exclude '$blacklist_input'
43 #end if
44 $norm
45 ##############################################
46 ## Coerce the prepMat config output to the
47 ## format expected by IDEAS.
48 ##############################################
49 && cut -d' ' $prep_input_config -f1,2 > file1.txt
50 && ls tmp/*.bed.gz > file2.txt
51 && paste <(cat file1.txt) <(cat file2.txt) -d' ' > $prep_output_config
52 ##############################################
53 ## Run IDEAS
54 ##############################################
55 && ideas
56 '$prep_output_config'
57 #set specify_genomic_window = $specify_genomic_window_cond.specify_genomic_window
58 #if str($specify_genomic_window) == "yes":
59 '$specify_genomic_window_cond.bed_input'
60 #else:
61 $tmp_dir/*.bed
62 #end if
63 $hp
## Optional IDEAS flags: a zero value means "leave the option unset" (see each parameter's help text).
## Float parameters are compared numerically because their string form may be "0" or "0.0" depending on how the value was entered.
64 #if float(str($log2)) != 0.0:
65 -log2 $log2
66 #end if
67 #if float(str($max_states)) != 0.0:
68 -G $max_states
69 #end if
70 #if str($initial_states) != "0":
71 -C $initial_states
72 #end if
73 #if str($max_position_classes) != "0":
74 -P $max_position_classes
75 #end if
76 #if str($max_cell_type_clusters) != "0":
77 -K $max_cell_type_clusters
78 #end if
79 #if float(str($prior_concentration)) != 0.0:
80 -A $prior_concentration
81 #end if
82 -sample $burnin_num $mcmc_num
83 #if float(str($minerr)) != 0.0:
84 -minerr $minerr
85 #end if
86 #if float(str($maxerr)) != 0.0:
87 -maxerr $maxerr
88 #end if
89 -rseed $rseed
90 -thread \${GALAXY_SLOTS:-4}
## Keep the IDEAS log only when the user asked for it; the dataset path is quoted like the other paths in this command.
91 #if str($save_ideas_log) == "yes":
92 > '$output_log'
93 #else:
94 > /dev/null
95 #end if
96 && mv ./*.cluster '$output_dir'
97 && mv ./*.para '$output_dir'
98 && mv ./*.profile '$output_dir'
99 && mv ./*.state '$output_dir'
## Render the heatmap from the .para file that was moved into the output directory.
## The file name follows $prep_output_config (the project name) rather than a fixed "prep_output_config.txt"; assumes IDEAS names its outputs after the config file it is given - TODO confirm.
100 && Rscript '$__tool_directory__/create_heatmap.R'
101 -i '$output_dir/${prep_output_config}.para'
102 -o '$output_heatmap'
103 ]]></command>
104 <configfiles>
105 <configfile name="gen_prep_input_config"><![CDATA[#if str($cell_type_epigenetic_factor_cond.cell_type_epigenetic_factor) == "extract":
106 #set input_name_positions = $cell_type_epigenetic_factor_cond.input_name_positions
107 #for $i in $cell_type_epigenetic_factor_cond.input:
108 #set file_name_with_ext = $i.name
109 #assert str($file_name_with_ext).find("-") >= 0, "The selected input '%s' is invalid because it does not include the '-' character which is required when setting cell type and epigenetic factor names by extracting them from the input file names." % $file_name_with_ext
110 #set file_name = $file_name_with_ext.split(".")[0]
111 #if str($input_name_positions) == "cell_first":
112 #set cell_type_name = $file_name.split("-")[0]
113 #set epigenetic_factor_name = $file_name.split("-")[1]
114 #else:
115 #set cell_type_name = $file_name.split("-")[1]
116 #set epigenetic_factor_name = $file_name.split("-")[0]
117 #end if
118 ${cell_type_name} ${epigenetic_factor_name} ${i}
119 #end for
120 #else:
121 #for $input_items in $cell_type_epigenetic_factor_cond.input_repeat:
122 ${input_items.cell_type_name} ${input_items.epigenetic_factor_name} ${input_items.input}
123 #end for
124 #end if]]></configfile>
125 </configfiles>
126 <inputs>
127 <conditional name="cell_type_epigenetic_factor_cond">
128 <param name="cell_type_epigenetic_factor" type="select" label="Set cell type and epigenetic factor names by">
129 <option value="extract" selected="true">extracting them from the selected input file names</option>
130 <option value="manual">manually setting them for each selected input</option>
131 </param>
132 <when value="extract">
133 <param name="input" type="data" format="bigwig,bam" multiple="True" label="BAM or BigWig files">
134 <validator type="empty_field"/>
135 <validator type="unspecified_build"/>
136 </param>
137 <param name="input_name_positions" type="select" display="radio" label="Selected input file name pattern is" help="A '-' character must separate cell type and epigenetic factor names within the selected input file names">
138 <option value="cell_first" selected="true">Cell type name - Epigenetic factor name</option>
139 <option value="cell_last">Epigenetic factor name - Cell type name</option>
140 </param>
141 </when>
142 <when value="manual">
143 <repeat name="input_repeat" title="Cell type, Epigenetic factor and Input" min="1">
144 <param name="cell_type_name" type="text" value="" label="Cell type name">
145 <validator type="empty_field"/>
146 </param>
147 <param name="epigenetic_factor_name" type="text" value="" label="Epigenetic factor name">
148 <validator type="empty_field"/>
149 </param>
150 <param name="input" type="data" format="bigwig,bam" label="BAM or BigWig file">
151 <validator type="empty_field"/>
152 <validator type="unspecified_build"/>
153 </param>
154 </repeat>
155 </when>
156 </conditional>
157 <param name="project_name" type="text" value="" optional="false" label="Project name" help="Output datasets will have this base name"><validator type="regex" message="Project name is required and may contain only letters, digits, '_', '.' and '-' (it is used as a file name)">^[A-Za-z0-9_][A-Za-z0-9_.-]*$</validator></param> <!-- The value becomes the IDEAS config file name in the command block, so empty names and names with whitespace are rejected up front. -->
158 <param argument="-rseed" type="integer" value="1234" min="0" max="1000000" label="Seed for IDEAS model initialization" help="Zero value generates a random seed, and this seed will be different for each job run."/>
159 <conditional name="specify_genomic_window_cond">
160 <param name="specify_genomic_window" type="select" label="Select Bed file that defines genomic windows on which to process the data">
161 <option value="no" selected="true">No</option>
162 <option value="yes">Yes</option>
163 </param>
164 <when value="no">
165 <param name="window_size" type="integer" value="200" label="Window size in base pairs"/>
166 <conditional name="restrict_chromosomes_cond">
167 <param name="restrict_chromosomes" type="select" label="Restrict processing to specified chromosomes">
168 <option value="no" selected="true">No</option>
169 <option value="yes">Yes</option>
170 </param>
171 <when value="no"/>
172 <when value="yes">
173 <repeat name="chrom_repeat" title="Chromosomes" min="1">
174 <param name="chrom" type="text" value="" label="Chromosome" help="One chromosome (e.g., chr1, chr2, chrX) per text field"><validator type="empty_field"/></param> <!-- An empty entry would put an empty item in the -chr list, so a value is required. -->
175 </repeat>
176 </when>
177 </conditional>
178 </when>
179 <when value="yes">
180 <param name="bed_input" type="data" format="bed" label="Bed file specifying the genomic windows"/>
181 </when>
182 </conditional>
183 <param argument="-bychr" type="boolean" truevalue="-bychr" falsevalue="" checked="False" label="Output chromosomes in separate files"/>
184 <param name="reads_per_bp" type="select" display="radio" label="Calculate the signal in each genomic window using">
185 <option value="6" selected="true">mean</option>
186 <option value="8">max</option>
187 </param>
188 <param name="blacklist_input" type="data" format="bed" optional="True" multiple="True" label="Select file(s) containing regions to exclude"/>
189 <param argument="-norm" type="boolean" truevalue="-norm" falsevalue="" checked="False" label="Standardize all datasets"/>
190 <param argument="-hp" type="boolean" truevalue="-hp" falsevalue="" checked="False" label="Discourage state transition across chromosomes"/>
191 <param name="log2" type="float" value="0" min="0" label="Use log2(x+number) transformation" help="Zero means no log2 transformation"/>
192 <param name="max_states" type="float" value="0" min="0" label="Maximum number of states to be inferred" help="Zero sets the maximum to a large number"/>
193 <param name="initial_states" type="integer" value="20" min="0" label="Initial number of states" help="Positive integer"/>
194 <param name="max_position_classes" type="integer" value="0" min="0" label="Maximum number of position classes to be inferred" help="Zero sets the maximum to a large number"/>
195 <param name="max_cell_type_clusters" type="integer" value="0" min="0" label="Maximum number of cell type clusters allowed" help="Zero sets the maximum to a large number"/>
196 <param name="prior_concentration" type="float" value="1" min="0" label="Prior concentration" help="Zero value results in the default: sqrt(number of cell types)"/>
197 <param name="burnin_num" type="integer" value="20" min="1" label="Number of burnin steps"/>
198 <param name="mcmc_num" type="integer" value="20" min="1" label="Number of maximization steps"/>
199 <param name="minerr" type="float" value="0.5" min="0" label="Minimum standard deviation for the emission Gaussian distribution" help="Zero value results in the default: 0.5"/>
200 <param name="maxerr" type="float" value="1000000" min="0" label="Maximum standard deviation for the emission Gaussian distribution" help="Zero sets the maximum to a large number"/>
201 <param name="save_ideas_log" type="select" display="radio" label="Save IDEAS log in an additional history item">
202 <option value="no" selected="true">No</option>
203 <option value="yes">Yes</option>
204 </param>
205 </inputs>
206 <outputs>
207 <data name="output_log" format="txt" label="${tool.name} (output log) on ${on_string}">
208 <filter>save_ideas_log == 'yes'</filter>
209 </data>
210 <data name="output_heatmap" format="pdf" label="${tool.name} (heatmap) on ${on_string}"/>
211 <collection name="output" type="list">
212 <discover_datasets pattern="__name__" directory="output_dir" format="txt"/>
213 </collection>
214 </outputs>
215 <tests> <!-- Two functional tests: names extracted from the input file name, and names entered manually; both use a user-supplied genomic-window BED file. NOTE(review): the expected outputs below (output_state, output_profile, output_para, output_cluster, and the collection primary_fna with .fna.aln elements) are not declared in the outputs section of this tool, which defines only output_log, output_heatmap and the collection named output; neither test sets project_name. Confirm and update the expectations. -->
216 <test>
217 <param name="cell_type_epigenetic_factor" value="extract"/>
218 <param name="input" value="e001-h3k4me3.bigwig" ftype="bigwig" dbkey="hg19"/>
219 <param name="input_name_positions" value="cell_first"/>
220 <param name="specify_genomic_window" value="yes"/>
221 <param name="bed_input" value="genomic_windows.bed" ftype="bed" dbkey="hg19"/>
222 <output name="output_state" file="output_state.txt" ftype="txt"/>
223 <output name="output_profile" file="output_profile.txt" ftype="txt"/>
224 <output name="output_para" file="output_para.txt" ftype="txt"/>
225 <output name="output_cluster" file="output_cluster.txt" ftype="txt"/>
226 <output_collection name="primary_fna" type="list">
227 <element name="3722.fna.aln" file="3722.fna.aln" ftype="fasta"/>
228 <element name="38889.fna.aln" file="38889.fna.aln" ftype="fasta"/>
229 <element name="39614.fna.aln" file="39614.fna.aln" ftype="fasta"/>
230 </output_collection>
231 </test>
232 <test>
233 <param name="cell_type_epigenetic_factor" value="manual"/>
234 <repeat name="input_repeat">
235 <param name="cell_type_name" value="e001" />
236 <param name="epigenetic_factor_name" value="h3k4me3"/>
237 <param name="input" value="e001-h3k4me3.bigwig" ftype="bigwig" dbkey="hg19"/>
238 </repeat>
239 <param name="specify_genomic_window" value="yes"/>
240 <param name="bed_input" value="genomic_windows.bed" ftype="bed" dbkey="hg19"/>
241 <output name="output_state" file="output_state.txt" ftype="txt"/>
242 <output name="output_profile" file="output_profile.txt" ftype="txt"/>
243 <output name="output_para" file="output_para.txt" ftype="txt"/>
244 <output name="output_cluster" file="output_cluster.txt" ftype="txt"/>
245 </test>
246 </tests>
247 <help>
248 **What it does**
249
250 IDEAS (an **I**\ ntegrative and **D**\ iscriminative **E**\ pigenome **A**\ nnotation **S**\ ystem) identifies
251 de novo regulatory functions from epigenetic data in multiple cell types jointly. It is a full probabilistic
252 model defined on all data, and it combines signals across both the genome and cell types to boost power. The
253 underlying assumption of IDEAS is that, because all cell types share the same underlying DNA sequences,
254 **functions of each DNA segment should be correlated**. Also, cell type specific regulation is locus-dependent,
255 and thus IDEAS uses local epigenetic landscape to **identify de novo and local cell type clusters** without
256 assuming or requiring a known global cell type relationship.
257
258 IDEAS takes as input a list of epigenetic data sets (histones, chromatin accessibility, CpG methylation, TFs, etc)
259 or any other whole-genome data sets (e.g., scores). Currently the supported data formats include BigWig and BAM.
260 All data sets will first be mapped by IDEAS to a common genomic coordinate in a selected assembly (200bp windows
261 by default, or user-provided). The user can specify regions to be considered or removed from the analysis. The
262 input data may come from one cell type/condition/individual/time point (although it does not fully utilize the
263 advantage of IDEAS), or from multiple cell types/conditions/individuals/time points. The same set of epigenetic
264 features may not be present in all cell types, for which IDEAS will do imputation of the missing tracks if
265 specified.
266
267 .. image:: $PATH_TO_IMAGES/ideas.png
268
269 IDEAS predicts regulatory functions, denoted by epigenetic states, at each position in each cell type by
270 **combining information simultaneously learned from other cell types** at the same positions in cell types with
271 similar local epigenetic landscapes. The sizes of the genomic intervals used to determine the similarity are also learned.
272 All of the inferences are done through parallel infinite-state hidden Markov models (iHMM), which is a Bayesian
273 non-parametric technique to automatically determine the number of local cell type clusters and the number of
274 epigenetic states.
275
276 In addition to its improved power, IDEAS has two unique advantages:
277
278 1) applies **linear time inference** with respect to the number of cell types, which allows it to study hundreds or more cell types jointly
279 2) uses mini-batch training to **improve reproducibility** of the predicted epigenetic states, which is important because genome segmentation is not convex and hence cannot guarantee a global optimal solution.
280
281 -----
282
283 **Options**
284
285 * **Set cell type and epigenetic factor names by** - cell type and epigenetic factor names can be set manually or by extracting them from the names of the selected input datasets. The latter case requires all selected datasets to have names that contain a "-" character.
286
287 * **BAM or BigWig files** - select one or more BAM or BigWig files from your history, making sure that the name of every selected input includes a "-" character (e.g., e001-h3k4me3.bigwig).
288 * **Cell type, Epigenetic factor and Input** - manually select any number of inputs, setting the cell type and epigenetic factor name for each. The combination of "cell type name" and "epigenetic factor name" must be unique for each input. For example, if you have replicate data you may want to specify the cell name as "rep1", "rep2", etc and the factor name as "rep1", "rep2", etc.
289
290 * **Cell type name** - cell type name
291 * **Epigenetic factor name** - epigenetic factor name
292 * **BAM or BigWig file** - BAM or BigWig file
293
294 * **Project name** - datasets produced by IDEAS will have this base name.
295 * **Seed for IDEAS model initialization** - enter an integer to be used as the seed for the IDEAS model initialization. A zero value causes IDEAS to automatically generate a random seed, and this seed will be different for each job run.
296 * **Select Bed file that defines genomic windows on which to process the data** - if "No" is selected, IDEAS will run whole genome segmentation. If "Yes" is selected, IDEAS will segment genomes in the unit of the windows defined by the bed file. This file can be in BED3, BED4 or BED5 format, but only the first three columns (chr posst posed) will be used.
297
298 * **Window size in base pairs** - Window size in base pairs (if "No" is selected)
299 * **Restrict processing to specified chromosomes** - If "Yes" is selected, processing will be restricted to specified chromosomes
300
301 * **Chromosomes** - processing will be restricted to specified chromosomes (if "Yes" is selected)
302
303 * **Bed file specifying the genomic windows** - bed file specifying the genomic windows (if "Yes" is selected)
304
305 * **Output chromosomes in separate files** - select "Yes" to produce separate files for each chromosome, allowing you to run IDEAS on different chromosomes separately.
306 * **Calculate the signal in each genomic window using** - use the bigWigAverageOverBed utility from the UCSC genome browser to calculate the signal (i.e., the number of reads per bp) in each genomic window.
307 * **Select file(s) containing regions to exclude** - select one or more bed files that contain regions you'd like excluded from your datasets.
308 * **Standardize all datasets** - select "Yes" to standardize all datasets (e.g., reads / total_reads * 20 million) so that the signals from different cell types become comparable - your datasets can be read counts, logp-values or fold change.
309
310 * **Discourage state transition across chromosomes** - select "Yes" to produce similar states in adjacent windows, making the annotation smoother, but at risk of reducing precision.
311 * **Use log2(x+number) transformation** - perform Log2-transformation of the input data by log2(x+number) (recommended for read count data to reduce skewness). You can enter a number that is representative of the noise level in your data (e.g., a number less than 1). If this number is at a similar scale or larger than the signal in your data, it will lose power. For example, if your input data is mean read count per window, using 0.1 may produce better results.
312 * **Maximum number of states to be inferred** - restrict the maximum number of states to be generated by IDEAS; the final number of inferred states may be smaller than the number you specified
313 * **Initial number of states** - while IDEAS may infer 30 states or more by starting from just 20 states, it may not do so if it is trapped in a local mode. We recommend setting the initial number of states slightly larger than the number of states you expect.
314 * **Maximum number of position classes to be inferred** - Set this value only if:
315
316 * you do not want position classes (e.g., for testing purposes), in this case set the value to 1
317 * IDEAS runs slowly because there are too many position classes; generally, fewer than 100 position classes will run fine
318
319 * **Maximum number of cell type clusters allowed** - If you set the value to 1, then all cell types will be clustered in one group, which may be desirable if all cell types are homogeneous and you want IDEAS to use information in all cell types equally.
320 * **Prior concentration** - specify the prior concentration parameter; default is A=sqrt(number of cell types). A smaller concentration parameter (e.g., 1 or less) will emphasize more on position specificity and a larger concentration parameter (e.g., 10 * number of cell types) will emphasize more on global homogeneity.
321 * **Number of burnin steps** - specify the number of burnin steps; default is 20. Increasing the burnin and maximization steps will increase computing and only slightly increase accuracy, while decreasing them will reduce computing resources but may also reduce accuracy. We recommend running IDEAS with at least 20 burnins and 20 maximizations. IDEAS will not stop even if it reaches a maximum mode.
322 * **Number of maximization steps** - specify the number of maximization steps; default is 20.
323 * **Minimum standard deviation for the emission Gaussian distribution** - This number multiplied by the overall standard deviation of your data will be used as a lower bound for the standard deviation for each factor in each epigenetic state (the default is 0.5). This number is useful for removing very subtle clusters in the data. Setting this value near 0 will allow IDEAS to discover many subtle states, while setting it greater than 1 will result in IDEAS losing the ability to detect meaningful states.
324 * **Maximum standard deviation for the emission Gaussian distribution** - if you want to find fine-grained states you may use this option (if not used, IDEAS uses infinity), but it is rarely used unless you need more states to be inferred.
325
326 </help>
327 <citations>
328 <citation type="doi">10.1093/nar/gkw278</citation>
329 </citations>
330 </tool>