comparison ideas.xml @ 180:15bd502e6a0c draft

Uploaded
author greg
date Thu, 01 Feb 2018 12:54:06 -0500
parents 12168e1e4859
children 28a995056cd0
comparison
equal deleted inserted replaced
179:f32f15562a82 180:15bd502e6a0c
11 </requirements> 11 </requirements>
12 <command detect_errors="exit_code"><![CDATA[ 12 <command detect_errors="exit_code"><![CDATA[
13 #import os 13 #import os
14 #set perform_training = $perform_training_cond.perform_training 14 #set perform_training = $perform_training_cond.perform_training
15 15
16 ## Set the value of chrom_bed_input since we don't symlink it. 16 ## Copy the input's compressed tmp directory archive.
17 #if $input.metadata.chrom_bed is None: 17 cp $input.metadata.tmp_archive . &&
18 #set chrom_bed_input = None 18 ## Extract the tmp archive.
19 #else: 19 tar -xzf tmp.tar.gz &&
20 #set chrom_bed_input = $os.path.join($input.extra_files_path, $input.metadata.chrom_bed)
21 #end if
22
23 ## Link chromosome_windows.txt if not None.
24 #set chromosome_windows = $input.metadata.chrom_windows
25 #if chromosome_windows is not None:
26 #set from_path = $os.path.join($input.extra_files_path, $chromosome_windows)
27 ln -s $from_path . &&
28 #end if
29
30 ## Link IDEAS_input_config.txt which will always exist.
31 #set ideas_input_config = $input.metadata.input_config
32 #set from_path = $os.path.join($input.extra_files_path, $ideas_input_config)
33 ln -s $from_path . &&
34
35 ## Link all files in the extra_files_path's tmp directory.
36 #set tmp_dir = "tmp"
37 #set from_path = $os.path.join($input.extra_files_path, "tmp")
38 mkdir $tmp_dir &&
39 #for f in $os.listdir($from_path):
40 ln -s $os.path.join($from_path, $f) $tmp_dir &&
41 #end for
42 20
43 ## Define and create output directories. 21 ## Define and create output directories.
44 #set output_pdf_dir = "output_pdf_dir" 22 #set output_pdf_dir = "output_pdf_dir"
45 #set output_txt_dir = "output_txt_dir" 23 #set output_txt_dir = "output_txt_dir"
46 #set output_training_dir = "output_training_dir" 24 #set output_training_dir = "output_training_dir"
58 Rscript '$__tool_directory__/ideas.R' 36 Rscript '$__tool_directory__/ideas.R'
59 --burnin_num $burnin_num 37 --burnin_num $burnin_num
60 #if str($bychr) == "true": 38 #if str($bychr) == "true":
61 --bychr true 39 --bychr true
62 #end if 40 #end if
63 --chrom_bed_input $chrom_bed_input 41 #if $input.metadata.chrom_bed is not None:
64 --chromosome_windows $chromosome_windows 42 --chrom_bed_input $input.metadata.chrom_bed
43 #end if
44 #if $input.metadata.chrom_windows is not None:
45 --chromosome_windows $input.metadata.chrom_windows
46 #end if
65 #if str($hp) == "true": 47 #if str($hp) == "true":
66 --hp true 48 --hp true
67 #end if 49 #end if
68 #if str($initial_states) != "0": 50 #if str($initial_states) != "0":
69 --initial_states $initial_states 51 --initial_states $initial_states
70 #end if 52 #end if
71 --input_files_path $input.extra_files_path 53 --ideas_input_config $input.metadata.input_config
72 --ideas_input_config $ideas_input_config
73 #if str($log2) != "0.0": 54 #if str($log2) != "0.0":
74 --log2 $log2 55 --log2 $log2
75 #end if 56 #end if
76 #if str($maxerr) != "0.0": 57 #if str($maxerr) != "0.0":
77 --maxerr $maxerr 58 --maxerr $maxerr
142 <param name="project_name" type="text" value="myProject" label="Project name" help="Outputs will have this base name"> 123 <param name="project_name" type="text" value="myProject" label="Project name" help="Outputs will have this base name">
143 <validator type="empty_field"/> 124 <validator type="empty_field"/>
144 </param> 125 </param>
145 <param name="rseed" type="integer" value="1234" min="0" max="1000000" label="Seed for IDEAS model initialization" help="Zero value generates a random seed, and this seed will be different for each job run."/> 126 <param name="rseed" type="integer" value="1234" min="0" max="1000000" label="Seed for IDEAS model initialization" help="Zero value generates a random seed, and this seed will be different for each job run."/>
146 <param name="bychr" type="boolean" truevalue="true" falsevalue="" checked="False" label="Output chromosomes in separate files"/> 127 <param name="bychr" type="boolean" truevalue="true" falsevalue="" checked="False" label="Output chromosomes in separate files"/>
147 <param name="reads_per_bp" type="select" display="radio" label="Calculate the signal in each genomic window using"> 128 <param name="reads_per_bp" type="select" display="radio" label="Calculate the signal in each window using">
148 <option value="6" selected="true">mean</option> 129 <option value="6" selected="true">mean</option>
149 <option value="8">max</option> 130 <option value="8">max</option>
150 </param> 131 </param>
151 <param name="hp" type="boolean" truevalue="true" falsevalue="" checked="False" label="Discourage state transition across chromosomes"/> 132 <param name="hp" type="boolean" truevalue="true" falsevalue="" checked="False" label="Discourage state transition across chromosomes"/>
152 <param name="log2" type="float" value="0" min="0" label="Use log2(x+number) transformation" help="Zero means no log2 transformation"/> 133 <param name="log2" type="float" value="0" min="0" label="Use log2(x+number) transformation" help="Zero means no log2 transformation"/>
186 <filter>perform_training_cond['perform_training'] == 'yes'</filter> 167 <filter>perform_training_cond['perform_training'] == 'yes'</filter>
187 </collection> 168 </collection>
188 </outputs> 169 </outputs>
189 <tests> 170 <tests>
190 <test> 171 <test>
191 <param name="perform_training" value="no"/> 172 <param name="perform_training" value="yes"/>
192 <param name="input" value="input.ideaspre" ftype="ideaspre" dbkey="hg19"/> 173 <param name="training_iterations" value="3"/>
174 <param name="input" value="ideas_test1/input.html" dbkey="hg19" ftype="ideaspre">
175 <composite_data value='ideas_test1/chromosomes.bed' dbkey="hg19"/>
176 <composite_data value='ideas_test1/chromosome_windows.txt' dbkey="hg19"/>
177 <composite_data value='ideas_test1/IDEAS_input_config.txt' dbkey="hg19"/>
178 <composite_data value='ideas_test1/tmp.tar.gz' dbkey="hg19"/>
179 <param name="output_heatmaps" value="yes"/>
180 </param>
193 <param name="project_name" value="IDEAS_out"/> 181 <param name="project_name" value="IDEAS_out"/>
194 <param name="initial_states" value="2"/>
195 <param name="maxerr" value="1000"/>
196 <param name="output_heatmaps" value="no"/>
197 <param name="save_ideas_log" value="yes"/> 182 <param name="save_ideas_log" value="yes"/>
198 <output_collection name="output_txt_collection" type="list"> 183 <output_collection name="output_training_collection" type="list">
199 <element name="IDEAS_out.chr1.cluster" file="IDEAS_out.cluster" ftype="txt"/> 184 <element name="IDEAS_out.chr1.cluster" file="IDEAS_out.cluster" ftype="txt"/>
200 <element name="IDEAS_out.chr1.para" file="IDEAS_out.para" ftype="txt"/> 185 <element name="IDEAS_out.chr2.cluster" file="IDEAS_out.cluster" ftype="txt"/>
201 <element name="IDEAS_out.chr1.profile" file="IDEAS_out.profile" ftype="txt"/> 186 <element name="IDEAS_out.chr3.cluster" file="IDEAS_out.cluster" ftype="txt"/>
202 <element name="IDEAS_out.chr1.state" file="IDEAS_out.state" ftype="txt"/> 187 <element name="IDEAS_out.chr4.cluster" file="IDEAS_out.cluster" ftype="txt"/>
188 <element name="IDEAS_out.chr5.cluster" file="IDEAS_out.cluster" ftype="txt"/>
189 <element name="IDEAS_out.chr1.state" file="IDEAS_out.chr1.state" ftype="txt"/>
190 <element name="IDEAS_out.chr2.state" file="IDEAS_out.chr2.state" ftype="txt"/>
191 <element name="IDEAS_out.chr3.state" file="IDEAS_out.chr3.state" ftype="txt"/>
192 <element name="IDEAS_out.chr4.state" file="IDEAS_out.chr4.state" ftype="txt"/>
193 <element name="IDEAS_out.chr5.state" file="IDEAS_out.chr5.state" ftype="txt"/>
194 <element name="IDEAS_out.para0" file="IDEAS_out.para0" ftype="txt"/>
195 <element name="IDEAS_out.profile0" file="IDEAS_out.profile0" ftype="txt"/>
196 </output_collection>
197 <output_collection name="output_pdf_collection" type="list">
198 <element name="IDEAS_out.state.1.pdf" file="IDEAS_out.state.1.pdf" ftype="pdf"/>
203 </output_collection> 199 </output_collection>
204 <output name="output_log" file="output_log.txt" ftype="txt" compare="contains" /> 200 <output name="output_log" file="output_log.txt" ftype="txt" compare="contains" />
205 </test> 201 </test>
206 </tests> 202 </tests>
207 <help> 203 <help>
210 IDEAS (an **I**\ ntegrative and **D**\ iscriminative **E**\ pigenome **A**\ nnotation **S**\ ystem) identifies 206 IDEAS (an **I**\ ntegrative and **D**\ iscriminative **E**\ pigenome **A**\ nnotation **S**\ ystem) identifies
211 de novo regulatory functions from epigenetic data in multiple cell types jointly. It is a full probabilistic 207 de novo regulatory functions from epigenetic data in multiple cell types jointly. It is a full probabilistic
212 model defined on all data, and it combines signals across both the genome and cell types to boost power. The 208 model defined on all data, and it combines signals across both the genome and cell types to boost power. The
213 underlying assumption of IDEAS is that, because all cell types share the same underlying DNA sequences, 209 underlying assumption of IDEAS is that, because all cell types share the same underlying DNA sequences,
214 **functions of each DNA segment should be correlated**. Also, cell type specific regulation is locus-dependent, 210 **functions of each DNA segment should be correlated**. Also, cell type specific regulation is locus-dependent,
215 and thus IDEAS uses local epigenetic landscape to **identify de novo and local cell type clusters** without 211 and so IDEAS uses local epigenetic landscape to **identify de novo and local cell type clusters** without
216 assuming or requiring a known global cell type relationship. 212 assuming or requiring a known global cell type relationship.
217 213
218 IDEAS takes as input a list of epigenetic data sets (histones, chromatin accessibility, CpG methylation, TFs, etc.) 214 The input is a single dataset with the **IdeasPre** datatype, which is produced by the IDEAS Preprocessor tool.
219 or any other whole-genome data sets (e.g., scores). Currently the supported data formats include BigWig and BAM.
220 All data sets will first be mapped by IDEAS to a common genomic coordinate in a selected assembly (200bp windows
221 by default, or user-provided). The user can specify regions to be considered or removed from the analysis. The
222 input data may come from one cell type/condition/individual/time point (although it does not fully utilize the
223 advantage of IDEAS), or from multiple cell types/conditions/individuals/time points. The same set of epigenetic
224 features may not be present in all cell types, for which IDEAS will do imputation of the missing tracks if
225 specified.
226 215
227 .. image:: $PATH_TO_IMAGES/ideas.png 216 .. image:: $PATH_TO_IMAGES/ideas.png
228 217
229 IDEAS predicts regulatory functions, denoted by epigenetic states, at each position in each cell type by 218 IDEAS predicts regulatory functions, denoted by epigenetic states, at each position in each cell type by
230 **combining information simultaneously learned from other cell types** at the same positions in cell types with 219 **combining information simultaneously learned from other cell types** at the same positions in cell types with
256 * **Epigenetic factor name** - epigenetic factor name 245 * **Epigenetic factor name** - epigenetic factor name
257 * **BAM or BigWig file** - BAM or BigWig file 246 * **BAM or BigWig file** - BAM or BigWig file
258 247
259 * **Project name** - datasets produced by IDEAS will have this base name. 248 * **Project name** - datasets produced by IDEAS will have this base name.
260 * **Seed for IDEAS model initialization** - enter an integer to be used as the seed for the IDEAS model initialization. A zero value causes IDEAS to automatically generate a random seed, and this seed will be different for each job run. 249 * **Seed for IDEAS model initialization** - enter an integer to be used as the seed for the IDEAS model initialization. A zero value causes IDEAS to automatically generate a random seed, and this seed will be different for each job run.
261 * **Select Bed file that defines genomic windows on which to process the data** - if "No" is selected, IDEAS will run whole genome segmentation. If "Yes" is selected, IDEAS will segment genomes in the unit of the windows defined by the bed file. This file can be in BED3, BED4 or BED5 format, but only the first three columns (chr posst posed) will be used.
262
263 * **Window size in base pairs** - Window size in base pairs (if "No" is selected)
264 * **Restrict processing to specified chromosomes** - If "Yes" is selected, processing will be restricted to specified chromosomes
265
266 * **Chromosomes** - processing will be restricted to specified chromosomes (if "Yes" is selected)
267
268 * **Bed file specifying the genomic windows** - bed file specifying the genomic windows (if "Yes" is selected)
269
270 * **Output chromosomes in separate files** - select "Yes" to produce separate files for each chromosome, allowing you to run IDEAS on different chromosomes separately. 250 * **Output chromosomes in separate files** - select "Yes" to produce separate files for each chromosome, allowing you to run IDEAS on different chromosomes separately.
271 * **Calculate the signal in each genomic window using** - use the bigWigAverageOverBed utility from the UCSC genome browser to calculate the signal (i.e., the number of reads per bp) in each genomic window. 251 * **Calculate the signal in each window using** - use the bigWigAverageOverBed utility from the UCSC genome browser to calculate the signal (i.e., the number of reads per bp) in each window.
272 * **Select file(s) containing regions to exclude** - select one or more bed files that contain regions you'd like excluded from your datasets.
273 * **Standardize all datasets** - select "Yes" to standardize all datasets (e.g., reads / total_reads * 20 million) so that the signals from different cell types become comparable - your datasets can be read counts, logp-values or fold change. 252 * **Standardize all datasets** - select "Yes" to standardize all datasets (e.g., reads / total_reads * 20 million) so that the signals from different cell types become comparable - your datasets can be read counts, logp-values or fold change.
274 * **Discourage state transition across chromosomes** - select "Yes" to produce similar states in adjacent windows, making the annotation smoother, but at the risk of reducing precision. 253 * **Discourage state transition across chromosomes** - select "Yes" to produce similar states in adjacent windows, making the annotation smoother, but at the risk of reducing precision.
275 * **Use log2(x+number) transformation** - perform Log2-transformation of the input data by log2(x+number) (recommended for read count data to reduce skewness). You can enter a number that is representative of the noise level in your data (e.g., a number less than 1). If this number is at a similar scale or larger than the signal in your data, it will lose power. For example, if your input data is mean read count per window, using 0.1 may produce better results. 254 * **Use log2(x+number) transformation** - perform Log2-transformation of the input data by log2(x+number) (recommended for read count data to reduce skewness). You can enter a number that is representative of the noise level in your data (e.g., a number less than 1). If this number is at a similar scale or larger than the signal in your data, it will lose power. For example, if your input data is mean read count per window, using 0.1 may produce better results.
276 * **Maximum number of states to be inferred** - restrict the maximum number of states to be generated by IDEAS; the final number of inferred states may be smaller than the number you specified. 255 * **Maximum number of states to be inferred** - restrict the maximum number of states to be generated by IDEAS; the final number of inferred states may be smaller than the number you specified.
277 * **Initial number of states** - while IDEAS may infer 30 states or more by starting from just 20 states, it may not do so if it is trapped in a local mode. We recommend setting the initial number of states slightly larger than the number of states you expect. 256 * **Initial number of states** - while IDEAS may infer 30 states or more by starting from just 20 states, it may not do so if it is trapped in a local mode. We recommend setting the initial number of states slightly larger than the number of states you expect.