annotate ideas.xml @ 83:3e214a2fcec9 draft

Uploaded
author greg
date Wed, 15 Nov 2017 07:36:12 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
83
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
1 <tool id="ideas" name="IDEAS" version="1.2.0">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
2 <description>accounts for position dependent epigenetic events and detects local cell type relationships</description>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
3 <requirements>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
4 <requirement type="package" version="2.26.0">bedtools</requirement>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
5 <requirement type="package" version="332">ucsc-bedgraphtobigwig</requirement>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
6 <requirement type="package" version="332">ucsc-bedsort</requirement>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
7 <requirement type="package" version="332">ucsc-bigwigaverageoverbed</requirement>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
8 <requirement type="package" version="1.20">ideas</requirement>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
9 <requirement type="package" version="1.4.4">r-optparse</requirement>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
10 </requirements>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
11 <command detect_errors="exit_code"><![CDATA[
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
12 #set output_dir = "output_dir"
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
13 #set tmp_dir = "tmp"
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
14 #set prep_input_config = "prep_input_config.txt"
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
15 #set prep_output_config = '$project_name'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
16 ##############################################
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
17 ## Create the config file and prepare the data
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
18 ##############################################
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
19 #set specify_genomic_window = $specify_genomic_window_cond.specify_genomic_window
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
20 mkdir '$output_dir' &&
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
21 cp '$gen_prep_input_config' $prep_input_config &&
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
22 prepMat
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
23 $prep_input_config
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
24 #if str($specify_genomic_window) == "yes":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
25 -bed '$specify_genomic_window_cond.bed_input'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
26 #else:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
27 -gsz '$chromInfo'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
28 -wsz $specify_genomic_window_cond.window_size
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
29 #set restrict_chromosomes = $specify_genomic_window_cond.restrict_chromosomes_cond.restrict_chromosomes
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
30 #if str($restrict_chromosomes) == "yes":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
31 #set chroms = []
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
32 #set chrom_repeat = $specify_genomic_window_cond.restrict_chromosomes_cond.chrom_repeat
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
## Collect the chromosome name from each repeat instance, then pass them to prepMat as one comma-separated -chr value.
33 #for $i in $chrom_repeat
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
34 #silent $chroms.append(str($i.chrom))
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
35 #end for
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
36 -chr '${",".join($chroms)}'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
37 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
38 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
39 $bychr
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
40 -c $reads_per_bp
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
41 #if str($blacklist_input) not in ["None", ""]:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
42 -exclude '$blacklist_input'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
43 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
44 $norm
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
45 ##############################################
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
46 ## Coerce the prepMat config output to the
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
47 ## format expected by IDEAS.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
48 ##############################################
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
49 && cut -d' ' $prep_input_config -f1,2 > file1.txt
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
50 && ls tmp/*.bed.gz > file2.txt
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
51 && paste <(cat file1.txt) <(cat file2.txt) -d' ' > $prep_output_config
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
52 ##############################################
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
53 ## Run IDEAS
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
54 ##############################################
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
55 && ideas
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
56 '$prep_output_config'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
57 #set specify_genomic_window = $specify_genomic_window_cond.specify_genomic_window
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
58 #if str($specify_genomic_window) == "yes":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
59 '$specify_genomic_window_cond.bed_input'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
60 #else:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
61 $tmp_dir/*.bed
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
62 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
63 $hp
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
64 #if str($log2) != "0.0":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
65 -log2 $log2
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
66 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
67 #if str($max_states) != "0.0":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
68 -G $max_states
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
69 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
70 #if str($initial_states) != "0":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
71 -C $initial_states
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
72 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
73 #if str($max_position_classes) != "0":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
74 -P $max_position_classes
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
75 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
76 #if str($max_cell_type_clusters) != "0":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
77 -K $max_cell_type_clusters
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
78 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
79 #if str($prior_concentration) != "0.0":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
80 -A $prior_concentration
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
81 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
82 -sample $burnin_num $mcmc_num
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
83 #if str($minerr) != "0.0":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
84 -minerr $minerr
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
85 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
86 #if str($maxerr) != "0.0":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
87 -maxerr $maxerr
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
88 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
89 -rseed $rseed
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
90 -thread \${GALAXY_SLOTS:-4}
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
91 #if str($save_ideas_log) == "yes":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
92 > $output_log
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
93 #else:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
94 > /dev/null
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
95 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
96 && mv ./*.cluster '$output_dir'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
97 && mv ./*.para '$output_dir'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
98 && mv ./*.profile '$output_dir'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
99 && mv ./*.state '$output_dir'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
100 && Rscript '$__tool_directory__/create_heatmap.R'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
101 -i '$output_dir/${prep_output_config}.para'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
102 -o '$output_heatmap'
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
103 ]]></command>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
104 <configfiles>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
105 <configfile name="gen_prep_input_config"><![CDATA[#if str($cell_type_epigenetic_factor_cond.cell_type_epigenetic_factor) == "extract":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
106 #set input_name_positions = $cell_type_epigenetic_factor_cond.input_name_positions
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
107 #for $i in $cell_type_epigenetic_factor_cond.input:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
108 #set file_name_with_ext = $i.name
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
109 #assert str($file_name_with_ext).find("-") >= 0, "The selected input '%s' is invalid because it does not include the '-' character which is required when setting cell type and epigenetic factor names by extracting them from the input file names." % $file_name_with_ext
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
110 #set file_name = $file_name_with_ext.split(".")[0]
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
111 #if str($input_name_positions) == "cell_first":
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
112 #set cell_type_name = $file_name.split("-")[0]
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
113 #set epigenetic_factor_name = $file_name.split("-")[1]
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
114 #else:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
115 #set cell_type_name = $file_name.split("-")[1]
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
116 #set epigenetic_factor_name = $file_name.split("-")[0]
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
117 #end if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
118 ${cell_type_name} ${epigenetic_factor_name} ${i}
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
119 #end for
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
120 #else:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
121 #for $input_items in $cell_type_epigenetic_factor_cond.input_repeat:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
122 ${input_items.cell_type_name} ${input_items.epigenetic_factor_name} ${input_items.input}
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
123 #end for
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
124 #end if]]></configfile>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
125 </configfiles>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
126 <inputs>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
127 <conditional name="cell_type_epigenetic_factor_cond">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
128 <param name="cell_type_epigenetic_factor" type="select" label="Set cell type and epigenetic factor names by">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
129 <option value="extract" selected="true">extracting them from the selected input file names</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
130 <option value="manual">manually setting them for each selected input</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
131 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
132 <when value="extract">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
133 <param name="input" type="data" format="bigwig,bam" multiple="True" label="BAM or BigWig files">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
134 <validator type="empty_field"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
135 <validator type="unspecified_build"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
136 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
137 <param name="input_name_positions" type="select" display="radio" label="Selected input file name pattern is" help="A '-' character must separate cell type and epigenetic factor names within the selected input file names">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
138 <option value="cell_first" selected="true">Cell type name - Epigenetic factor name</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
139 <option value="cell_last">Epigenetic factor name - Cell type name</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
140 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
141 </when>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
142 <when value="manual">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
143 <repeat name="input_repeat" title="Cell type, Epigenetic factor and Input" min="1">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
144 <param name="cell_type_name" type="text" value="" label="Cell type name">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
145 <validator type="empty_field"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
146 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
147 <param name="epigenetic_factor_name" type="text" value="" label="Epigenetic factor name">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
148 <validator type="empty_field"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
149 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
150 <param name="input" type="data" format="bigwig,bam" label="BAM or BigWig file">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
151 <validator type="empty_field"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
152 <validator type="unspecified_build"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
153 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
154 </repeat>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
155 </when>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
156 </conditional>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
157 <param name="project_name" type="text" value="" optional="false" label="Project name" help="Output datasets will have this base name"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
158 <param argument="-rseed" type="integer" value="1234" min="0" max="1000000" label="Seed for IDEAS model initialization" help="Zero value generates a random seed, and this seed will be different for each job run."/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
159 <conditional name="specify_genomic_window_cond">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
160 <param name="specify_genomic_window" type="select" label="Select Bed file that defines genomic windows on which to process the data">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
161 <option value="no" selected="true">No</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
162 <option value="yes">Yes</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
163 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
164 <when value="no">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
165 <param name="window_size" type="integer" value="200" label="Window size in base pairs"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
166 <conditional name="restrict_chromosomes_cond">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
167 <param name="restrict_chromosomes" type="select" label="Restrict processing to specified chromosomes">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
168 <option value="no" selected="true">No</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
169 <option value="yes">Yes</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
170 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
171 <when value="no"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
172 <when value="yes">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
173 <repeat name="chrom_repeat" title="Chromosomes" min="1">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
174 <param name="chrom" type="text" value="" label="Chromosome" help="One chromosome (e.g., chr1, chr2, chrX) per text field"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
175 </repeat>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
176 </when>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
177 </conditional>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
178 </when>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
179 <when value="yes">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
180 <param name="bed_input" type="data" format="bed" label="Bed file specifying the genomic windows"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
181 </when>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
182 </conditional>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
183 <param argument="-bychr" type="boolean" truevalue="-bychr" falsevalue="" checked="False" label="Output chromosomes in separate files"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
184 <param name="reads_per_bp" type="select" display="radio" label="Calculate the signal in each genomic window using">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
185 <option value="6" selected="true">mean</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
186 <option value="8">max</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
187 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
188 <param name="blacklist_input" type="data" format="bed" optional="True" multiple="True" label="Select file(s) containing regions to exclude"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
189 <param argument="-norm" type="boolean" truevalue="-norm" falsevalue="" checked="False" label="Standardize all datasets"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
190 <param argument="-hp" type="boolean" truevalue="-hp" falsevalue="" checked="False" label="Discourage state transition across chromosomes"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
191 <param name="log2" type="float" value="0" min="0" label="Use log2(x+number) transformation" help="Zero means no log2 transformation"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
192 <param name="max_states" type="float" value="0" min="0" label="Maximum number of states to be inferred" help="Zero sets the maximum to a large number"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
193 <param name="initial_states" type="integer" value="20" min="0" label="Initial number of states" help="Positive integer"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
194 <param name="max_position_classes" type="integer" value="0" min="0" label="Maximum number of position classes to be inferred" help="Zero sets the maximum to a large number"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
195 <param name="max_cell_type_clusters" type="integer" value="0" min="0" label="Maximum number of cell type clusters allowed" help="Zero sets the maximum to a large number"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
196 <param name="prior_concentration" type="float" value="1" min="0" label="Prior concentration" help="Zero value results in the default: sqrt(number of cell types)"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
197 <param name="burnin_num" type="integer" value="20" min="1" label="Number of burnin steps"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
198 <param name="mcmc_num" type="integer" value="20" min="1" label="Number of maximization steps"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
199 <param name="minerr" type="float" value="0.5" min="0" label="Minimum standard deviation for the emission Gaussian distribution" help="Zero value results in the default: 0.5"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
200 <param name="maxerr" type="float" value="1000000" min="0" label="Maximum standard deviation for the emission Gaussian distribution" help="Zero sets the maximum to a large number"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
201 <param name="save_ideas_log" type="select" display="radio" label="Save IDEAS log in an additional history item">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
202 <option value="no" selected="true">No</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
203 <option value="yes">Yes</option>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
204 </param>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
205 </inputs>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
206 <outputs>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
207 <data name="output_log" format="txt" label="${tool.name} (output log) on ${on_string}">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
208 <filter>save_ideas_log == 'yes'</filter>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
209 </data>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
210 <data name="output_heatmap" format="pdf" label="${tool.name} (heatmap) on ${on_string}"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
211 <collection name="output" type="list">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
212 <discover_datasets pattern="__name__" directory="output_dir" format="txt"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
213 </collection>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
214 </outputs>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
215 <tests>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
216 <test>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
217 <param name="cell_type_epigenetic_factor" value="extract"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
218 <param name="input" value="e001-h3k4me3.bigwig" ftype="bigwig" dbkey="hg19"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
219 <param name="input_name_positions" value="cell_first"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
220 <param name="specify_genomic_window" value="yes"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
221 <param name="bed_input" value="genomic_windows.bed" ftype="bed" dbkey="hg19"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
222 <output name="output_state" file="output_state.txt" ftype="txt"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
223 <output name="output_profile" file="output_profile.txt" ftype="txt"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
224 <output name="output_para" file="output_para.txt" ftype="txt"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
225 <output name="output_cluster" file="output_cluster.txt" ftype="txt"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
226 <output_collection name="primary_fna" type="list">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
227 <element name="3722.fna.aln" file="3722.fna.aln" ftype="fasta"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
228 <element name="38889.fna.aln" file="38889.fna.aln" ftype="fasta"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
229 <element name="39614.fna.aln" file="39614.fna.aln" ftype="fasta"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
230 </output_collection>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
231 </test>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
232 <test>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
233 <param name="cell_type_epigenetic_factor" value="manual"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
234 <repeat name="input_repeat">
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
235 <param name="cell_type_name" value="e001" />
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
236 <param name="epigenetic_factor_name" value="h3k4me3"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
237 <param name="input" value="e001-h3k4me3.bigwig" ftype="bigwig" dbkey="hg19"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
238 </repeat>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
239 <param name="specify_genomic_window" value="yes"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
240 <param name="bed_input" value="genomic_windows.bed" ftype="bed" dbkey="hg19"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
241 <output name="output_state" file="output_state.txt" ftype="txt"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
242 <output name="output_profile" file="output_profile.txt" ftype="txt"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
243 <output name="output_para" file="output_para.txt" ftype="txt"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
244 <output name="output_cluster" file="output_cluster.txt" ftype="txt"/>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
245 </test>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
246 </tests>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
247 <help>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
248 **What it does**
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
249
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
250 IDEAS (an **I**\ ntegrative and **D**\ iscriminative **E**\ pigenome **A**\ nnotation **S**\ ystem) identifies
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
251 de novo regulatory functions from epigenetic data in multiple cell types jointly. It is a full probabilistic
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
252 model defined on all data, and it combines signals across both the genome and cell types to boost power. The
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
253 underlying assumption of IDEAS is that, because all cell types share the same underlying DNA sequences,
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
254 **functions of each DNA segment should be correlated**. Also, cell type specific regulation is locus-dependent,
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
255 and thus IDEAS uses local epigenetic landscape to **identify de novo and local cell type clusters** without
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
256 assuming or requiring a known global cell type relationship.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
257
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
258 IDEAS takes as input a list of epigenetic data sets (histones, chromatin accessibility, CpG methylation, TFs, etc)
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
259 or any other whole-genome data sets (e.g., scores). Currently the supported data formats include BigWig and BAM.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
260 All data sets will first be mapped by IDEAS to a common genomic coordinate in a selected assembly (200bp windows
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
261 by default, or user-provided). The user can specify regions to be considered or removed from the analysis. The
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
262 input data may come from one cell type/condition/individual/time point (although it does not fully utilize the
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
263 advantage of IDEAS), or from multiple cell types/conditions/individuals/time points. The same set of epigenetic
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
264 features may not be present in all cell types, for which IDEAS will do imputation of the missing tracks if
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
265 specified.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
266
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
267 .. image:: $PATH_TO_IMAGES/ideas.png
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
268
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
269 IDEAS predicts regulatory functions, denoted by epigenetic states, at each position in each cell type by
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
270 **combining information simultaneously learned from other cell types** at the same positions in cell types with
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
271 similar local epigenetic landscapes. The sizes of the genomic intervals used to determine similarity are also learned.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
272 All of the inferences are done through parallel infinite-state hidden Markov models (iHMM), which is a Bayesian
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
273 non-parametric technique to automatically determine the number of local cell type clusters and the number of
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
274 epigenetic states.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
275
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
276 In addition to its improved power, IDEAS has two unique advantages:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
277
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
278 1) applies **linear time inference** with respect to the number of cell types, which allows it to study hundreds or more cell types jointly
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
279 2) uses mini-batch training to **improve reproducibility** of the predicted epigenetic states, which is important because genome segmentation is not convex and hence cannot guarantee a global optimal solution.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
280
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
281 -----
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
282
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
283 **Options**
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
284
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
285 * **Set cell type and epigenetic factor names by** - cell type and epigenetic factor names can be set manually or by extracting them from the names of the selected input datasets. The latter case requires all selected datasets to have names that contain a "-" character.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
286
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
287 * **BAM or BigWig files** - select one or more BAM or BigWig files from your history, making sure that the name of every selected input includes a "-" character (e.g., e001-h3k4me3.bigwig).
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
288 * **Cell type, Epigenetic factor and Input** - manually select any number of inputs, setting the cell type and epigenetic factor name for each. The combination of "cell type name" and "epigenetic factor name" must be unique for each input. For example, if you have replicate data you may want to specify the cell name as "rep1", "rep2", etc and the factor name as "rep1", "rep2", etc.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
289
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
290 * **Cell type name** - cell type name
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
291 * **Epigenetic factor name** - epigenetic factor name
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
292 * **BAM or BigWig file** - BAM or BigWig file
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
293
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
294 * **Project name** - datasets produced by IDEAS will have this base name.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
295 * **Seed for IDEAS model initialization** - enter an integer to be used as the seed for the IDEAS model initialization. A zero value causes IDEAS to automatically generate a random seed, and this seed will be different for each job run.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
296 * **Select Bed file that defines genomic windows on which to process the data** - if "No" is selected, IDEAS will run whole genome segmentation. If "Yes" is selected, IDEAS will segment genomes in the unit of the windows defined by the bed file. This file can be in BED3, BED4 or BED5 format, but only the first three columns (chromosome, start position, end position) will be used.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
297
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
298 * **Window size in base pairs** - Window size in base pairs (if "No" is selected)
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
299 * **Restrict processing to specified chromosomes** - If "Yes" is selected, processing will be restricted to specified chromosomes
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
300
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
301 * **Chromosomes** - processing will be restricted to specified chromosomes (if "Yes" is selected)
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
302
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
303 * **Bed file specifying the genomic windows** - bed file specifying the genomic windows (if "Yes" is selected)
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
304
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
305 * **Output chromosomes in separate files** - select "Yes" to produce separate files for each chromosome, allowing you to run IDEAS on different chromosomes separately.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
306 * **Calculate the signal in each genomic window using** - use the bigWigAverageOverBed utility from the UCSC genome browser to calculate the signal (i.e., the number of reads per bp) in each genomic window.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
307 * **Select file(s) containing regions to exclude** - select one or more bed files that contain regions you'd like excluded from your datasets.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
308 * **Standardize all datasets** - select "Yes" to standardize all datasets (e.g., reads / total_reads * 20 million) so that the signals from different cell types become comparable - your datasets can be read counts, logp-values or fold change.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
309
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
310 * **Discourage state transition across chromosomes** - select "Yes" to produce similar states in adjacent windows, making the annotation smoother, but at risk of reducing precision.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
311 * **Use log2(x+number) transformation** - perform Log2-transformation of the input data by log2(x+number) (recommended for read count data to reduce skewness). You can enter a number that is representative of the noise level in your data (e.g., a number less than 1). If this number is at a similar scale or larger than the signal in your data, it will lose power. For example, if your input data is mean read count per window, using 0.1 may produce better results.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
312 * **Maximum number of states to be inferred** - restrict the maximum number of states to be generated by IDEAS; the final number of inferred states may be smaller than the number you specified
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
313 * **Initial number of states** - while IDEAS may infer 30 states or more by starting from just 20 states, it may not do so if it is trapped in a local mode. We recommend setting the initial number of states slightly larger than the number of states you expect.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
314 * **Maximum number of position classes to be inferred** - Set this value only if:
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
315
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
316 * you do not want position classes (e.g., for testing purposes), in this case set the value to 1
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
317 * IDEAS runs slowly because there are too many position classes; generally, fewer than 100 position classes will run fine
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
318
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
319 * **Maximum number of cell type clusters allowed** - If you set the value to 1, then all cell types will be clustered in one group, which may be desirable if all cell types are homogeneous and you want IDEAS to use information in all cell types equally.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
320 * **Prior concentration** - specify the prior concentration parameter; default is A=sqrt(number of cell types). A smaller concentration parameter (e.g., 1 or less) will place more emphasis on position specificity and a larger concentration parameter (e.g., 10 * number of cell types) will place more emphasis on global homogeneity.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
321 * **Number of burnin steps** - specify the number of burnin steps; default is 20. Increasing the burnin and maximization steps will require more computing resources and only slightly increase accuracy, while decreasing them will reduce computing resources but may also reduce accuracy. We recommend running IDEAS with at least 20 burnins and 20 maximizations. IDEAS will not stop even if it reaches a maximum mode.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
322 * **Number of maximization steps** - specify the number of maximization steps; default is 20.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
323 * **Minimum standard deviation for the emission Gaussian distribution** - This number multiplied by the overall standard deviation of your data will be used as a lower bound for the standard deviation for each factor in each epigenetic state (the default is 0.5). This number is useful for removing very subtle clusters in the data. Setting this value near 0 will allow IDEAS to discover many subtle states, while setting it greater than 1 will result in IDEAS losing the ability to detect meaningful states.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
324 * **Maximum standard deviation for the emission Gaussian distribution** - if you want to find fine-grained states you may use this option (if not used, IDEAS uses infinity), but it is rarely used unless you need more states to be inferred.
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
325
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
326 </help>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
327 <citations>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
328 <citation type="doi">10.1093/nar/gkw278</citation>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
329 </citations>
3e214a2fcec9 Uploaded
greg
parents:
diff changeset
330 </tool>