Mercurial > repos > greg > ideas
comparison ideas.xml @ 180:15bd502e6a0c draft
Uploaded
| author | greg |
|---|---|
| date | Thu, 01 Feb 2018 12:54:06 -0500 |
| parents | 12168e1e4859 |
| children | 28a995056cd0 |
comparison
equal
deleted
inserted
replaced
| 179:f32f15562a82 | 180:15bd502e6a0c |
|---|---|
| 11 </requirements> | 11 </requirements> |
| 12 <command detect_errors="exit_code"><![CDATA[ | 12 <command detect_errors="exit_code"><![CDATA[ |
| 13 #import os | 13 #import os |
| 14 #set perform_training = $perform_training_cond.perform_training | 14 #set perform_training = $perform_training_cond.perform_training |
| 15 | 15 |
| 16 ## Set the value of chrom_bed_input since we don't symlink it. | 16 ## Copy the input's compressed tmp directory archive. |
| 17 #if $input.metadata.chrom_bed is None: | 17 cp $input.metadata.tmp_archive . && |
| 18 #set chrom_bed_input = None | 18 ## Extract the tmp archive. |
| 19 #else: | 19 tar -xzf tmp.tar.gz && |
| 20 #set chrom_bed_input = $os.path.join($input.extra_files_path, $input.metadata.chrom_bed) | |
| 21 #end if | |
| 22 | |
| 23 ## Link chromosome_windows.txt if not None. | |
| 24 #set chromosome_windows = $input.metadata.chrom_windows | |
| 25 #if chromosome_windows is not None: | |
| 26 #set from_path = $os.path.join($input.extra_files_path, $chromosome_windows) | |
| 27 ln -s $from_path . && | |
| 28 #end if | |
| 29 | |
| 30 ## Link IDEAS_input_config.txt which will always exist. | |
| 31 #set ideas_input_config = $input.metadata.input_config | |
| 32 #set from_path = $os.path.join($input.extra_files_path, $ideas_input_config) | |
| 33 ln -s $from_path . && | |
| 34 | |
| 35 ## Link all files in the extra_files_path's tmp directory. | |
| 36 #set tmp_dir = "tmp" | |
| 37 #set from_path = $os.path.join($input.extra_files_path, "tmp") | |
| 38 mkdir $tmp_dir && | |
| 39 #for f in $os.listdir($from_path): | |
| 40 ln -s $os.path.join($from_path, $f) $tmp_dir && | |
| 41 #end for | |
| 42 | 20 |
| 43 ## Define and create output directories. | 21 ## Define and create output directories. |
| 44 #set output_pdf_dir = "output_pdf_dir" | 22 #set output_pdf_dir = "output_pdf_dir" |
| 45 #set output_txt_dir = "output_txt_dir" | 23 #set output_txt_dir = "output_txt_dir" |
| 46 #set output_training_dir = "output_training_dir" | 24 #set output_training_dir = "output_training_dir" |
| 58 Rscript '$__tool_directory__/ideas.R' | 36 Rscript '$__tool_directory__/ideas.R' |
| 59 --burnin_num $burnin_num | 37 --burnin_num $burnin_num |
| 60 #if str($bychr) == "true": | 38 #if str($bychr) == "true": |
| 61 --bychr true | 39 --bychr true |
| 62 #end if | 40 #end if |
| 63 --chrom_bed_input $chrom_bed_input | 41 #if $input.metadata.chrom_bed is not None: |
| 64 --chromosome_windows $chromosome_windows | 42 --chrom_bed_input $input.metadata.chrom_bed |
| 43 #end if | |
| 44 #if $input.metadata.chrom_windows is not None: | |
| 45 --chromosome_windows $input.metadata.chrom_windows | |
| 46 #end if | |
| 65 #if str($hp) == "true": | 47 #if str($hp) == "true": |
| 66 --hp true | 48 --hp true |
| 67 #end if | 49 #end if |
| 68 #if str($initial_states) != "0": | 50 #if str($initial_states) != "0": |
| 69 --initial_states $initial_states | 51 --initial_states $initial_states |
| 70 #end if | 52 #end if |
| 71 --input_files_path $input.extra_files_path | 53 --ideas_input_config $input.metadata.input_config |
| 72 --ideas_input_config $ideas_input_config | |
| 73 #if str($log2) != "0.0": | 54 #if str($log2) != "0.0": |
| 74 --log2 $log2 | 55 --log2 $log2 |
| 75 #end if | 56 #end if |
| 76 #if str($maxerr) != "0.0": | 57 #if str($maxerr) != "0.0": |
| 77 --maxerr $maxerr | 58 --maxerr $maxerr |
| 142 <param name="project_name" type="text" value="myProject" label="Project name" help="Outputs will have this base name"> | 123 <param name="project_name" type="text" value="myProject" label="Project name" help="Outputs will have this base name"> |
| 143 <validator type="empty_field"/> | 124 <validator type="empty_field"/> |
| 144 </param> | 125 </param> |
| 145 <param name="rseed" type="integer" value="1234" min="0" max="1000000" label="Seed for IDEAS model initialization" help="Zero value generates a random seed, and this seed will be different for each job run."/> | 126 <param name="rseed" type="integer" value="1234" min="0" max="1000000" label="Seed for IDEAS model initialization" help="Zero value generates a random seed, and this seed will be different for each job run."/> |
| 146 <param name="bychr" type="boolean" truevalue="true" falsevalue="" checked="False" label="Output chromosomes in separate files"/> | 127 <param name="bychr" type="boolean" truevalue="true" falsevalue="" checked="False" label="Output chromosomes in separate files"/> |
| 147 <param name="reads_per_bp" type="select" display="radio" label="Calculate the signal in each genomic window using"> | 128 <param name="reads_per_bp" type="select" display="radio" label="Calculate the signal in each window using"> |
| 148 <option value="6" selected="true">mean</option> | 129 <option value="6" selected="true">mean</option> |
| 149 <option value="8">max</option> | 130 <option value="8">max</option> |
| 150 </param> | 131 </param> |
| 151 <param name="hp" type="boolean" truevalue="true" falsevalue="" checked="False" label="Discourage state transition across chromosomes"/> | 132 <param name="hp" type="boolean" truevalue="true" falsevalue="" checked="False" label="Discourage state transition across chromosomes"/> |
| 152 <param name="log2" type="float" value="0" min="0" label="Use log2(x+number) transformation" help="Zero means no log2 transformation"/> | 133 <param name="log2" type="float" value="0" min="0" label="Use log2(x+number) transformation" help="Zero means no log2 transformation"/> |
| 186 <filter>perform_training_cond['perform_training'] == 'yes'</filter> | 167 <filter>perform_training_cond['perform_training'] == 'yes'</filter> |
| 187 </collection> | 168 </collection> |
| 188 </outputs> | 169 </outputs> |
| 189 <tests> | 170 <tests> |
| 190 <test> | 171 <test> |
| 191 <param name="perform_training" value="no"/> | 172 <param name="perform_training" value="yes"/> |
| 192 <param name="input" value="input.ideaspre" ftype="ideaspre" dbkey="hg19"/> | 173 <param name="training_iterations" value="3"/> |
| 174 <param name="input" value="ideas_test1/input.html" dbkey="hg19" ftype="ideaspre"> | |
| 175 <composite_data value='ideas_test1/chromosomes.bed' dbkey="hg19"/> | |
| 176 <composite_data value='ideas_test1/chromosome_windows.txt' dbkey="hg19"/> | |
| 177 <composite_data value='ideas_test1/IDEAS_input_config.txt' dbkey="hg19"/> | |
| 178 <composite_data value='ideas_test1/tmp.tar.gz' dbkey="hg19"/> | |
| 179 <param name="output_heatmaps" value="yes"/> | |
| 180 </param> | |
| 193 <param name="project_name" value="IDEAS_out"/> | 181 <param name="project_name" value="IDEAS_out"/> |
| 194 <param name="initial_states" value="2"/> | |
| 195 <param name="maxerr" value="1000"/> | |
| 196 <param name="output_heatmaps" value="no"/> | |
| 197 <param name="save_ideas_log" value="yes"/> | 182 <param name="save_ideas_log" value="yes"/> |
| 198 <output_collection name="output_txt_collection" type="list"> | 183 <output_collection name="output_training_collection" type="list"> |
| 199 <element name="IDEAS_out.chr1.cluster" file="IDEAS_out.cluster" ftype="txt"/> | 184 <element name="IDEAS_out.chr1.cluster" file="IDEAS_out.cluster" ftype="txt"/> |
| 200 <element name="IDEAS_out.chr1.para" file="IDEAS_out.para" ftype="txt"/> | 185 <element name="IDEAS_out.chr2.cluster" file="IDEAS_out.cluster" ftype="txt"/> |
| 201 <element name="IDEAS_out.chr1.profile" file="IDEAS_out.profile" ftype="txt"/> | 186 <element name="IDEAS_out.chr3.cluster" file="IDEAS_out.cluster" ftype="txt"/> |
| 202 <element name="IDEAS_out.chr1.state" file="IDEAS_out.state" ftype="txt"/> | 187 <element name="IDEAS_out.chr4.cluster" file="IDEAS_out.cluster" ftype="txt"/> |
| 188 <element name="IDEAS_out.chr5.cluster" file="IDEAS_out.cluster" ftype="txt"/> | |
| 189 <element name="IDEAS_out.chr1.state" file="IDEAS_out.chr1.state" ftype="txt"/> | |
| 190 <element name="IDEAS_out.chr2.state" file="IDEAS_out.chr2.state" ftype="txt"/> | |
| 191 <element name="IDEAS_out.chr3.state" file="IDEAS_out.chr3.state" ftype="txt"/> | |
| 192 <element name="IDEAS_out.chr4.state" file="IDEAS_out.chr4.state" ftype="txt"/> | |
| 193 <element name="IDEAS_out.chr5.state" file="IDEAS_out.chr5.state" ftype="txt"/> | |
| 194 <element name="IDEAS_out.para0" file="IDEAS_out.para0" ftype="txt"/> | |
| 195 <element name="IDEAS_out.profile0" file="IDEAS_out.profile0" ftype="txt"/> | |
| 196 </output_collection> | |
| 197 <output_collection name="output_pdf_collection" type="list"> | |
| 198 <element name="IDEAS_out.state.1.pdf" file="IDEAS_out.state.1.pdf" ftype="pdf"/> | |
| 203 </output_collection> | 199 </output_collection> |
| 204 <output name="output_log" file="output_log.txt" ftype="txt" compare="contains" /> | 200 <output name="output_log" file="output_log.txt" ftype="txt" compare="contains" /> |
| 205 </test> | 201 </test> |
| 206 </tests> | 202 </tests> |
| 207 <help> | 203 <help> |
| 210 IDEAS (an **I**\ ntegrative and **D**\ iscriminative **E**\ pigenome **A**\ nnotation **S**\ ystem) identifies | 206 IDEAS (an **I**\ ntegrative and **D**\ iscriminative **E**\ pigenome **A**\ nnotation **S**\ ystem) identifies |
| 211 de novo regulatory functions from epigenetic data in multiple cell types jointly. It is a full probabilistic | 207 de novo regulatory functions from epigenetic data in multiple cell types jointly. It is a full probabilistic |
| 212 model defined on all data, and it combines signals across both the genome and cell types to boost power. The | 208 model defined on all data, and it combines signals across both the genome and cell types to boost power. The |
| 213 underlying assumption of IDEAS is that, because all cell types share the same underlying DNA sequences, | 209 underlying assumption of IDEAS is that, because all cell types share the same underlying DNA sequences, |
| 214 **functions of each DNA segment should be correlated**. Also, cell type specific regulation is locus-dependent, | 210 **functions of each DNA segment should be correlated**. Also, cell type specific regulation is locus-dependent, |
| 215 and thus IDEAS uses the local epigenetic landscape to **identify de novo and local cell type clusters** without | 211 and so IDEAS uses the local epigenetic landscape to **identify de novo and local cell type clusters** without |
| 216 assuming or requiring a known global cell type relationship. | 212 assuming or requiring a known global cell type relationship. |
| 217 | 213 |
| 218 IDEAS takes as input a list of epigenetic data sets (histones, chromatin accessibility, CpG methylation, TFs, etc.) | 214 The input is a single dataset with the **IdeasPre** datatype, which is produced by the IDEAS Preprocessor tool. |
| 219 or any other whole-genome data sets (e.g., scores). Currently the supported data formats include BigWig and BAM. | |
| 220 All data sets will first be mapped by IDEAS to a common genomic coordinate in a selected assembly (200bp windows | |
| 221 by default, or user-provided). The user can specify regions to be considered or removed from the analysis. The | |
| 222 input data may come from one cell type/condition/individual/time point (although it does not fully utilize the | |
| 223 advantage of IDEAS), or from multiple cell types/conditions/individuals/time points. The same set of epigenetic | |
| 224 features may not be present in all cell types, for which IDEAS will do imputation of the missing tracks if | |
| 225 specified. | |
| 226 | 215 |
| 227 .. image:: $PATH_TO_IMAGES/ideas.png | 216 .. image:: $PATH_TO_IMAGES/ideas.png |
| 228 | 217 |
| 229 IDEAS predicts regulatory functions, denoted by epigenetic states, at each position in each cell type by | 218 IDEAS predicts regulatory functions, denoted by epigenetic states, at each position in each cell type by |
| 230 **combining information simultaneously learned from other cell types** at the same positions in cell types with | 219 **combining information simultaneously learned from other cell types** at the same positions in cell types with |
| 256 * **Epigenetic factor name** - epigenetic factor name | 245 * **Epigenetic factor name** - epigenetic factor name |
| 257 * **BAM or BigWig file** - BAM or BigWig file | 246 * **BAM or BigWig file** - BAM or BigWig file |
| 258 | 247 |
| 259 * **Project name** - datasets produced by IDEAS will have this base name. | 248 * **Project name** - datasets produced by IDEAS will have this base name. |
| 260 * **Seed for IDEAS model initialization** - enter an integer to be used as the seed for the IDEAS model initialization. A zero value causes IDEAS to automatically generate a random seed, and this seed will be different for each job run. | 249 * **Seed for IDEAS model initialization** - enter an integer to be used as the seed for the IDEAS model initialization. A zero value causes IDEAS to automatically generate a random seed, and this seed will be different for each job run. |
| 261 * **Select Bed file that defines genomic windows on which to process the data** - if "No" is selected, IDEAS will run whole genome segmentation. If "Yes" is selected, IDEAS will segment genomes in the unit of the windows defined by the bed file. This file can be in BED3, BED4 or BED5 format, but only the first three columns (chr posst posed) will be used. | |
| 262 | |
| 263 * **Window size in base pairs** - Window size in base pairs (if "No" is selected) | |
| 264 * **Restrict processing to specified chromosomes** - If "Yes" is selected, processing will be restricted to specified chromosomes | |
| 265 | |
| 266 * **Chromosomes** - processing will be restricted to specified chromosomes (if "Yes" is selected) | |
| 267 | |
| 268 * **Bed file specifying the genomic windows** - bed file specifying the genomic windows (if "Yes" is selected) | |
| 269 | |
| 270 * **Output chromosomes in separate files** - select "Yes" to produce separate files for each chromosome, allowing you to run IDEAS on different chromosomes separately. | 250 * **Output chromosomes in separate files** - select "Yes" to produce separate files for each chromosome, allowing you to run IDEAS on different chromosomes separately. |
| 271 * **Calculate the signal in each genomic window using** - use the bigWigAverageOverBed utility from the UCSC genome browser to calculate the signal (i.e., the number of reads per bp) in each genomic window. | 251 * **Calculate the signal in each window using** - use the bigWigAverageOverBed utility from the UCSC genome browser to calculate the signal (i.e., the number of reads per bp) in each window. |
| 272 * **Select file(s) containing regions to exclude** - select one or more bed files that contain regions you'd like excluded from your datasets. | |
| 273 * **Standardize all datasets** - select "Yes" to standardize all datasets (e.g., reads / total_reads * 20 million) so that the signals from different cell types become comparable - your datasets can be read counts, logp-values or fold change. | 252 * **Standardize all datasets** - select "Yes" to standardize all datasets (e.g., reads / total_reads * 20 million) so that the signals from different cell types become comparable - your datasets can be read counts, logp-values or fold change. |
| 274 * **Discourage state transition across chromosomes** - select "Yes" to produce similar states in adjacent windows, making the annotation smoother, but at risk of reducing precision. | 253 * **Discourage state transition across chromosomes** - select "Yes" to produce similar states in adjacent windows, making the annotation smoother, but at risk of reducing precision. |
| 275 * **Use log2(x+number) transformation** - perform Log2-transformation of the input data by log2(x+number) (recommended for read count data to reduce skewness). You can enter a number that is representative of the noise level in your data (e.g., a number less than 1). If this number is at a similar scale or larger than the signal in your data, it will lose power. For example, if your input data is mean read count per window, using 0.1 may produce better results. | 254 * **Use log2(x+number) transformation** - perform Log2-transformation of the input data by log2(x+number) (recommended for read count data to reduce skewness). You can enter a number that is representative of the noise level in your data (e.g., a number less than 1). If this number is at a similar scale or larger than the signal in your data, it will lose power. For example, if your input data is mean read count per window, using 0.1 may produce better results. |
| 276 * **Maximum number of states to be inferred** - restrict the maximum number of states to be generated by IDEAS; the final number of inferred states may be smaller than the number you specified. | 255 * **Maximum number of states to be inferred** - restrict the maximum number of states to be generated by IDEAS; the final number of inferred states may be smaller than the number you specified. |
| 277 * **Initial number of states** - while IDEAS may infer 30 states or more by starting from just 20 states, it may not do so if it is trapped in a local mode. We recommend setting the initial number of states slightly larger than the number of states you expect. | 256 * **Initial number of states** - while IDEAS may infer 30 states or more by starting from just 20 states, it may not do so if it is trapped in a local mode. We recommend setting the initial number of states slightly larger than the number of states you expect. |
