view ideas.xml @ 78:949de45a7269 draft

Uploaded
author greg
date Thu, 24 Aug 2017 09:10:01 -0400
parents 670d811d3244
children 20d4dc694105
line wrap: on
line source

<tool id="ideas" name="IDEAS" version="1.2.0">
    <description>accounts for position dependent epigenetic events and detects local cell type relationships</description>
    <!-- Packages this tool declares as dependencies. The command section below
         invokes the prepMat and ideas executables; the help text states that
         bigWigAverageOverBed is used to average the signal per genomic window.
         NOTE(review): r-optparse is presumably needed only by the currently
         commented-out build_matrix.R step - confirm before removing. -->
    <requirements>
        <requirement type="package" version="2.26.0">bedtools</requirement>
        <requirement type="package" version="332">ucsc-bedgraphtobigwig</requirement>
        <requirement type="package" version="332">ucsc-bedsort</requirement>
        <requirement type="package" version="332">ucsc-bigwigaverageoverbed</requirement>
        <requirement type="package" version="1.2.0">ideas</requirement>
        <requirement type="package" version="1.3.2">r-optparse</requirement>
    </requirements>
    <command detect_errors="exit_code"><![CDATA[
##############################################
## Pipeline overview
##   1. prepMat    - reads the generated config file and prepares the
##                   per-input data under $tmp_dir
##   2. cut/ls/paste - rebuild a config file that pairs the first two
##                   columns of the prepMat config with the prepared
##                   $tmp_dir/*.bed.gz files
##   3. ideas      - runs on the rebuilt config
##   4. mv         - moves the ideas result files onto the Galaxy outputs
##
## NOTE(review): only the "datasets" input type emits the prepMat call.
## With "data_matrix" selected the options below ($bychr, -c, ...) are
## still emitted without a preceding executable, so that mode does not
## produce a runnable command yet.
##############################################
#set tmp_dir = "tmp"
#set prep_input_config = "prep_input_config.txt"
#set prep_output_config = "prep_output_config.txt"
##############################################
## Create the config file and prepare the data
##############################################
#set input_type = $input_type_cond.input_type
#if str($input_type) == "datasets":
    #set specify_genomic_window_cond = $input_type_cond.specify_genomic_window_cond
    #set specify_genomic_window = $specify_genomic_window_cond.specify_genomic_window
    cp '$gen_prep_input_config' $prep_input_config &&
    prepMat
    $prep_input_config
    #if str($specify_genomic_window) == "yes":
        -bed '$specify_genomic_window_cond.bed_input'
    #else:
        -gsz '$chromInfo'
        -wsz $specify_genomic_window_cond.window_size
        #set restrict_chromosomes = $specify_genomic_window_cond.restrict_chromosomes_cond.restrict_chromosomes
        #if str($restrict_chromosomes) == "yes":
            ## Collect the chromosome names from every repeat entry.  #silent
            ## keeps the return value of append() out of the command line and
            ## the join is evaluated by #set rather than emitted as literal text.
            #set chroms = []
            #set chrom_repeat = $specify_genomic_window_cond.restrict_chromosomes_cond.chrom_repeat
            #for $chrom_item in $chrom_repeat:
                #silent $chroms.append(str($chrom_item.chrom).strip())
            #end for
            #set chroms_arg = ",".join($chroms)
            -chr '$chroms_arg'
        #end if
    #end if
#end if
$bychr
-c $reads_per_bp
#if str($blacklist_input) not in ["None", ""]:
    -exclude '$blacklist_input'
#end if
$norm
##############################################
## Coerce the prepMat config output to the
## format expected by the R matrix builder.
##############################################
&& cut -d' ' -f1,2 $prep_input_config > file1.txt
&& ls $tmp_dir/*.bed.gz > file2.txt
&& paste file1.txt file2.txt > $prep_output_config
##############################################
## Build the R matrix from the prepMat output
##############################################
##&& Rscript '$__tool_directory__/build_matrix.R'
##-i $tmp_dir/*.bed.gz
##-o $ideas_matrix_input_file
##-w $ideas_input_dir
##############################################
## Run IDEAS
##############################################
&& ideas
'$prep_output_config'
#if str($input_type) == "datasets":
    #set specify_genomic_window_cond = $input_type_cond.specify_genomic_window_cond
    #set specify_genomic_window = $specify_genomic_window_cond.specify_genomic_window
    #if str($specify_genomic_window) == "yes":
        '$specify_genomic_window_cond.bed_input'
    #else:
        $tmp_dir/*.bed
    #end if
#else:
    $tmp_dir/*.bed
#end if
$hp
## A zero value means "do not pass the option".  Float parameters are compared
## numerically so that "0" and "0.0" are treated the same way.
#if float(str($log2)) != 0.0:
    -log2 $log2
#end if
#if float(str($max_states)) != 0.0:
    -G $max_states
#end if
#if str($initial_states) != "0":
    -C $initial_states
#end if
#if str($max_position_classes) != "0":
    -P $max_position_classes
#end if
#if str($max_cell_type_clusters) != "0":
    -K $max_cell_type_clusters
#end if
#if float(str($prior_concentration)) != 0.0:
    -A $prior_concentration
#end if
-sample $burnin_num $mcmc_num
#if float(str($minerr)) != 0.0:
    -minerr $minerr
#end if
#if float(str($maxerr)) != 0.0:
    -maxerr $maxerr
#end if
-thread \${GALAXY_SLOTS:-4}
> '$output_log'
&& mv ./*.cluster '$output_cluster'
&& mv ./*.para '$output_para'
&& mv ./*.profile '$output_profile'
&& mv ./*.state '$output_state'
    ]]></command>
    <configfiles>
        <!-- gen_prep_input_config: one line per selected input, written as
             "<cell type name> <epigenetic factor name> <dataset path>".
             The command section copies this file and later takes its first
             two space-separated columns with cut, so the emitted lines are
             kept at column 0 (template indentation on a text line would be
             copied into the file and shift the columns).
             In "extract" mode the names are taken from the dataset name
             shown to the user, split on '.' and then on '-'. -->
        <configfile name="gen_prep_input_config"><![CDATA[#if str($input_type_cond.input_type) == "datasets":
    #set $cell_type_epigenetic_factor_cond = $input_type_cond.cell_type_epigenetic_factor_cond
    #if str($cell_type_epigenetic_factor_cond.cell_type_epigenetic_factor) == "manual":
        #for $input_items in $cell_type_epigenetic_factor_cond.input_repeat:
${input_items.cell_type_name} ${input_items.epigenetic_factor_name} ${input_items.input}
        #end for
    #else if str($cell_type_epigenetic_factor_cond.cell_type_epigenetic_factor) == "extract":
        #set $input_name_positions = $cell_type_epigenetic_factor_cond.input_name_positions
        #for $i in $cell_type_epigenetic_factor_cond.input:
            #set $file_name_with_ext = str($i.name)
            #set $file_name = $file_name_with_ext.split(".")[0]
            #assert $file_name.find("-") >= 0, "The selected input '%s' is invalid because its name does not include the '-' character that must separate the cell type and epigenetic factor names." % $file_name_with_ext
            #set $name_parts = $file_name.split("-")
            #if str($input_name_positions) == "cell_first":
                #set $cell_type_name = $name_parts[0]
                #set $epigenetic_factor_name = $name_parts[1]
            #else:
                #set $cell_type_name = $name_parts[1]
                #set $epigenetic_factor_name = $name_parts[0]
            #end if
${cell_type_name} ${epigenetic_factor_name} ${i}
        #end for
    #end if
#end if]]></configfile>
    </configfiles>
    <inputs>
        <!-- Input selection. Only the "datasets" branch defines parameters;
             the "data_matrix" branch is currently empty. -->
        <conditional name="input_type_cond">
            <param name="input_type" type="select" label="Select input type">
                <option value="datasets" selected="true">Bam, BigWig files</option>
                <option value="data_matrix">Data matrix</option>
            </param>
            <when value="datasets">
                <!-- How the cell type / epigenetic factor names written to the
                     prepMat config file are obtained. -->
                <conditional name="cell_type_epigenetic_factor_cond">
                    <param name="cell_type_epigenetic_factor" type="select" label="Set cell type and epigenetic factor names by">
                        <option value="extract" selected="true">extracting them from the selected input file names</option>
                        <option value="manual">manually setting them for each selected input</option>
                    </param>
                    <when value="extract">
                        <param name="input" type="data" format="bigwig,bam" multiple="True" label="BAM or BigWig file">
                            <validator type="empty_field"/>
                            <validator type="unspecified_build"/>
                        </param>
                        <param name="input_name_positions" type="select" display="radio" label="Selected input file name pattern is" help="A '-' character must separate cell type and epigenetic factor names within the selected input file names">
                            <option value="cell_first" selected="true">Cell type name - Epigenetic factor name</option>
                            <option value="cell_last">Epigenetic factor name - Cell type name</option>
                        </param>
                    </when>
                    <when value="manual">
                        <repeat name="input_repeat" title="Cell type, Epigenetic factor and Input" min="1">
                            <param name="cell_type_name" type="text" value="" label="Cell type name">
                                <validator type="empty_field"/>
                            </param>
                            <param name="epigenetic_factor_name" type="text" value="" label="Epigenetic factor name">
                                <validator type="empty_field"/>
                            </param>
                            <param name="input" type="data" format="bigwig,bam" label="BAM or BigWig file">
                                <validator type="empty_field"/>
                                <validator type="unspecified_build"/>
                            </param>
                        </repeat>
                    </when>
                </conditional>
                <!-- Genomic windows: either a user supplied Bed file ("yes") or
                     fixed-size windows, optionally restricted to named
                     chromosomes ("no"). -->
                <conditional name="specify_genomic_window_cond">
                    <param name="specify_genomic_window" type="select" label="Select Bed file that defines genomic windows on which to process the data">
                        <option value="no" selected="true">No</option>
                        <option value="yes">Yes</option>
                    </param>
                    <when value="no">
                        <param name="window_size" type="integer" value="200" label="Window size in base pairs"/>
                        <conditional name="restrict_chromosomes_cond">
                            <param name="restrict_chromosomes" type="select" label="Restrict processing to specified chromosomes">
                                <option value="no" selected="true">No</option>
                                <option value="yes">Yes</option>
                            </param>
                            <when value="no"/>
                            <when value="yes">
                                <repeat name="chrom_repeat" title="Chromosomes" min="1">
                                    <!-- An empty name would put an empty entry in
                                         the -chr list built by the command. -->
                                    <param name="chrom" type="text" value="" label="Chromosome">
                                        <validator type="empty_field"/>
                                    </param>
                                </repeat>
                            </when>
                        </conditional>
                    </when>
                    <when value="yes">
                        <param name="bed_input" type="data" format="bed" label="Bed file specifying the genomic windows"/>
                    </when>
                </conditional>
            </when>
            <when value="data_matrix"/>
        </conditional>
        <!-- Options emitted by the command section after the prepMat inputs:
             bychr, reads_per_bp (-c), blacklist_input (-exclude) and norm. -->
        <param argument="-bychr" type="boolean" truevalue="-bychr" falsevalue="" checked="False" label="Output chromosomes in separate files"/>
        <param name="reads_per_bp" type="select" display="radio" label="Calculate the average signal in each genomic window using">
            <option value="6" selected="true">mean</option>
            <option value="8">max</option>
        </param>
        <param name="blacklist_input" type="data" format="bed" optional="True" multiple="True" label="Select file(s) containing regions to exclude"/>
        <param argument="-norm" type="boolean" truevalue="-norm" falsevalue="" checked="False" label="Standardize all datasets"/>
        <!-- Options emitted on the ideas command line. For the numeric
             parameters below a value of zero makes the command section omit
             the corresponding option.
             NOTE(review): max_states is declared as a float although its label
             describes a count; the command section tests it against 0.0, so
             the type is left unchanged here - confirm before switching it to
             integer. -->
        <param argument="-hp" type="boolean" truevalue="-hp" falsevalue="" checked="False" label="Discourage state transition across chromosomes"/>
        <param name="log2" type="float" value="0" min="0" label="Use log2(x+number) transformation" help="Zero value has no effect"/>
        <param name="max_states" type="float" value="0" min="0" label="Maximum number of states to be inferred" help="Zero value has no effect"/>
        <param name="initial_states" type="integer" value="20" min="0" label="Initial number of states" help="Zero value has no effect"/>
        <param name="max_position_classes" type="integer" value="0" min="0" label="Maximum number of position classes to be inferred" help="Zero value has no effect"/>
        <param name="max_cell_type_clusters" type="integer" value="0" min="0" label="Maximum number of cell type clusters allowed" help="Zero value has no effect"/>
        <param name="prior_concentration" type="float" value="1" min="0" label="Prior concentration" help="Zero value results in the default value: sqrt(number of cell types)"/>
        <param name="burnin_num" type="integer" value="20" min="1" label="Number of burnin steps"/>
        <param name="mcmc_num" type="integer" value="20" min="1" label="Number of maximization steps"/>
        <param name="minerr" type="float" value="0.5" min="0" label="Minimum standard deviation for the emission Gaussian distribution" help="Zero value results in the default value: 0.5"/>
        <param name="maxerr" type="float" value="1000000" min="0" label="Maximum standard deviation for the emission Gaussian distribution" help="Zero value results in the default value: 1000000"/>
    </inputs>
    <outputs>
        <!-- output_log receives the standard output of ideas; the remaining
             datasets are filled by moving the *.cluster, *.para, *.profile
             and *.state files at the end of the command section. -->
        <data name="output_log"
              label="${tool.name} (ideas output log) on ${on_string}"
              format="txt"/>
        <data name="output_cluster"
              label="${tool.name} (local cell type clustering) on ${on_string}"
              format="txt"/>
        <data name="output_para"
              label="${tool.name} (epigenetic state frequency, mean and variance parameters) on ${on_string}"
              format="tabular"/>
        <data name="output_profile"
              label="${tool.name} (profile) on ${on_string}"
              format="txt"/>
        <data name="output_state"
              label="${tool.name} (epigenetic states and position classes) on ${on_string}"
              format="txt"/>
    </outputs>
    <tests>
        <!-- TODO: no functional tests are defined for this tool yet. -->
    </tests>
    <help>
**What it does**

Employs the IDEAS (Integrative and Discriminative Epigenome Annotation System) method for jointly and quantitatively characterizing
multivariate epigenetic landscapes in many cell types, tissues or conditions. The method accounts for position dependent epigenetic
events and detects local cell type relationships, which not only help to improve the accuracy of annotating functional classes of DNA
sequences, but also reveal cell type constitutive and specific loci. The method utilizes Bayesian non-parametric techniques to automatically
identify the best model size fitting to the data so users do not have to specify the number of states. On the other hand, users can
still specify the number of states if desired.

-----

**Required options**

 * **Cell type, Epigenetic factor and Input** - specify any number of inputs with currently supported formats, either bam or bigwig.  The cell name + factor name must be unique for each input.  For example, if you have replicate data you may want to specify the cell name as "cell_rep1", "cell_rep2", etc and the factor name as "factor_rep1", "factor_rep2", etc.
 
    * **Cell type name** - cell type name
    * **Epigenetic factor name** - epigenetic factor name
    * **BAM or BigWig file** - BAM or BigWig file

 * **Select Bed file that defines genomic windows on which to process the data** - if "No" is selected, IDEAS will run whole genome segmentation.  If "Yes" is selected, IDEAS will segment genomes in the unit of the windows defined by the bed file.  This file can be in BED3, BED4 or BED5 format, but only the first three columns (chromosome, start position, end position) will be used.

    * **Window size in base pairs** - Window size in base pairs (if "No" is selected)
    * **Restrict processing to specified chromosomes** - If "Yes" is selected, processing will be restricted to specified chromosomes

       * **Chromosomes** - processing will be restricted to specified chromosomes (if "Yes" is selected)

         * **Chromosome** - specified chromosome

    * **Bed file specifying the genomic windows** - bed file specifying the genomic windows (if "Yes" is selected)

**Other options**

* **Output chromosomes in separate files** - select "Yes" to produce separate files for each chromosome, allowing you to run IDEAS on different chromosomes separately.
* **Calculate the average signal in each genomic window using** - use the bigWigAverageOverBed utility from the UCSC genome browser to calculate average signal (number of reads per bp) in each genomic window.
* **Select file(s) containing regions to exclude** - select one or more bed files that contain regions you'd like excluded from your datasets.
* **Standardize all datasets** - select "Yes" to standardize all datasets (e.g., reads / total_reads * 20 million) so that the signals from different cell types become comparable - your datasets can be read counts, logp-values or fold change.

* **Discourage state transition across chromosomes** - select "Yes" to produce similar states in adjacent windows, making the annotation smoother, but at risk of reducing precision.
* **Use log2(x+number) transformation** - perform Log2-transformation of the input data by log2(x+number) (recommended for read count data to reduce skewness). You can enter a number less than 1. For example, if your input data is mean read count per window, using 0.1 may produce better results.
* **Maximum number of states to be inferred** - restrict the maximum number of states to be generated by IDEAS; the final number of inferred states may be smaller than the number you specified
* **Initial number of states** - while IDEAS may infer 30 states or more by starting from just 20 states, it may not do so if it is trapped in a local mode. We recommend setting the initial number of states slightly larger than the number of states you expect.
* **Maximum number of position classes to be inferred** - Set this value only if:

   * you do not want position classes (e.g., for testing purposes), in this case set the value to 1
   * IDEAS runs slowly because there are too many position classes; generally, fewer than 100 position classes will run fine

* **Maximum number of cell type clusters allowed** - Set this value only for testing.  If you set the value to 1, then all cell types will be clustered in one group.
* **Prior concentration** - specify the prior concentration parameter; default is A=sqrt(number of cell types).  A smaller concentration parameter (e.g., 1 or less) will emphasize more on position specificity and a larger concentration parameter (e.g., 10 * number of cell types) will emphasize more on global homogeneity.
* **Set the number of burnin and maximization steps** - specify the number of burnin and maximization steps; the IDEAS default is 50 50 (this tool's form defaults to 20 and 20).  Increasing these two numbers will increase computing and only slightly increase accuracy.  Decreasing these two numbers will reduce computing but may also reduce accuracy.  We recommend running IDEAS with at least 20 burnins and 20 maximizations.  IDEAS will not stop even if it reaches a maximum mode.
* **Minimum standard deviation for the emission Gaussian distribution** - you should change the default value of 0.5 if the standard deviation of your data is much smaller or much larger than 1. The first line of the output produced by IDEAS is **ysd=xxx**, which is the total standard deviation of your data. If that value is less than 0.5, you may set the minimum standard deviation to an even smaller number (e.g., xxx/2). If the standard deviation of your data is much greater than 1, (e.g., 20), you may set the minimum standard deviation to a larger value, (e.g., 5). Modifying the minimum standard deviation in the former case is more necessary than in the latter case because otherwise you may end up finding no interesting segmentations. We do not recommend setting the minimum standard deviation to be 0 or smaller, as doing so may capture some artificial and uninteresting states due to tightly clustered data, such as 0 in read counts.
* **Maximum standard deviation for the emission Gaussian distribution** - if you want to find fine-grained states you may use this option (if not used, IDEAS uses infinity), but it is rarely used unless you need more states to be inferred.

    </help>
    <!-- Publication to cite when using this tool, identified by its DOI. -->
    <citations>
        <citation type="doi">10.1093/nar/gkw278</citation>
    </citations>
</tool>