Mercurial > repos > greg > ideas_preprocessor

<tool id="ideas_preprocessor" name="IDEAS Preprocessor" version="1.0.0">
    <description></description>
    <requirements>
        <requirement type="package" version="2.5.4">deeptools</requirement>
        <requirement type="package" version="1.10.4">r-data.table</requirement>
        <requirement type="package" version="1.4.4">r-optparse</requirement>
        <requirement type="package" version="1.6">samtools</requirement>
        <requirement type="package" version="357">ucsc-bigwigaverageoverbed</requirement>
    </requirements>
    <!-- Command template (Cheetah + shell). Creates the working directories, stages the generated
         config files under fixed names in the job working directory, runs ideas_preprocessor.R with
         options derived from the tool inputs, and copies the captured log into the output dataset
         when the script exits with a non-zero status. -->
    <command detect_errors="exit_code"><![CDATA[
#set chromosome_windows = "chromosome_windows.txt"
#set ideaspre_input_config = "ideaspre_input_config.txt"
#set specify_chrom_windows = $specify_chrom_windows_cond.specify_chrom_windows
#set tmp_dir = "tmp"
mkdir $tmp_dir &&
mkdir '$output.files_path' &&
#if str($specify_chrom_windows) == "yes":
    ##############################################
    ## Using a bed file for defining chrom windows,
    ## so categorize the window positions by
    ## chromosome to enable the IDEAS -inv option.
    ##############################################
    cp '$gen_chromosome_windows' $chromosome_windows &&
#end if
##############################################
## Create the config file and prepare the data
##############################################
cp '$gen_ideaspre_input_config' $ideaspre_input_config &&
sort $ideaspre_input_config -o $ideaspre_input_config &&
Rscript '$__tool_directory__/ideas_preprocessor.R'
--ideaspre_input_config '$ideaspre_input_config'
#if str($specify_chrom_windows) == "yes":
    --chromosome_windows '$chromosome_windows'
    --chrom_bed_input '$specify_chrom_windows_cond.chrom_bed_input'
#else:
    --chrom_len_file '$chromInfo'
    --window_size $specify_chrom_windows_cond.window_size
    #set restrict_chromosomes = $specify_chrom_windows_cond.restrict_chromosomes_cond.restrict_chromosomes
    #if str($restrict_chromosomes) == "yes":
        ##############################################
        ## Collect the chromosome entered in each
        ## repeat element.  #silent keeps the return
        ## value of append() out of the command line,
        ## and blank fields are dropped so the joined
        ## list never contains empty entries.
        ##############################################
        #set chroms = []
        #for $chrom_item in $specify_chrom_windows_cond.restrict_chromosomes_cond.chrom_repeat:
            #set chrom_name = str($chrom_item.chrom).strip()
            #if len($chrom_name) > 0:
                #silent $chroms.append($chrom_name)
            #end if
        #end for
        #if len($chroms) > 0:
            #set chroms_str = ",".join($chroms)
            --restrict_to_chroms '$chroms_str'
        #end if
    #end if
#end if
#if str($bychr) == "true":
    --bychr true
#end if
--reads_per_bp $reads_per_bp
#if str($exclude_bed_input) not in ["None", ""]:
    --exclude_bed_input '$exclude_bed_input'
#end if
#if str($standardize_datasets) == "true":
    --standardize_datasets true
#end if
--output '$output'
--output_files_path '$output.files_path'
&> ideas_preprocessor_log.txt;
##############################################
## On failure, surface the captured log as the
## content of the output dataset and fail the job.
##############################################
if [[ $? -ne 0 ]]; then
    cp ideas_preprocessor_log.txt '$output';
   exit 1;
fi
    ]]></command>
    <configfiles>
        <!-- Generates one line per selected input with five space-separated fields:
             cell type name, epigenetic factor name, dataset path, base file name, dataset extension.
             In "extract" mode the two names are parsed from the part of the dataset name before the
             first "." by splitting on "-"; in "manual" mode they come from the repeat fields.
             The base file name is derived from the dataset name the same way in both modes. -->
        <configfile name="gen_ideaspre_input_config"><![CDATA[#if str($cell_type_epigenetic_factor_cond.cell_type_epigenetic_factor) == "extract":
    #set input_name_positions = $cell_type_epigenetic_factor_cond.input_name_positions
    #for $i in $cell_type_epigenetic_factor_cond.input:
        #set file_name_with_ext = $i.name
        #if str($file_name_with_ext).find("http") >= 0 or str($file_name_with_ext).find("ftp") >= 0:
             #set file_name_with_ext = $file_name_with_ext.split('/')[-1]
        #end if
        #set file_name = $file_name_with_ext.split(".")[0]
        #assert str($file_name).find("-") >= 0, "The selected input '%s' is invalid because it does not include the '-' character which is required when setting cell type and epigenetic factor names by extracting them from the input file names." % $file_name_with_ext
        #if str($input_name_positions) == "cell_first":
            #set cell_type_name = $file_name.split("-")[0]
            #set epigenetic_factor_name = $file_name.split("-")[1]
        #else:
            #set cell_type_name = $file_name.split("-")[1]
            #set epigenetic_factor_name = $file_name.split("-")[0]
        #end if
${cell_type_name} ${epigenetic_factor_name} ${i} ${file_name} ${i.ext}
    #end for
#else:
    #for $input_items in $cell_type_epigenetic_factor_cond.input_repeat:
        #set file_name_with_ext = $input_items.input.name
        #if str($file_name_with_ext).find("http") >= 0 or str($file_name_with_ext).find("ftp") >= 0:
             #set file_name_with_ext = $file_name_with_ext.split('/')[-1]
        #end if
        #set file_name = $file_name_with_ext.split(".")[0]
${input_items.cell_type_name} ${input_items.epigenetic_factor_name} ${input_items.input} ${file_name} ${input_items.input.ext}
    #end for
#end if]]></configfile>
        <!-- Generates the per-chromosome window index file used when windows come from a bed file.
             The selected bed file is read line by line; blank lines and lines starting with "#" are
             skipped.  Lines are grouped by their first tab-separated column.  The first line seen for
             a chromosome records [count, count+1], where count is the zero-based line index in the
             file; each further line for that chromosome increments the second value.  One line
             "chrom start end" is written per chromosome, in first-seen order.  Produces no content
             when windows are not defined from a bed file.
             NOTE(review): count comes from enumerate() over every line, so skipped comment or blank
             lines still advance the index; confirm this matches how the R script numbers windows.
             NOTE(review): the start/end pair only describes a contiguous range if all lines for a
             chromosome are adjacent in the bed file; confirm inputs are grouped by chromosome.
             NOTE(review): the file handle opened here is never explicitly closed. -->
        <configfile name="gen_chromosome_windows"><![CDATA[#if str($specify_chrom_windows_cond.specify_chrom_windows) == "yes":
    #import collections
    #set window_positions_by_chroms_odict = $collections.OrderedDict()
    #for count, line in enumerate(open($specify_chrom_windows_cond.chrom_bed_input.file_name, 'r')):
        #set $line = $line.strip()
        #if not $line or $line.startswith('#'):
            #continue
        #end if
        #set items = $line.split('\t')
        #if $items[0] in $window_positions_by_chroms_odict:
            #set tup = $window_positions_by_chroms_odict[$items[0]]
            #set $tup[1] += 1
            #set $window_positions_by_chroms_odict[$items[0]] = $tup
        #else:
            #set $window_positions_by_chroms_odict[$items[0]] = [$count, $count+1]
        #end if
    #end for
    #for $chrom, $tup in $window_positions_by_chroms_odict.items():
${chrom} ${tup[0]} ${tup[1]}
    #end for
#end if]]></configfile>
    </configfiles>
    <!-- Tool form.  Parameter names and option values below are referenced by the command and
         configfile templates, so they must stay in sync with those templates. -->
    <inputs>
        <!-- How cell type and epigenetic factor names are assigned to each input:
             "extract" parses them from the dataset names, "manual" takes them from text fields. -->
        <conditional name="cell_type_epigenetic_factor_cond">
            <param name="cell_type_epigenetic_factor" type="select" label="Set cell type and epigenetic factor names by">
                <option value="extract" selected="true">extracting them from the selected input file names</option>
                <option value="manual">manually setting them for each selected input</option>
            </param>
            <when value="extract">
                <param name="input" type="data" format="bigwig,bam" multiple="True" label="BAM or BigWig files">
                    <validator type="empty_field"/>
                    <validator type="unspecified_build"/>
                </param>
                <!-- Order of the two names on either side of the "-" in each dataset name. -->
                <param name="input_name_positions" type="select" display="radio" label="Selected input file name pattern is" help="A '-' character must separate cell type and epigenetic factor names within the selected input file names">
                    <option value="cell_first" selected="true">Cell type name - Epigenetic factor name</option>
                    <option value="cell_last">Epigenetic factor name - Cell type name</option>
                </param>
            </when>
            <when value="manual">
                <!-- One repeat element per input dataset; at least one is required. -->
                <repeat name="input_repeat" title="Cell type, epigenetic factor and input" min="1">
                    <param name="cell_type_name" type="text" value="" label="Cell type name">
                        <validator type="empty_field"/>
                    </param>
                    <param name="epigenetic_factor_name" type="text" value="" label="Epigenetic factor name">
                        <validator type="empty_field"/>
                    </param>
                    <param name="input" type="data" format="bigwig,bam" label="BAM or BigWig file">
                        <validator type="empty_field"/>
                        <validator type="unspecified_build"/>
                    </param>
                </repeat>
            </when>
        </conditional>
        <!-- Window definition: "no" uses fixed-size windows (optionally restricted to listed
             chromosomes), "yes" takes window positions from a bed file. -->
        <conditional name="specify_chrom_windows_cond">
            <param name="specify_chrom_windows" type="select" label="Define chromosome window positions from a bed file?">
                <option value="no" selected="true">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no">
                <param name="window_size" type="integer" value="200" label="Window size in base pairs"/>
                <conditional name="restrict_chromosomes_cond">
                    <param name="restrict_chromosomes" type="select" label="Restrict processing to specified chromosomes">
                        <option value="no" selected="true">No</option>
                        <option value="yes">Yes</option>
                    </param>
                    <when value="no"/>
                    <when value="yes">
                        <!-- One chromosome name per repeat element; at least one is required. -->
                        <repeat name="chrom_repeat" title="Chromosomes" min="1">
                            <param name="chrom" type="text" value="" label="Chromosome" help="One chromosome (e.g., chr1, chr2, chrX) per text field"/>
                        </repeat>
                    </when>
                </conditional>
            </when>
            <when value="yes">
                <param name="chrom_bed_input" type="data" format="bed" label="Select bed file for defining chromosome window positions">
                    <validator type="empty_dataset"/>
                </param>
            </when>
        </conditional>
        <!-- Boolean flags render as "true" when checked and as an empty string otherwise; the
             command template only adds the corresponding option when the value is "true". -->
        <param name="bychr" type="boolean" truevalue="true" falsevalue="" checked="False" label="Output chromosomes in separate files"/>
        <!-- The selected value (6 or 8) is passed to the script's reads_per_bp option unchanged.
             NOTE(review): presumably these are bigWigAverageOverBed output column numbers; confirm
             against ideas_preprocessor.R. -->
        <param name="reads_per_bp" type="select" display="radio" label="Calculate the signal in each window using">
            <option value="6" selected="true">mean</option>
            <option value="8">max</option>
        </param>
        <!-- Optional; the command template omits the exclude option when nothing is selected. -->
        <param name="exclude_bed_input" type="data" format="bed" optional="True" multiple="True" label="Select bed file(s) containing regions to exclude"/>
        <param name="standardize_datasets" type="boolean" truevalue="true" falsevalue="" checked="False" label="Standardize all datasets"/>
    </inputs>
    <!-- Single output dataset of the ideaspre datatype.  The command template passes both the
         dataset path and its files_path directory to the script. -->
    <outputs>
        <data name="output" format="ideaspre"/>
    </outputs>
    <!-- One functional test: a single bigwig input with the default "extract" naming mode and
         window positions taken from a bed file.  The manual naming mode, fixed-size windows and
         chromosome restriction are not exercised here. -->
    <tests>
        <test>
            <param name="input" value="e001-h3k4me3.bigwig" ftype="bigwig" dbkey="hg19"/>
            <param name="specify_chrom_windows" value="yes"/>
            <param name="chrom_bed_input" value="chromosomes.bed" ftype="bed" dbkey="hg19"/>
            <output name="output" file="output.html" ftype="ideaspre">
                <extra_files type="file" name="chromosomes.bed" value="chromosomes.bed"/>
                <extra_files type="file" name="chromosome_windows.txt" value="chromosome_windows.txt"/>
                <extra_files type="file" name="IDEAS_input_config.txt" value="IDEAS_input_config.txt"/>
                <!-- The archive is compared by approximate size rather than by content. -->
                <extra_files type="file" name="tmp.tar" value="tmp.tar" compare="sim_size"/>
            </output>
        </test>
    </tests>
    <help>
**What it does**

Takes as input a list of epigenetic data sets (histones, chromatin accessibility, CpG methylation, TFs, etc.)
or any other whole-genome data sets (e.g., scores). Currently the supported data formats are BigWig and BAM.
All data sets are mapped to a common genomic coordinate system in a selected assembly (user-provided window size
or 200bp windows by default). The user can specify regions to be considered or removed from the analysis.
The input data may come from one cell type/condition/individual/time point (although this approach does not
fully utilize the advantages of IDEAS), or from multiple cell types/conditions/individuals/time points. The
same set of epigenetic features may not be present in all cell types, in which case IDEAS performs imputation
of the missing tracks if specified.  This tool produces a single dataset with the **IdeasPre** datatype for
use as input to the IDEAS tool.

-----

**Required options**

* **Set cell type and epigenetic factor names by** - cell type and epigenetic factor names can be set manually or by extracting them from the names of the selected input datasets.  The latter case requires all selected datasets to have names that contain a "-" character.

 * **BAM or BigWig files** - select one or more BAM or BigWig files from your history, making sure that the name of every selected input includes a "-" character (e.g., e001-h3k4me3.bigwig).
 * **Cell type, Epigenetic factor and Input** - manually select any number of inputs, setting the cell type and epigenetic factor name for each.  The combination of "cell type name" and "epigenetic factor name" must be unique for each input.  For example, if you have replicate data you may want to specify the cell name as "rep1", "rep2", etc and the factor name as "rep1", "rep2", etc.

  * **Cell type name** - cell type name if specifying manually.
  * **Epigenetic factor name** - epigenetic factor name if specifying manually.
  * **BAM or BigWig file** - BAM or BigWig file.
  * **Selected input file name pattern is** - select the file name pattern, either **epigenetic factor name-cell type name** or **cell type name-epigenetic factor name**.

* **Define chromosome window positions from a bed file** - select "No" to run whole genome segmentation or select "Yes" to segment genomes within the unit of the windows defined by the bed file.  This file can be in BED3, BED4 or BED5 format, but only the first three columns (chr posst posed, i.e., chromosome, start position and end position) will be used.

 * **Window size in base pairs** - Window size in base pairs if specifying manually.
 * **Restrict processing to specified chromosomes** - select "Yes" to restrict processing to specified chromosomes.

  * **Chromosomes** - enter one chromosome (e.g., chr1, chr2, chrX) per text field, adding a field for each additional chromosome to process.

 * **Select bed file for defining chromosome window positions** - select a bed file for specifying the chromosome window positions.

* **Output chromosomes in separate files** - select "Yes" to produce separate files for each chromosome, allowing you to run IDEAS on different chromosomes separately.
* **Calculate the signal in each window using** - use the bigWigAverageOverBed utility from the UCSC genome browser to calculate the signal (i.e., the number of reads per bp) in each window.
* **Select bed file(s) containing regions to exclude** - select one or more bed files that contain regions you'd like excluded from your datasets.
* **Standardize all datasets** - select "Yes" to standardize all datasets (e.g., reads / total_reads * 20 million) so that the signals from different cell types become comparable - your datasets can be read counts, logp-values or fold change.

    </help>
    <citations>
        <citation type="doi">10.1093/nar/gkw278</citation>
    </citations>
</tool>
author	greg
date	Wed, 31 Jan 2018 14:13:11 -0500
parents	237ee7319452
children	71345e154c66