diff diffbind.xml @ 0:18090d836604 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/diffbind commit f970dcbe9d0e4c3714b1db74c404ea34223cf8ed
author iuc
date Tue, 20 Mar 2018 04:51:01 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/diffbind.xml	Tue Mar 20 04:51:01 2018 -0400
@@ -0,0 +1,409 @@
+<tool id="diffbind" name="DiffBind" version="2.6.6.0">
+    <description> differential binding analysis of ChIP-Seq peak data</description>
+    <requirements> <!-- Pinned package dependencies: the DiffBind Bioconductor package and r-getopt (presumably used by diffbind.R for option parsing; that script is not shown here). -->
+        <requirement type="package" version="2.6.6">bioconductor-diffbind</requirement>
+        <requirement type="package" version="1.20.0">r-getopt</requirement>
+    </requirements>
+    <stdio> <!-- Each pattern is searched in both stdout and stderr; a match is treated as a fatal error and reported with the given description. -->
+        <regex match="Execution halted"
+           source="both"
+           level="fatal"
+           description="Execution halted." />
+        <regex match="Input-Error 01"
+           source="both"
+           level="fatal"
+           description="Error in your input parameters: Make sure you only apply factors to selected samples." />
+        <regex match="Error in"
+           source="both"
+           level="fatal"
+           description="An undefined error occurred, please check your input carefully and contact your administrator." />
+    </stdio>
+    <!-- Prints the version banner reported by R (lines mentioning GNU are dropped) followed by the installed DiffBind version; stderr and WARNING lines of the DiffBind query are discarded. --> <version_command><![CDATA[
+echo $(R --version | grep version | grep -v GNU)", DiffBind version" $(R --vanilla --slave -e "library(DiffBind); cat(sessionInfo()\$otherPkgs\$DiffBind\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
+    ]]></version_command>
+    <command><![CDATA[
+        ## DiffBind needs real file extensions, so every BAM and its index is linked into the working directory under a numbered name.
+        #set $counter = 1
+        #for $sample in $samples:
+            ln -s '$sample.bamreads' #echo str($counter) + "_bamreads.bam"# &&
+            ln -s '${sample.bamreads.metadata.bam_index}' #echo str($counter) + "_bamreads.bai"# &&
+            #if str( $sample.bamcontrol ) != 'None':
+                ln -s '$sample.bamcontrol' #echo str($counter) + "_bamcontrol.bam"# &&
+                ln -s '${sample.bamcontrol.metadata.bam_index}' #echo str($counter) + "_bamcontrol.bai"# &&
+            #end if
+            #set $counter = $counter + 1
+        #end for
+        ## The numbered link names above have to match the BAM names written to the sample sheet config file. Paths are single-quoted for the shell.
+        Rscript '$__tool_directory__/diffbind.R'
+            -i '$infile'
+            -o '$outfile'
+            -t $th
+            -f '$out.format'
+            -p '$plots'
+
+            #if $out.binding_matrix:
+                -b
+            #end if
+
+            #if $out.rdata:
+                -r
+            #end if
+]]>
+    </command>
+    <configfiles> <!-- Builds the CSV sample sheet passed to diffbind.R via -i: a header row chosen from the first sample (with a bamControl column only if that sample has a control BAM), then one row per sample. BAM columns hold the numbered link names (N_bamreads.bam, N_bamcontrol.bam), not the dataset paths. -->
+<configfile name="infile"><![CDATA[
+#set $counter = 1
+#for $sample in $samples:
+#if str( $sample.bamcontrol ) != 'None' and $counter == 1:
+SampleID,Tissue,Factor,Condition,Replicate,bamReads,bamControl,Peaks
+#elif $counter == 1:
+SampleID,Tissue,Factor,Condition,Replicate,bamReads,Peaks
+#end if
+#if str( $sample.bamcontrol ) != 'None':
+$sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,#echo str($counter) + '_bamcontrol.bam'#,$sample.peaks
+#else:
+$sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,$sample.peaks
+#end if
+#set $counter = $counter + 1
+#end for]]></configfile>
+    </configfiles>
+    <inputs> <!-- Each instance of the samples repeat becomes one row of the sample sheet; at least four samples are required. -->
+        <repeat name="samples" title="Samples" min="4">
+            <param name="sample_id" type="text" value="Sample ID" label="Specify a sample id" help="e.g. BT474.1-" />
+            <param name="tissue" type="text" value="Tissue" label="Specify the tissue" help="e.g. BT474" />
+            <param name="factor" type="text" value="Factor Name" label="Specify a factor name" help="e.g. ER" />
+            <param name="condition" type="text" value="Condition" label="Specify the condition" help="e.g. Resistant" />
+            <param name="replicate" type="integer" value="1" label="Specify the replicate number" help="e.g. 1" />
+            <param name="bamreads" type="data" format="bam" label="Read BAM file" help="Specify the Read BAM file, used for Peak calling."/>
+            <param name="bamcontrol" type="data" format="bam" optional="True" label="Control BAM file" help="If specifying a control BAM file for this sample, then all samples are required to specify one."/>
+            <param name="peaks" type="data" format="bed" label="Peak file" help="Result of your Peak calling experiment."/>
+        </repeat>
+        <param name="th" type="float" value="1" min="0" max="1"
+                label="FDR Threshold"
+                help="Significance threshold; all sites with FDR less than or equal to this value will be included in the report. A value of 1 will include all binding sites in the report. Default: 1"/>
+        <!-- The FDR threshold above is handed to diffbind.R as the -t option. -->
+        <!-- Output Options: result format plus switches for the optional PDF, binding matrix and RData outputs. -->
+        <section name="out" expanded="false" title="Output Options">
+            <param name="format" type="select" label="Output Format">
+                <option value="bed">BED</option>
+                <option value="gff">GFF</option>
+                <option value="wig">WIG</option>
+            </param>
+            <param name="pdf" type="boolean" truevalue="True" falsevalue="" checked="False" label="Visualising the analysis results" help="output an additional PDF file" />
+            <param name="binding_matrix" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output binding affinity matrix?" help="Output a table of the binding scores" />
+            <param name="rdata" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output RData file?" help="Output all the data used by R to construct the plots and tables, can be loaded into R. Default: No">
+            </param>
+        </section>
+    </inputs>
+
+    <outputs> <!-- outfile is always produced; the other three datasets are created only when the matching switch in the "out" section is enabled. -->
+        <data name="outfile" format="bed" label="${tool.name} on ${on_string}: Differentially bound sites">
+            <change_format> <!-- The format select lives inside the "out" section, so it is addressed by its full path; a bare "format" would not resolve and the datatype would stay bed. -->
+                <when input="out.format" value="wig" format="wig" />
+                <when input="out.format" value="gff" format="gff" />
+            </change_format>
+        </data>
+        <data name="plots" format="pdf" label="${tool.name} on ${on_string}: Plots">
+            <filter>out['pdf']</filter>
+        </data>
+        <data name="binding_matrix" format="tabular" from_work_dir="bmatrix.tab" label="${tool.name} on ${on_string}: Binding matrix">
+            <filter>out['binding_matrix']</filter>
+        </data>
+        <data name="rdata" format="rdata" from_work_dir="DiffBind_analysis.RData" label="${tool.name} on ${on_string}: RData file">
+            <filter>out['rdata']</filter>
+        </data>
+    </outputs>
+
+    <tests> <!-- One end-to-end test: four samples without control BAMs (two Resistant, two Responsive) with all three optional outputs switched on, so four outputs are expected; the PDF and RData results are compared by size only. -->
+        <test expect_num_outputs="4">
+            <repeat name="samples">
+                <param name="sample_id" value="BT4741" />
+                <param name="tissue" value="BT474" />
+                <param name="factor" value="ER" />
+                <param name="condition" value="Resistant" />
+                <param name="replicate" value="1" />
+                <param name="bamreads" ftype="bam" value="BT474_ER_1.bam" />
+                <param name="peaks" ftype="bed" value="BT474_ER_1.bed.gz" />
+            </repeat>
+            <repeat name="samples">
+                <param name="sample_id" value="BT4742" />
+                <param name="tissue" value="BT474" />
+                <param name="factor" value="ER" />
+                <param name="condition" value="Resistant" />
+                <param name="replicate" value="2" />
+                <param name="bamreads" ftype="bam" value="BT474_ER_2.bam" />
+                <param name="peaks" ftype="bed" value="BT474_ER_2.bed.gz" />
+            </repeat>
+            <repeat name="samples">
+                <param name="sample_id" value="MCF71" />
+                <param name="tissue" value="MCF7" />
+                <param name="factor" value="ER" />
+                <param name="condition" value="Responsive" />
+                <param name="replicate" value="1" />
+                <param name="bamreads" ftype="bam" value="MCF7_ER_1.bam" />
+                <param name="peaks" ftype="bed" value="MCF7_ER_1.bed.gz" />
+            </repeat>
+            <repeat name="samples">
+                <param name="sample_id" value="MCF72" />
+                <param name="tissue" value="MCF7" />
+                <param name="factor" value="ER" />
+                <param name="condition" value="Responsive"  />
+                <param name="replicate" value="2" />
+                <param name="bamreads" ftype="bam" value="MCF7_ER_2.bam" />
+                <param name="peaks" ftype="bed" value="MCF7_ER_2.bed.gz" />
+            </repeat>
+            <param name="pdf" value="True" />
+            <param name="binding_matrix" value="True" />
+            <param name="rdata" value="True" />
+            <output name="outfile" value="out_diffbind.bed" />
+            <output name="plots" value="out_plots.pdf" compare="sim_size" />
+            <output name="binding_matrix" value="out_binding.matrix" />
+            <output name="rdata" value="DiffBind_analysis.RData" compare="sim_size"/>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+.. class:: infomark
+
+**What it does**
+
+DiffBind_ is a `Bioconductor package`_ that provides functions for processing ChIP-Seq data enriched for genomic loci where specific
+protein/DNA binding occurs, including peak sets identified by ChIP-Seq peak callers and
+aligned sequence read datasets. It is designed to work with multiple peak sets simultaneously,
+representing different ChIP experiments (antibodies, transcription factor and/or histone
+marks, experimental conditions, replicates) as well as managing the results of multiple peak
+callers.
+
+The primary emphasis of DiffBind is on identifying sites that are differentially bound
+between two sample groups. It includes functions to support the processing of peak sets,
+including overlapping and merging peak sets, counting sequencing reads overlapping intervals
+in peak sets, and identifying statistically significantly differentially bound sites based on
+evidence of binding affinity (measured by differences in read densities). To this end it uses
+statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages
+edgeR and DESeq2). Additionally, the package builds on R graphics routines to provide a
+set of standardized plots to aid in binding analysis.
+
+The `DiffBind User Guide`_ includes a brief overview of the processing flow, followed by four sections of
+examples: the first focusing on the core task of obtaining differentially bound sites based on
+affinity data, the second working through the main plotting routines, the third discussing the
+use of a blocking factor, and the fourth revisiting occupancy data (peak calls) in more detail,
+as well as comparing the results of an occupancy-based analysis with an affinity-based one.
+Finally, certain technical aspects of how these analyses are accomplished are detailed.
+
+Note: DiffBind requires a minimum of four samples (two groups with two replicates each).
+
+.. _DiffBind: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
+.. _`Bioconductor package`: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
+.. _`DiffBind User Guide`: https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf
+
+-----
+
+**Inputs**
+
+DiffBind works primarily with peaksets, which are sets of genomic intervals representing
+candidate protein binding sites. Each interval consists of a chromosome, a start and end
+position, and usually a score of some type indicating confidence in, or strength of, the peak.
+Associated with each peakset are metadata relating to the experiment from which the peakset
+was derived. Additionally, files containing mapped sequencing reads (generally .bam files) can
+be associated with each peakset (one for the ChIP data, and optionally another representing
+a control sample).
+
+**Sample Information**
+
+You have to specify your sample information in the tool form above, where Condition contains the groups you want to compare.
+
+Example:
+
+    ============= ========== ========== ============= =============
+     **SampleID** **Tissue** **Factor** **Condition** **Replicate**
+    ------------- ---------- ---------- ------------- -------------
+    BT4741        BT474      ER         Resistant     1            
+    BT4742        BT474      ER         Resistant     2            
+    MCF71         MCF7       ER         Responsive    1            
+    MCF72         MCF7       ER         Responsive    2            
+    MCF73         MCF7       ER         Responsive    3            
+    T47D1         T47D       ER         Responsive    1            
+    T47D2         T47D       ER         Responsive    2            
+    MCF7r1        MCF7       ER         Resistant     1            
+    MCF7r2        MCF7       ER         Resistant     2            
+    ZR751         ZR75       ER         Responsive    1            
+    ZR752         ZR75       ER         Responsive    2            
+    ============= ========== ========== ============= =============
+
+
+**Peak files**
+
+Result of your peak calling experiment in BED format; one file is required for each sample.
+
+Example:
+
+    ======= ======= ======= =============== =======
+    1          2      3          4           **5**
+    ======= ======= ======= =============== =======
+    chr18   215562  216063  MACS_peak_16037 56.11
+    chr18   311530  312105  MACS_peak_16038 222.49
+    chr18   356656  357315  MACS_peak_16039 92.06
+    chr18   371110  372092  MACS_peak_16040 123.86
+    chr18   395116  396464  MACS_peak_16041 1545.39
+    chr18   399014  400382  MACS_peak_16042 1835.19
+    chr18   499134  500200  MACS_peak_16043 748.32
+    chr18   503518  504552  MACS_peak_16044 818.30
+    chr18   531672  532274  MACS_peak_16045 159.30
+    chr18   568326  569282  MACS_peak_16046 601.11
+    ======= ======= ======= =============== =======
+
+* A BAM file containing the mapped sequencing reads can be associated with each peakset.
+* A control BAM file represents a control dataset and is optional, but if used it has to be specified for all samples.
+
+-----
+
+**Outputs**
+
+This tool outputs
+
+    * differentially bound sites in BED, WIG or GFF format
+
+Optionally, under **Output Options** you can choose to output
+
+    * a correlation heatmap plot
+    * a binding affinity matrix
+    * an RData file
+
+**Differentially Bound Sites**
+
+As output format you can choose BED, GFF, WIG.
+
+Example - BED format:
+
+    =====  ======  ======  ===== ====  ====    ====    ====    =====   ========    ========
+    1      2       3       4     5     6       7       8       9       10          **11**
+    =====  ======  ======  ===== ====  ====    ====    ====    =====   ========    ========
+    chr18  394600  396513  1914    *   7.15    7.89    5.55    2.35    7.06e-24    9.84e-21
+    chr18  111567  112005  439     *   5.71    3.63    6.53    -2.89   1.27e-08    8.88e-06
+    chr18  346464  347342  879     *   5       3.24    5.77    -2.52   6.51e-06    0.00303
+    chr18  399014  400382  1369    *   7.62    8.05    7       1.04    1.04e-05    0.00364
+    chr18  371110  372102  993     *   4.63    5.36    3.07    2.3     8.1e-05     0.0226
+    =====  ======  ======  ===== ====  ====    ====    ====    =====   ========    ========
+
+    Columns contain the following data:
+
+* **1st**: Chromosome name
+* **2nd**: Start position of site
+* **3rd**: End position of site
+* **4th**: Length of site
+* **5th**: Strand
+* **6th**: Mean read concentration over all the samples (the default calculation uses log2 normalized ChIP read counts with control read counts subtracted)
+* **7th**: Mean concentration over the first (e.g. Resistant) group
+* **8th**: Mean concentration over second (e.g. Responsive) group
+* **9th**: Fold shows the difference in mean concentrations between the two groups (e.g. Resistant - Responsive), with a positive value indicating increased binding affinity in the first group and a negative value indicating increased binding affinity in the second group.
+* **10th**: P-value confidence measure for identifying these sites as differentially bound
+* **11th**: a multiple testing corrected FDR p-value
+
+
+**Binding Affinity Matrix**
+
+The final result of counting is a binding affinity matrix containing a (normalized) read count for each sample at every potential binding site. With this matrix, the samples can be re-clustered using affinity, rather than occupancy, data. The binding affinity matrix can be used for QC plotting as well as for subsequent
+differential analysis.
+
+Example:
+
+    ====== ====== ====== ========== ========== ========= ====== ========= ====
+    ID     Tissue Factor Condition  Treatment  Replicate Caller Intervals FRiP
+    ====== ====== ====== ========== ========== ========= ====== ========= ====
+    BT4741 BT474  ER     Resistant  Full-Media 1         counts 2845      0.16
+    BT4742 BT474  ER     Resistant  Full-Media 2         counts 2845      0.15
+    MCF71  MCF7   ER     Responsive Full-Media 1         counts 2845      0.27
+    MCF72  MCF7   ER     Responsive Full-Media 2         counts 2845      0.17
+    MCF73  MCF7   ER     Responsive Full-Media 3         counts 2845      0.23
+    T47D1  T47D   ER     Responsive Full-Media 1         counts 2845      0.10
+    T47D2  T47D   ER     Responsive Full-Media 2         counts 2845      0.06
+    MCF7r1 MCF7   ER     Resistant  Full-Media 1         counts 2845      0.20
+    MCF7r2 MCF7   ER     Resistant  Full-Media 2         counts 2845      0.13
+    ZR751  ZR75   ER     Responsive Full-Media 1         counts 2845      0.32
+    ZR752  ZR75   ER     Responsive Full-Media 2         counts 2845      0.22
+    ====== ====== ====== ========== ========== ========= ====== ========= ====
+
+-----
+
+**More Information**
+
+Generally, processing data with DiffBind involves five phases:
+
+ #. Reading in peaksets
+ #. Occupancy analysis
+ #. Counting reads
+ #. Differential binding affinity analysis
+ #. Plotting and reporting
+
+
+**Reading in peaksets**:
+
+The first step is to read in a set of peaksets and associated
+metadata. Peaksets are derived either from ChIP-Seq peak callers, such as **MACS2**, or using some other criterion (e.g. genomic windows, or all the promoter regions
+in a genome).  A single experiment can have more than
+one associated peakset; e.g. if multiple peak callers are used for comparison purposes
+each sample would have more than one line in the sample sheet. Once the peaksets
+are read in, a merging function finds all overlapping peaks and derives a single set of
+unique genomic intervals covering all the supplied peaks (a consensus peakset for the
+experiment).
+
+**Occupancy analysis**:
+
+Peaksets, especially those generated by peak callers, provide
+an insight into the potential occupancy of the protein being ChIPed for at specific
+genomic loci. After the peaksets have been loaded, it can be useful to perform some
+exploratory plotting to determine how these occupancy maps agree with each other,
+e.g. between experimental replicates (re-doing the ChIP under the same conditions),
+between different peak callers on the same experiment, and within groups of samples
+representing a common experimental condition. DiffBind provides functions to enable
+overlaps to be examined, as well as functions to determine how well similar samples
+cluster together. Beyond quality control, the product of an occupancy analysis may be
+a consensus peakset, representing an overall set of candidate binding sites to be used
+in further analysis.
+
+**Counting reads**:
+
+Once a consensus peakset has been derived, DiffBind can use the
+supplied sequence read files to count how many reads overlap each interval for each
+unique sample. The peaks in the consensus peakset may be re-centered and trimmed
+based on calculating their summits (point of greatest read overlap) in order to provide
+more standardized peak intervals. The final result of counting is a binding affinity matrix
+containing a (normalized) read count for each sample at every potential binding site.
+With this matrix, the samples can be re-clustered using affinity, rather than occupancy,
+data. The binding affinity matrix is used for QC plotting as well as for subsequent
+differential analysis.
+
+**Differential binding affinity analysis**:
+
+The core functionality of DiffBind is the
+differential binding affinity analysis, which enables binding sites to be identified that
+are statistically significantly differentially bound between sample groups. To accomplish
+this, first a contrast (or contrasts) is established, dividing the samples into groups to
+be compared. Next the core analysis routines are executed, by default using DESeq2.
+This will assign a p-value and FDR to each candidate binding site indicating confidence
+that they are differentially bound.
+
+**Plotting and reporting**:
+
+Once one or more contrasts have been run, DiffBind provides
+a number of functions for reporting and plotting the results. MA plots give an
+overview of the results of the analysis, while correlation heatmaps and PCA plots show
+how the groups cluster based on differentially bound sites. Boxplots show the distribution
+of reads within differentially bound sites corresponding to whether they gain or
+lose affinity between the two sample groups. A reporting mechanism enables differentially
+bound sites to be extracted for further processing, such as annotation, motif, and
+pathway analyses. *Note that currently only the correlation plot is implemented in this Galaxy tool.*
+
+-----
+
+**References**
+
+DiffBind Authors:  Rory Stark, Gordon Brown (2011)
+Wrapper authors: Bjoern Gruening, Pavankumar Videm
+
+]]>
+    </help>
+    <citations> <!-- Publication to cite when using this tool, referenced by its DOI. -->
+        <citation type="doi">doi:10.1038/nature10730</citation>
+    </citations>
+</tool>