comparison diffbind.xml @ 0:18090d836604 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/diffbind commit f970dcbe9d0e4c3714b1db74c404ea34223cf8ed
author iuc
date Tue, 20 Mar 2018 04:51:01 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:18090d836604
1 <tool id="diffbind" name="DiffBind" version="2.6.6.0">
2 <description> differential binding analysis of ChIP-Seq peak data</description>
3 <requirements>
4 <requirement type="package" version="2.6.6">bioconductor-diffbind</requirement>
5 <requirement type="package" version="1.20.0">r-getopt</requirement>
6 </requirements>
7 <stdio> <!-- each regex below is checked against both stdout and stderr; a match is reported as a fatal error -->
8 <regex match="Execution halted"
9 source="both"
10 level="fatal"
11 description="Execution halted." />
12 <regex match="Input-Error 01"
13 source="both"
14 level="fatal"
15 description="Error in your input parameters: Make sure you only apply factors to selected samples." />
16 <regex match="Error in"
17 source="both"
18 level="fatal"
19 description="An undefined error occurred, please check your input carefully and contact your administrator." />
20 </stdio>
21 <version_command><![CDATA[
22 echo $(R --version | grep version | grep -v GNU)", DiffBind version" $(R --vanilla --slave -e "library(DiffBind); cat(sessionInfo()\$otherPkgs\$DiffBind\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
23 ]]></version_command>
24 <command><![CDATA[
25 ## DiffBind seems to need file extensions to work properly, so every BAM and its index is symlinked to a counter-prefixed name (e.g. 1_bamreads.bam / 1_bamreads.bai); dataset paths are single-quoted for the shell
26 #set $counter = 1
27 #for $sample in $samples:
28 ln -s '$sample.bamreads' #echo str($counter) + "_bamreads.bam"# &&
29 ln -s '${sample.bamreads.metadata.bam_index}' #echo str($counter) + "_bamreads.bai"# &&
30 #if str( $sample.bamcontrol ) != 'None':
31 ln -s '$sample.bamcontrol' #echo str($counter) + "_bamcontrol.bam"# &&
32 ln -s '${sample.bamcontrol.metadata.bam_index}' #echo str($counter) + "_bamcontrol.bai"# &&
33 #end if
34 #set $counter = $counter + 1
35 #end for
36
37 Rscript '$__tool_directory__/diffbind.R'
38 -i '$infile'
39 -o '$outfile'
40 -t $th
41 -f '$out.format'
42 -p '$plots'
43
44 #if $out.binding_matrix:
45 -b
46 #end if
47
48 #if $out.rdata:
49 -r
50 #end if
51 ]]>
52 </command>
53 <configfiles>
54 <configfile name="infile"><![CDATA[
55 #set $counter = 1
56 #for $sample in $samples:
57 #if str( $sample.bamcontrol ) != 'None' and $counter == 1:
58 SampleID,Tissue,Factor,Condition,Replicate,bamReads,bamControl,Peaks
59 #elif $counter == 1:
60 SampleID,Tissue,Factor,Condition,Replicate,bamReads,Peaks
61 #end if
62 #if str( $sample.bamcontrol ) != 'None':
63 $sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,#echo str($counter) + '_bamcontrol.bam'#,$sample.peaks
64 #else:
65 $sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,$sample.peaks
66 #end if
67 #set $counter = $counter + 1
68 #end for]]></configfile>
69 </configfiles>
70 <inputs>
71 <repeat name="samples" title="Samples" min="4"> <!-- one entry per sample; at least four entries are required -->
72 <param name="sample_id" type="text" value="Sample ID" label="Specify a sample id" help="e.g. BT474.1-" />
73 <param name="tissue" type="text" value="Tissue" label="Specify the tissue" help="e.g. BT474" />
74 <param name="factor" type="text" value="Factor Name" label="Specify a factor name" help="e.g. ER" />
75 <param name="condition" type="text" value="Condition" label="Specify the condition" help="e.g. Resistant" />
76 <param name="replicate" type="integer" value="1" label="Specify the replicate number" help="e.g. 1" />
77 <param name="bamreads" type="data" format="bam" label="Read BAM file" help="Specify the Read BAM file, used for Peak calling."/>
78 <param name="bamcontrol" type="data" format="bam" optional="True" label="Control BAM file" help="If specifying a control BAM file for this sample, then all samples are required to specify one."/>
79 <param name="peaks" type="data" format="bed" label="Peak file" help="Result of your Peak calling experiment."/>
80 </repeat>
81 <param name="th" type="float" value="1" min="0" max="1"
82 label="FDR Threshold"
83 help="Significance threshold; all sites with FDR less than or equal to this value will be included in the report. A value of 1 will include all binding sites in the report. Default: 1"/>
84
85 <!-- Output Options -->
86 <section name="out" expanded="false" title="Output Options">
87 <param name="format" type="select" label="Output Format">
88 <option value="bed">BED</option>
89 <option value="gff">GFF</option>
90 <option value="wig">WIG</option>
91 </param>
92 <param name="pdf" type="boolean" truevalue="True" falsevalue="" checked="False" label="Visualising the analysis results" help="output an additional PDF file" />
93 <param name="binding_matrix" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output binding affinity matrix?" help="Output a table of the binding scores" />
94 <param name="rdata" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output RData file?" help="Output all the data used by R to construct the plots and tables, can be loaded into R. Default: No">
95 </param>
96 </section>
97 </inputs>
98
99 <outputs>
100 <data name="outfile" format="bed" label="${tool.name} on ${on_string}: Differentially bound sites">
101 <change_format> <!-- the "format" select is nested in the "out" section, so it has to be addressed as out.format -->
102 <when input="out.format" value="wig" format="wig" />
103 <when input="out.format" value="gff" format="gff" />
104 </change_format>
105 </data>
106 <data name="plots" format="pdf" label="${tool.name} on ${on_string}: Plots">
107 <filter>out['pdf']</filter>
108 </data>
109 <data name="binding_matrix" format="tabular" from_work_dir="bmatrix.tab" label="${tool.name} on ${on_string}: Binding matrix">
110 <filter>out['binding_matrix']</filter>
111 </data>
112 <data name="rdata" format="rdata" from_work_dir="DiffBind_analysis.RData" label="${tool.name} on ${on_string}: RData file">
113 <filter>out['rdata']</filter>
114 </data>
115 </outputs>
116
117 <tests>
118 <test expect_num_outputs="4">
119 <repeat name="samples">
120 <param name="sample_id" value="BT4741" />
121 <param name="tissue" value="BT474" />
122 <param name="factor" value="ER" />
123 <param name="condition" value="Resistant" />
124 <param name="replicate" value="1" />
125 <param name="bamreads" ftype="bam" value="BT474_ER_1.bam" />
126 <param name="peaks" ftype="bed" value="BT474_ER_1.bed.gz" />
127 </repeat>
128 <repeat name="samples">
129 <param name="sample_id" value="BT4742" />
130 <param name="tissue" value="BT474" />
131 <param name="factor" value="ER" />
132 <param name="condition" value="Resistant" />
133 <param name="replicate" value="2" />
134 <param name="bamreads" ftype="bam" value="BT474_ER_2.bam" />
135 <param name="peaks" ftype="bed" value="BT474_ER_2.bed.gz" />
136 </repeat>
137 <repeat name="samples">
138 <param name="sample_id" value="MCF71" />
139 <param name="tissue" value="MCF7" />
140 <param name="factor" value="ER" />
141 <param name="condition" value="Responsive" />
142 <param name="replicate" value="1" />
143 <param name="bamreads" ftype="bam" value="MCF7_ER_1.bam" />
144 <param name="peaks" ftype="bed" value="MCF7_ER_1.bed.gz" />
145 </repeat>
146 <repeat name="samples">
147 <param name="sample_id" value="MCF72" />
148 <param name="tissue" value="MCF7" />
149 <param name="factor" value="ER" />
150 <param name="condition" value="Responsive" />
151 <param name="replicate" value="2" />
152 <param name="bamreads" ftype="bam" value="MCF7_ER_2.bam" />
153 <param name="peaks" ftype="bed" value="MCF7_ER_2.bed.gz" />
154 </repeat>
155 <param name="pdf" value="True" />
156 <param name="binding_matrix" value="True" />
157 <param name="rdata" value="True" />
158 <output name="outfile" value="out_diffbind.bed" />
159 <output name="plots" value="out_plots.pdf" compare="sim_size" />
160 <output name="binding_matrix" value="out_binding.matrix" />
161 <output name="rdata" value="DiffBind_analysis.RData" compare="sim_size"/>
162 </test>
163 </tests>
164 <help><![CDATA[
165
166 .. class:: infomark
167
168 **What it does**
169
170 DiffBind_ is a `Bioconductor package`_ that provides functions for processing ChIP-Seq data enriched for genomic loci where specific
171 protein/DNA binding occurs, including peak sets identified by ChIP-Seq peak callers and
172 aligned sequence read datasets. It is designed to work with multiple peak sets simultaneously,
173 representing different ChIP experiments (antibodies, transcription factor and/or histone
174 marks, experimental conditions, replicates) as well as managing the results of multiple peak
175 callers.
176
177 The primary emphasis of DiffBind is on identifying sites that are differentially bound
178 between two sample groups. It includes functions to support the processing of peak sets,
179 including overlapping and merging peak sets, counting sequencing reads overlapping intervals
180 in peak sets, and identifying statistically significantly differentially bound sites based on
181 evidence of binding affinity (measured by differences in read densities). To this end it uses
182 statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages
183 edgeR and DESeq2). Additionally, the package builds on R graphics routines to provide a
184 set of standardized plots to aid in binding analysis.
185
186 The `DiffBind User Guide`_ includes a brief overview of the processing flow, followed by four sections of
187 examples: the first focusing on the core task of obtaining differentially bound sites based on
188 affinity data, the second working through the main plotting routines, the third discussing the
189 use of a blocking factor, and the fourth revisiting occupancy data (peak calls) in more detail,
190 as well as comparing the results of an occupancy-based analysis with an affinity-based one.
191 Finally, certain technical aspects of how these analyses are accomplished are detailed.
192
193 Note DiffBind requires a minimum of four samples (two groups with two replicates each).
194
195 .. _DiffBind: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
196 .. _`Bioconductor package`: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
197 .. _`DiffBind User Guide`: https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf
198
199 -----
200
201 **Inputs**
202
203 DiffBind works primarily with peaksets, which are sets of genomic intervals representing
204 candidate protein binding sites. Each interval consists of a chromosome, a start and end
205 position, and usually a score of some type indicating confidence in, or strength of, the peak.
206 Associated with each peakset are metadata relating to the experiment from which the peakset
207 was derived. Additionally, files containing mapped sequencing reads (generally .bam files) can
208 be associated with each peakset (one for the ChIP data, and optionally another representing
209 a control sample).
210
211 **Sample Information**
212
213 You have to specify your sample information in the tool form above, where Condition contains the groups you want to compare.
214
215 Example:
216
217 ============= ========== ========== ============= =============
218 **SampleID** **Tissue** **Factor** **Condition** **Replicate**
219 ------------- ---------- ---------- ------------- -------------
220 BT4741 BT474 ER Resistant 1
221 BT4742 BT474 ER Resistant 2
222 MCF71 MCF7 ER Responsive 1
223 MCF72 MCF7 ER Responsive 2
224 MCF73 MCF7 ER Responsive 3
225 T47D1 T47D ER Responsive 1
226 T47D2 T47D ER Responsive 2
227 MCF7r1 MCF7 ER Resistant 1
228 MCF7r2 MCF7 ER Resistant 2
229 ZR751 ZR75 ER Responsive 1
230 ZR752 ZR75 ER Responsive 2
231 ============= ========== ========== ============= =============
232
233
234 **Peak files**
235
236 Result of your Peak calling experiment in bed format, one file for each sample is required.
237
238 Example:
239
240 ======= ======= ======= =============== =======
241 1 2 3 4 **5**
242 ======= ======= ======= =============== =======
243 chr18 215562 216063 MACS_peak_16037 56.11
244 chr18 311530 312105 MACS_peak_16038 222.49
245 chr18 356656 357315 MACS_peak_16039 92.06
246 chr18 371110 372092 MACS_peak_16040 123.86
247 chr18 395116 396464 MACS_peak_16041 1545.39
248 chr18 399014 400382 MACS_peak_16042 1835.19
249 chr18 499134 500200 MACS_peak_16043 748.32
250 chr18 503518 504552 MACS_peak_16044 818.30
251 chr18 531672 532274 MACS_peak_16045 159.30
252 chr18 568326 569282 MACS_peak_16046 601.11
253 ======= ======= ======= =============== =======
254
255 * BAM file which contains the mapped sequencing reads can be associated with each peakset
256 * Control BAM files represent control datasets and are optional, but have to be specified for all samples when used.
257
258 -----
259
260 **Outputs**
261
262 This tool outputs
263
264 * differentially bound sites in BED, WIG or GFF format
265
266 Optionally, under **Output Options** you can choose to output
267
268 * a correlation heatmap plot
269 * a binding affinity matrix
270 * an RData file
271
272 **Differentially Bound Sites**
273
274 As output format you can choose BED, GFF, WIG.
275
276 Example - BED format:
277
278 ===== ====== ====== ===== ==== ==== ==== ==== ===== ======== ========
279 1 2 3 4 5 6 7 8 9 10 **11**
280 ===== ====== ====== ===== ==== ==== ==== ==== ===== ======== ========
281 chr18 394600 396513 1914 * 7.15 7.89 5.55 2.35 7.06e-24 9.84e-21
282 chr18 111567 112005 439 * 5.71 3.63 6.53 -2.89 1.27e-08 8.88e-06
283 chr18 346464 347342 879 * 5 3.24 5.77 -2.52 6.51e-06 0.00303
284 chr18 399014 400382 1369 * 7.62 8.05 7 1.04 1.04e-05 0.00364
285 chr18 371110 372102 993 * 4.63 5.36 3.07 2.3 8.1e-05 0.0226
286 ===== ====== ====== ===== ==== ==== ==== ==== ===== ======== ========
287
288 Columns contain the following data:
289
290 * **1st**: Chromosome name
291 * **2nd**: Start position of site
292 * **3rd**: End position of site
293 * **4th**: Length of site
294 * **5th**: Strand
295 * **6th**: Mean read concentration over all the samples (the default calculation uses log2 normalized ChIP read counts with control read counts subtracted)
296 * **7th**: Mean concentration over the first (e.g. Resistant) group
297 * **8th**: Mean concentration over second (e.g. Responsive) group
298 * **9th**: Fold shows the difference in mean concentrations between the two groups (e.g. Resistant - Responsive), with a positive value indicating increased binding affinity in the first group and a negative value indicating increased binding affinity in the second group.
299 * **10th**: P-value confidence measure for identifying these sites as differentially bound
300 * **11th**: a multiple testing corrected FDR p-value
301
302
303 **Binding Affinity Matrix**
304
305 The final result of counting is a binding affinity matrix containing a (normalized) read count for each sample at every potential binding site. With this matrix, the samples can be re-clustered using affinity, rather than occupancy, data. The binding affinity matrix can be used for QC plotting as well as for subsequent
306 differential analysis.
307
308 Example:
309
310 ====== ====== ====== ========== ========== ========= ====== ========= ====
311 ID Tissue Factor Condition Treatment Replicate Caller Intervals FRiP
312 ====== ====== ====== ========== ========== ========= ====== ========= ====
313 BT4741 BT474 ER Resistant Full-Media 1 counts 2845 0.16
314 BT4742 BT474 ER Resistant Full-Media 2 counts 2845 0.15
315 MCF71 MCF7 ER Responsive Full-Media 1 counts 2845 0.27
316 MCF72 MCF7 ER Responsive Full-Media 2 counts 2845 0.17
317 MCF73 MCF7 ER Responsive Full-Media 3 counts 2845 0.23
318 T47D1 T47D ER Responsive Full-Media 1 counts 2845 0.10
319 T47D2 T47D ER Responsive Full-Media 2 counts 2845 0.06
320 MCF7r1 MCF7 ER Resistant Full-Media 1 counts 2845 0.20
321 MCF7r2 MCF7 ER Resistant Full-Media 2 counts 2845 0.13
322 ZR751 ZR75 ER Responsive Full-Media 1 counts 2845 0.32
323 ZR752 ZR75 ER Responsive Full-Media 2 counts 2845 0.22
324 ====== ====== ====== ========== ========== ========= ====== ========= ====
325
326 -----
327
328 **More Information**
329
330 Generally, processing data with DiffBind involves five phases:
331
332 #. Reading in peaksets
333 #. Occupancy analysis
334 #. Counting reads
335 #. Differential binding affinity analysis
336 #. Plotting and reporting
337
338
339 **Reading in peaksets**:
340
341 The first step is to read in a set of peaksets and associated
342 metadata. Peaksets are derived either from ChIP-Seq peak callers, such as **MACS2**, or using some other criterion (e.g. genomic windows, or all the promoter regions
343 in a genome). A single experiment can have more than
344 one associated peakset; e.g. if multiple peak callers are used for comparison purposes
345 each sample would have more than one line in the sample sheet. Once the peaksets
346 are read in, a merging function finds all overlapping peaks and derives a single set of
347 unique genomic intervals covering all the supplied peaks (a consensus peakset for the
348 experiment).
349
350 **Occupancy analysis**:
351
352 Peaksets, especially those generated by peak callers, provide
353 an insight into the potential occupancy of the protein being ChIPed for at specific
354 genomic loci. After the peaksets have been loaded, it can be useful to perform some
355 exploratory plotting to determine how these occupancy maps agree with each other,
356 e.g. between experimental replicates (re-doing the ChIP under the same conditions),
357 between different peak callers on the same experiment, and within groups of samples
358 representing a common experimental condition. DiffBind provides functions to enable
359 overlaps to be examined, as well as functions to determine how well similar samples
360 cluster together. Beyond quality control, the product of an occupancy analysis may be
361 a consensus peakset, representing an overall set of candidate binding sites to be used
362 in further analysis.
363
364 **Counting reads**:
365
366 Once a consensus peakset has been derived, DiffBind can use the
367 supplied sequence read files to count how many reads overlap each interval for each
368 unique sample. The peaks in the consensus peakset may be re-centered and trimmed
369 based on calculating their summits (point of greatest read overlap) in order to provide
370 more standardized peak intervals. The final result of counting is a binding affinity matrix
371 containing a (normalized) read count for each sample at every potential binding site.
372 With this matrix, the samples can be re-clustered using affinity, rather than occupancy,
373 data. The binding affinity matrix is used for QC plotting as well as for subsequent
374 differential analysis.
375
376 **Differential binding affinity analysis**:
377
378 The core functionality of DiffBind is the
379 differential binding affinity analysis, which enables binding sites to be identified that
380 are statistically significantly differentially bound between sample groups. To accomplish
381 this, first a contrast (or contrasts) is established, dividing the samples into groups to
382 be compared. Next the core analysis routines are executed, by default using DESeq2.
383 This will assign a p-value and FDR to each candidate binding site indicating confidence
384 that they are differentially bound.
385
386 **Plotting and reporting**:
387
388 Once one or more contrasts have been run, DiffBind provides
389 a number of functions for reporting and plotting the results. MA plots give an
390 overview of the results of the analysis, while correlation heatmaps and PCA plots show
391 how the groups cluster based on differentially bound sites. Boxplots show the distribution
392 of reads within differentially bound sites corresponding to whether they gain or
393 lose affinity between the two sample groups. A reporting mechanism enables differentially
394 bound sites to be extracted for further processing, such as annotation, motif, and
395 pathway analyses. *Note that currently only the correlation plot is implemented in this Galaxy tool.*
396
397 -----
398
399 **References**
400
401 DiffBind Authors: Rory Stark, Gordon Brown (2011)
402 Wrapper authors: Bjoern Gruening, Pavankumar Videm
403
404 ]]>
405 </help>
406 <citations>
407 <citation type="doi">doi:10.1038/nature10730</citation>
408 </citations>
409 </tool>