Mercurial > repos > iuc > diffbind
diff diffbind.xml @ 0:18090d836604 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/diffbind commit f970dcbe9d0e4c3714b1db74c404ea34223cf8ed
| author | iuc |
|---|---|
| date | Tue, 20 Mar 2018 04:51:01 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/diffbind.xml Tue Mar 20 04:51:01 2018 -0400 @@ -0,0 +1,409 @@ +<tool id="diffbind" name="DiffBind" version="2.6.6.0"> + <description> differential binding analysis of ChIP-Seq peak data</description> + <requirements> + <requirement type="package" version="2.6.6">bioconductor-diffbind</requirement> + <requirement type="package" version="1.20.0">r-getopt</requirement> + </requirements> + <stdio> + <regex match="Execution halted" + source="both" + level="fatal" + description="Execution halted." /> + <regex match="Input-Error 01" + source="both" + level="fatal" + description="Error in your input parameters: Make sure you only apply factors to selected samples." /> + <regex match="Error in" + source="both" + level="fatal" + description="An undefined error occured, please check your intput carefully and contact your administrator." /> + </stdio> + <version_command><![CDATA[ +echo $(R --version | grep version | grep -v GNU)", DiffBind version" $(R --vanilla --slave -e "library(DiffBind); cat(sessionInfo()\$otherPkgs\$DiffBind\$Version)" 2> /dev/null | grep -v -i "WARNING: ") + ]]></version_command> + <command><![CDATA[ + ## seems that diffbind also needs file extensions to work properly + #set $counter = 1 + #for $sample in $samples: + ln -s $sample.bamreads #echo str($counter) + "_bamreads.bam"# && + ln -s ${sample.bamreads.metadata.bam_index} #echo str($counter) + "_bamreads.bai"# && + #if str( $sample.bamcontrol ) != 'None': + ln -s $sample.bamcontrol #echo str($counter) + "_bamcontrol.bam"# && + ln -s ${sample.bamcontrol.metadata.bam_index} #echo str($counter) + "_bamcontrol.bai"# && + #end if + #set $counter = $counter + 1 + #end for + + Rscript '$__tool_directory__/diffbind.R' + -i $infile + -o '$outfile' + -t $th + -f $out.format + -p '$plots' + + #if $out.binding_matrix: + -b + #end if + + #if $out.rdata: + -r + #end if +]]> + </command> + <configfiles> +<configfile name="infile"><![CDATA[ +#set $counter = 1 +#for $sample in $samples: +#if str( $sample.bamcontrol ) != 'None' and $counter == 1: +SampleID,Tissue,Factor,Condition,Replicate,bamReads,bamControl,Peaks +#elif $counter == 1: +SampleID,Tissue,Factor,Condition,Replicate,bamReads,Peaks +#end if +#if str( $sample.bamcontrol ) != 'None': +$sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,#echo str($counter) + '_bamcontrol.bam'#,$sample.peaks +#else: +$sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,$sample.peaks +#end if +#set $counter = $counter + 1 +#end for]]></configfile> + </configfiles> + <inputs> + <repeat name="samples" title="Samples" min="4"> + <param name="sample_id" type="text" value="Sample ID" label="Specify a sample id" help="e.g. BT474.1-" /> + <param name="tissue" type="text" value="Tissue" label="Specify the tissue" help="e.g. BT474" /> + <param name="factor" type="text" value="Factor Name" label="Specify a factor name" help="e.g. ER" /> + <param name="condition" type="text" value="Condition" label="Specify the condition" help="e.g. Resistent" /> + <param name="replicate" type="integer" value="1" label="Specify the replicate number" help="e.g. 1" /> + <param name="bamreads" type="data" format="bam" label="Read BAM file" help="Specify the Read BAM file, used for Peak calling."/> + <param name="bamcontrol" type="data" format="bam" optional="True" label="Control BAM file" help="If specifying a control BAM file for this sample, then all samples are required to specify one."/> + <param name="peaks" type="data" format="bed" label="Peak file" help="Result of your Peak calling experiment."/> + </repeat> + <param name="th" type="float" value="1" min="0" max="1" + label="FDR Threshold" + help="Significance threshold; all sites with FDR less than or equal to this value will be included in the report. A value of 1 will include all binding sites in the report. Default: 1"/> + + <!-- Output Options --> + <section name="out" expanded="false" title="Output Options"> + <param name="format" type="select" label="Output Format"> + <option value="bed">BED</option> + <option value="gff">GFF</option> + <option value="wig">WIG</option> + </param> + <param name="pdf" type="boolean" truevalue="True" falsevalue="" checked="False" label="Visualising the analysis results" help="output an additional PDF file" /> + <param name="binding_matrix" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output binding affinity matrix?" help="Output a table of the binding scores" /> + <param name="rdata" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output RData file?" help="Output all the data used by R to construct the plots and tables, can be loaded into R. Default: No"> + </param> + </section> + </inputs> + + <outputs> + <data name="outfile" format="bed" label="${tool.name} on ${on_string}: Differentially bound sites"> + <change_format> + <when input="format" value="wig" format="wig" /> + <when input="format" value="gff" format="gff" /> + </change_format> + </data> + <data name="plots" format="pdf" label="${tool.name} on ${on_string}: Plots"> + <filter>out['pdf']</filter> + </data> + <data name="binding_matrix" format="tabular" from_work_dir="bmatrix.tab" label="${tool.name} on ${on_string}: Binding matrix"> + <filter>out['binding_matrix']</filter> + </data> + <data name="rdata" format="rdata" from_work_dir="DiffBind_analysis.RData" label="${tool.name} on ${on_string}: RData file"> + <filter>out['rdata']</filter> + </data> + </outputs> + + <tests> + <test expect_num_outputs="4"> + <repeat name="samples"> + <param name="sample_id" value="BT4741" /> + <param name="tissue" value="BT474" /> + <param name="factor" value="ER" /> + <param name="condition" value="Resistant" /> + <param name="replicate" value="1" /> + <param name="bamreads" ftype="bam" value="BT474_ER_1.bam" /> + <param name="peaks" ftype="bed" value="BT474_ER_1.bed.gz" /> + </repeat> + <repeat name="samples"> + <param name="sample_id" value="BT4742" /> + <param name="tissue" value="BT474" /> + <param name="factor" value="ER" /> + <param name="condition" value="Resistant" /> + <param name="replicate" value="2" /> + <param name="bamreads" ftype="bam" value="BT474_ER_2.bam" /> + <param name="peaks" ftype="bed" value="BT474_ER_2.bed.gz" /> + </repeat> + <repeat name="samples"> + <param name="sample_id" value="MCF71" /> + <param name="tissue" value="MCF7" /> + <param name="factor" value="ER" /> + <param name="condition" value="Responsive" /> + <param name="replicate" value="1" /> + <param name="bamreads" ftype="bam" value="MCF7_ER_1.bam" /> + <param name="peaks" ftype="bed" value="MCF7_ER_1.bed.gz" /> + </repeat> + <repeat name="samples"> + <param name="sample_id" value="MCF72" /> + <param name="tissue" value="MCF7" /> + <param name="factor" value="ER" /> + <param name="condition" value="Responsive" /> + <param name="replicate" value="2" /> + <param name="bamreads" ftype="bam" value="MCF7_ER_2.bam" /> + <param name="peaks" ftype="bed" value="MCF7_ER_2.bed.gz" /> + </repeat> + <param name="pdf" value="True" /> + <param name="binding_matrix" value="True" /> + <param name="rdata" value="True" /> + <output name="outfile" value="out_diffbind.bed" /> + <output name="plots" value="out_plots.pdf" compare="sim_size" /> + <output name="binding_matrix" value="out_binding.matrix" /> + <output name="rdata" value="DiffBind_analysis.RData" compare="sim_size"/> + </test> + </tests> + <help><![CDATA[ + +.. class:: infomark + +**What it does** + +DiffBind_ is a `Bioconductor package`_ that provides functions for processing ChIP-Seq data enriched for genomic loci where specific +protein/DNA binding occurs, including peak sets identified by ChIP-Seq peak callers and +aligned sequence read datasets. It is designed to work with multiple peak sets simultaneously, +representing different ChIP experiments (antibodies, transcription factor and/or histone +marks, experimental conditions, replicates) as well as managing the results of multiple peak +callers. + +The primary emphasis of DiffBind is on identifying sites that are differentially bound +between two sample groups. It includes functions to support the processing of peak sets, +including overlapping and merging peak sets, counting sequencing reads overlapping intervals +in peak sets, and identifying statistically significantly differentially bound sites based on +evidence of binding affinity (measured by differences in read densities). To this end it uses +statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages +edgeR and DESeq2). Additionally, the package builds on Rgraphics routines to provide a +set of standardized plots to aid in binding analysis. + +The `DiffBind User Guide`_ includes a brief overview of the processing flow, followed by four sections of +examples: the first focusing on the core task of obtaining differentially bound sites based on +affinity data, the second working through the main plotting routines, the third discussing the +use of a blocking factor, and the fourth revisiting occupancy data (peak calls) in more detail, +as well as comparing the results of an occupancy-based analysis with an affinity-based one. +Finally, certain technical aspects of the how these analyses are accomplished are detailed. + +Note DiffBind requires a minimum of four samples (two groups with two replicates each). + +.. _DiffBind: https://bioconductor.org/packages/release/bioc/html/DiffBind.html +.. _`Bioconductor package`: https://bioconductor.org/packages/release/bioc/html/DiffBind.html +.. _`DiffBind User Guide`: https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf + +----- + +**Inputs** + +DiffBind works primarily with peaksets, which are sets of genomic intervals representing +candidate protein binding sites. Each interval consists of a chromosome, a start and end +position, and usually a score of some type indicating confidence in, or strength of, the peak. +Associated with each peakset are metadata relating to the experiment from which the peakset +was derived. Additionally, files containing mapped sequencing reads (generally .bam files) can +be associated with each peakset (one for the ChIP data, and optionally another representing +a control sample) + +**Sample Information** + +You have to specify your sample information in the tool form above, where Condition contains the groups you want to compare. + +Example: + + ============= ========== ========== ============= ============= + **SampleID** **Tissue** **Factor** **Condition** **Replicate** + ------------- ---------- ---------- ------------- ------------- + BT4741 BT474 ER Resistant 1 + BT4742 BT474 ER Resistant 2 + MCF71 MCF7 ER Responsive 1 + MCF72 MCF7 ER Responsive 2 + MCF73 MCF7 ER Responsive 3 + T47D1 T47D ER Responsive 1 + T47D2 T47D ER Responsive 2 + MCF7r1 MCF7 ER Resistant 1 + MCF7r2 MCF7 ER Resistant 2 + ZR751 ZR75 ER Responsive 1 + ZR752 ZR75 ER Responsive 2 + ============= ========== ========== ============= ============= + + +**Peak files** + +Result of your Peak calling experiment in bed format, one file for each sample is required. + +Example: + + ======= ======= ======= =============== ======= + 1 2 3 4 **5** + ======= ======= ======= =============== ======= + chr18 215562 216063 MACS_peak_16037 56.11 + chr18 311530 312105 MACS_peak_16038 222.49 + chr18 356656 357315 MACS_peak_16039 92.06 + chr18 371110 372092 MACS_peak_16040 123.86 + chr18 395116 396464 MACS_peak_16041 1545.39 + chr18 399014 400382 MACS_peak_16042 1835.19 + chr18 499134 500200 MACS_peak_16043 748.32 + chr18 503518 504552 MACS_peak_16044 818.30 + chr18 531672 532274 MACS_peak_16045 159.30 + chr18 568326 569282 MACS_peak_16046 601.11 + ======= ======= ======= =============== ======= + +* BAM file which contains the mapped sequencing reads can be associated with each peakset +* Control BAM file represents a control dataset and are optional, but have to specified for all when used. + +----- + +**Outputs** + +This tool outputs + + * differentially bound sites in BED, WIG or GFF format + +Optionally, under **Output Options** you can choose to output + + * a correlation heatmap plot + * a binding affinity matrix + * an RData file + +**Differentially Bound Sites** + +As output format you can choose BED, GFF, WIG. + +Example - BED format: + + ===== ====== ====== ===== ==== ==== ==== ==== ===== ======== ======== + 1 2 3 4 5 6 7 8 9 10 **11** + ===== ====== ====== ===== ==== ==== ==== ==== ===== ======== ======== + chr18 394600 396513 1914 * 7.15 7.89 5.55 2.35 7.06e-24 9.84e-21 + chr18 111567 112005 439 * 5.71 3.63 6.53 -2.89 1.27e-08 8.88e-06 + chr18 346464 347342 879 * 5 3.24 5.77 -2.52 6.51e-06 0.00303 + chr18 399014 400382 1369 * 7.62 8.05 7 1.04 1.04e-05 0.00364 + chr18 371110 372102 993 * 4.63 5.36 3.07 2.3 8.1e-05 0.0226 + ===== ====== ====== ===== ==== ==== ==== ==== ===== ======== ======== + + Columns contain the following data: + +* **1st**: Chromosome name +* **2nd**: Start position of site +* **3rd**: End position of site +* **4th**: Length of site +* **5th**: Strand +* **6th**: Mean read concentration over all the samples (the default calculation uses log2 normalized ChIP read counts with control read counts subtracted) +* **7th**: Mean concentration over the first (e.g. Resistant) group +* **8th**: Mean concentration over second (e.g. Responsive) group +* **9th**: Fold shows the difference in mean concentrations between the two groups (e.g. Resistant - Responsive), with a positive value indicating increased binding affinity in the first group and a negative value indicating increased binding affinity in the second group. +* **10th**: P-value confidence measure for identifying these sites as differentially bound +* **11th**: a multiple testing corrected FDR p-value + + +**Binding Affinity Matrix** + +The final result of counting is a binding affinity matrix containing a (normalized) read count for each sample at every potential binding site. With this matrix, the samples can be re-clustered using affinity, rather than occupancy, data. The binding affinity matrix can be used for QC plotting as well as for subsequent +differential analysis. + +Example: + + ====== ====== ====== ========== ========== ========= ====== ========= ==== + ID Tissue Factor Condition Treatment Replicate Caller Intervals FRiP + ====== ====== ====== ========== ========== ========= ====== ========= ==== + BT4741 BT474 ER Resistant Full-Media 1 counts 2845 0.16 + BT4742 BT474 ER Resistant Full-Media 2 counts 2845 0.15 + MCF71 MCF7 ER Responsive Full-Media 1 counts 2845 0.27 + MCF72 MCF7 ER Responsive Full-Media 2 counts 2845 0.17 + MCF73 MCF7 ER Responsive Full-Media 3 counts 2845 0.23 + T47D1 T47D ER Responsive Full-Media 1 counts 2845 0.10 + T47D2 T47D ER Responsive Full-Media 2 counts 2845 0.06 + MCF7r1 MCF7 ER Resistant Full-Media 1 counts 2845 0.20 + MCF7r2 MCF7 ER Resistant Full-Media 2 counts 2845 0.13 + ZR751 ZR75 ER Responsive Full-Media 1 counts 2845 0.32 + ZR752 ZR75 ER Responsive Full-Media 2 counts 2845 0.22 + ====== ====== ====== ========== ========== ========= ====== ========= ==== + +----- + +**More Information** + +Generally, processing data with DiffBind involves five phases: + + #. Reading in peaksets + #. Occupancy analysis + #. Counting reads + #. Differential binding affinity analysis + #. Plotting and reporting + + +**Reading in peaksets**: + +The first step is to read in a set of peaksets and associated +metadata. Peaksets are derived either from ChIP-Seq peak callers, such as **MACS2**, or using some other criterion (e.g. genomic windows, or all the promoter regions +in a genome). A single experiment can have more than +one associated peakset; e.g. if multiple peak callers are used for comparison purposes +each sample would have more than one line in the sample sheet. Once the peaksets +are read in, a merging function finds all overlapping peaks and derives a single set of +unique genomic intervals covering all the supplied peaks (a consensus peakset for the +experiment). + +**Occupancy analysis**: + +Peaksets, especially those generated by peak callers, provide +an insight into the potential occupancy of the protein being ChIPed for at specific +genomic loci. After the peaksets have been loaded, it can be useful to perform some +exploratory plotting to determine how these occupancy maps agree with each other, +e.g. between experimental replicates (re-doing the ChIP under the same conditions), +between different peak callers on the same experiment, and within groups of samples +representing a common experimental condition. DiffBind provides functions to enable +overlaps to be examined, as well as functions to determine how well similar samples +cluster together. Beyond quality control, the product of an occupancy analysis may be +a consensus peakset, representing an overall set of candidate binding sites to be used +in further analysis. + +**Counting reads**: + +Once a consensus peakset has been derived, DiffBind can use the +supplied sequence read files to count how many reads overlap each interval for each +unique sample. The peaks in the consensus peakset may be re-centered and trimmed +based on calculating their summits (point of greatest read overlap) in order to provide +more standardized peak intervals. The final result of counting is a binding affinity matrix +containing a (normalized) read count for each sample at every potential binding site. +With this matrix, the samples can be re-clustered using affinity, rather than occupancy, +data. The binding affinity matrix is used for QC plotting as well as for subsequent +differential analysis. + +**Differential binding affinity analysis**: + +The core functionality of DiffBind is the +differential binding affinity analysis, which enables binding sites to be identified that +are statistically significantly differentially bound between sample groups. To accomplish +this, first a contrast (or contrasts) is established, dividing the samples into groups to +be compared. Next the core analysis routines are executed, by default using DESeq2 . +This will assign a p-value and FDR to each candidate binding site indicating confidence +that they are differentially bound. + +**Plotting and reporting**: + +Once one or more contrasts have been run, DiffBind provides +a number of functions for reporting and plotting the results. MA plots give an +overview of the results of the analysis, while correlation heatmaps and PCA plots show +how the groups cluster based on differentially bound sites. Boxplots show the distribution +of reads within differentially bound sites corresponding to whether they gain or +lose affinity between the two sample groups. A reporting mechanism enables differentially +bound sites to be extracted for further processing, such as annotation, motif, and +pathway analyses. *Note that currently only the correlation plot is implemented in this Galaxy tool.* + +----- + +**References** + +DiffBind Authors: Rory Stark, Gordon Brown (2011) +Wrapper authors: Bjoern Gruening, Pavankumar Videm + +]]> + </help> + <citations> + <citation type="doi">doi:10.1038/nature10730</citation> + </citations> +</tool>
