diffbind: diffbind.xml comparison

comparison diffbind.xml @ 0:18090d836604 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/diffbind commit f970dcbe9d0e4c3714b1db74c404ea34223cf8ed

author	iuc
date	Tue, 20 Mar 2018 04:51:01 -0400
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:18090d836604
+<tool id="diffbind" name="DiffBind" version="2.6.6.0">
+<description> differential binding analysis of ChIP-Seq peak data</description>
+<!-- Packages this wrapper depends on: bioconductor-diffbind 2.6.6 (the tool
+     version 2.6.6.0 declared above tracks it) and r-getopt 1.20.0
+     (presumably used by diffbind.R for option parsing; confirm against the script). -->
+<requirements>
+<requirement type="package" version="2.6.6">bioconductor-diffbind</requirement>
+<requirement type="package" version="1.20.0">r-getopt</requirement>
+</requirements>
+<!-- Failure detection: each pattern below is searched for in both stdout and
+     stderr, and a match is reported at level "fatal" with the given description. -->
+<stdio>
+<regex match="Execution halted"
+source="both"
+level="fatal"
+description="Execution halted." />
+<regex match="Input-Error 01"
+source="both"
+level="fatal"
+description="Error in your input parameters: Make sure you only apply factors to selected samples." />
+<regex match="Error in"
+source="both"
+level="fatal"
+description="An undefined error occurred, please check your input carefully and contact your administrator." />
+</stdio>
+<!-- Prints one line: the "version" line of R's version banner (the line
+     mentioning GNU is dropped), then ", DiffBind version", then the DiffBind
+     package version read from sessionInfo(). stderr of the R call is discarded
+     and any output line containing "WARNING: " (case-insensitive) is removed. -->
+<version_command><![CDATA[
+echo $(R --version | grep version | grep -v GNU)", DiffBind version" $(R --vanilla --slave -e "library(DiffBind); cat(sessionInfo()\$otherPkgs\$DiffBind\$Version)" 2> /dev/null | grep -v -i "WARNING: ")
+]]></version_command>
+<command><![CDATA[
+## DiffBind seems to need file extensions to work properly, so every read BAM
+## and its index are symlinked into the working directory as N_bamreads.bam and
+## N_bamreads.bai (plus N_bamcontrol.bam / N_bamcontrol.bai when a control is
+## given), where N is the 1-based position of the sample in the repeat.
+## Dataset and index paths are single-quoted so that unusual characters in a
+## path cannot break the shell command.
+#set $counter = 1
+#for $sample in $samples:
+ln -s '${sample.bamreads}' #echo str($counter) + "_bamreads.bam"# &&
+ln -s '${sample.bamreads.metadata.bam_index}' #echo str($counter) + "_bamreads.bai"# &&
+#if str( $sample.bamcontrol ) != 'None':
+ln -s '${sample.bamcontrol}' #echo str($counter) + "_bamcontrol.bam"# &&
+ln -s '${sample.bamcontrol.metadata.bam_index}' #echo str($counter) + "_bamcontrol.bai"# &&
+#end if
+#set $counter = $counter + 1
+#end for
+## -i sample sheet (the "infile" configfile), -o result file, -t FDR threshold,
+## -f output format, -p plot file; -b and -r are flags that are only passed when
+## the binding matrix / RData outputs were requested.
+Rscript '$__tool_directory__/diffbind.R'
+-i '$infile'
+-o '$outfile'
+-t $th
+-f $out.format
+-p '$plots'
+#if $out.binding_matrix:
+-b
+#end if
+#if $out.rdata:
+-r
+#end if
+]]>
+</command>
+<!-- "infile" is the CSV sample sheet handed to diffbind.R through the -i option.
+     It holds one header row followed by one row per sample. The bamReads and
+     bamControl columns contain the N_bamreads.bam / N_bamcontrol.bam link names
+     created in the command section (N = 1-based sample position). The header
+     gets a bamControl column only when the first sample has a control BAM, and
+     each row gets a bamControl value only when that sample has one, so controls
+     must be given for all samples or for none to keep the columns aligned. -->
+<configfiles>
+<configfile name="infile"><![CDATA[
+#set $counter = 1
+#for $sample in $samples:
+#if str( $sample.bamcontrol ) != 'None' and $counter == 1:
+SampleID,Tissue,Factor,Condition,Replicate,bamReads,bamControl,Peaks
+#elif $counter == 1:
+SampleID,Tissue,Factor,Condition,Replicate,bamReads,Peaks
+#end if
+#if str( $sample.bamcontrol ) != 'None':
+$sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,#echo str($counter) + '_bamcontrol.bam'#,$sample.peaks
+#else:
+$sample.sample_id,$sample.tissue,$sample.factor,$sample.condition,$sample.replicate,#echo str($counter) + '_bamreads.bam'#,$sample.peaks
+#end if
+#set $counter = $counter + 1
+#end for]]></configfile>
+</configfiles>
+<inputs>
+<!-- One repeat entry per sample; at least four entries are required. The text
+     and integer fields are written as columns of the sample sheet. -->
+<repeat name="samples" title="Samples" min="4">
+<param name="sample_id" type="text" value="Sample ID" label="Specify a sample id" help="e.g. BT474.1-" />
+<param name="tissue" type="text" value="Tissue" label="Specify the tissue" help="e.g. BT474" />
+<param name="factor" type="text" value="Factor Name" label="Specify a factor name" help="e.g. ER" />
+<param name="condition" type="text" value="Condition" label="Specify the condition" help="e.g. Resistant" />
+<param name="replicate" type="integer" value="1" label="Specify the replicate number" help="e.g. 1" />
+<param name="bamreads" type="data" format="bam" label="Read BAM file" help="Specify the Read BAM file, used for Peak calling."/>
+<param name="bamcontrol" type="data" format="bam" optional="True" label="Control BAM file" help="If specifying a control BAM file for this sample, then all samples are required to specify one."/>
+<param name="peaks" type="data" format="bed" label="Peak file" help="Result of your Peak calling experiment."/>
+</repeat>
+<param name="th" type="float" value="1" min="0" max="1"
+label="FDR Threshold"
+help="Significance threshold; all sites with FDR less than or equal to this value will be included in the report. A value of 1 will include all binding sites in the report. Default: 1"/>
+<!-- Output Options: result format plus switches for the three optional outputs -->
+<section name="out" expanded="false" title="Output Options">
+<param name="format" type="select" label="Output Format">
+<option value="bed">BED</option>
+<option value="gff">GFF</option>
+<option value="wig">WIG</option>
+</param>
+<param name="pdf" type="boolean" truevalue="True" falsevalue="" checked="False" label="Visualising the analysis results" help="output an additional PDF file" />
+<param name="binding_matrix" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output binding affinity matrix?" help="Output a table of the binding scores" />
+<param name="rdata" type="boolean" truevalue="True" falsevalue="" checked="False" label="Output RData file?" help="Output all the data used by R to construct the plots and tables, can be loaded into R. Default: No" />
+</section>
+</inputs>
+<outputs>
+<!-- Main result. The datatype defaults to bed and follows the selected output
+     format. The select lives inside the "out" section, so it has to be
+     addressed as out.format for the format switch to take effect. -->
+<data name="outfile" format="bed" label="${tool.name} on ${on_string}: Differentially bound sites">
+<change_format>
+<when input="out.format" value="wig" format="wig" />
+<when input="out.format" value="gff" format="gff" />
+</change_format>
+</data>
+<!-- Optional outputs, each created only when its switch in the "out" section is set. -->
+<data name="plots" format="pdf" label="${tool.name} on ${on_string}: Plots">
+<filter>out['pdf']</filter>
+</data>
+<data name="binding_matrix" format="tabular" from_work_dir="bmatrix.tab" label="${tool.name} on ${on_string}: Binding matrix">
+<filter>out['binding_matrix']</filter>
+</data>
+<data name="rdata" format="rdata" from_work_dir="DiffBind_analysis.RData" label="${tool.name} on ${on_string}: RData file">
+<filter>out['rdata']</filter>
+</data>
+</outputs>
+<!-- Single functional test using the minimum of four samples: two Resistant
+     BT474 replicates and two Responsive MCF7 replicates, all with factor ER and
+     without control BAMs. All three optional outputs are switched on, hence four
+     expected outputs. The PDF and RData results are compared by size only
+     (compare="sim_size"); the BED result and binding matrix use the default comparison. -->
+<tests>
+<test expect_num_outputs="4">
+<repeat name="samples">
+<param name="sample_id" value="BT4741" />
+<param name="tissue" value="BT474" />
+<param name="factor" value="ER" />
+<param name="condition" value="Resistant" />
+<param name="replicate" value="1" />
+<param name="bamreads" ftype="bam" value="BT474_ER_1.bam" />
+<param name="peaks" ftype="bed" value="BT474_ER_1.bed.gz" />
+</repeat>
+<repeat name="samples">
+<param name="sample_id" value="BT4742" />
+<param name="tissue" value="BT474" />
+<param name="factor" value="ER" />
+<param name="condition" value="Resistant" />
+<param name="replicate" value="2" />
+<param name="bamreads" ftype="bam" value="BT474_ER_2.bam" />
+<param name="peaks" ftype="bed" value="BT474_ER_2.bed.gz" />
+</repeat>
+<repeat name="samples">
+<param name="sample_id" value="MCF71" />
+<param name="tissue" value="MCF7" />
+<param name="factor" value="ER" />
+<param name="condition" value="Responsive" />
+<param name="replicate" value="1" />
+<param name="bamreads" ftype="bam" value="MCF7_ER_1.bam" />
+<param name="peaks" ftype="bed" value="MCF7_ER_1.bed.gz" />
+</repeat>
+<repeat name="samples">
+<param name="sample_id" value="MCF72" />
+<param name="tissue" value="MCF7" />
+<param name="factor" value="ER" />
+<param name="condition" value="Responsive"  />
+<param name="replicate" value="2" />
+<param name="bamreads" ftype="bam" value="MCF7_ER_2.bam" />
+<param name="peaks" ftype="bed" value="MCF7_ER_2.bed.gz" />
+</repeat>
+<param name="pdf" value="True" />
+<param name="binding_matrix" value="True" />
+<param name="rdata" value="True" />
+<output name="outfile" value="out_diffbind.bed" />
+<output name="plots" value="out_plots.pdf" compare="sim_size" />
+<output name="binding_matrix" value="out_binding.matrix" />
+<output name="rdata" value="DiffBind_analysis.RData" compare="sim_size"/>
+</test>
+</tests>
+<help><![CDATA[
+.. class:: infomark
+**What it does**
+DiffBind_ is a `Bioconductor package`_ that provides functions for processing ChIP-Seq data enriched for genomic loci where specific
+protein/DNA binding occurs, including peak sets identified by ChIP-Seq peak callers and
+aligned sequence read datasets. It is designed to work with multiple peak sets simultaneously,
+representing different ChIP experiments (antibodies, transcription factor and/or histone
+marks, experimental conditions, replicates) as well as managing the results of multiple peak
+callers.
+The primary emphasis of DiffBind is on identifying sites that are differentially bound
+between two sample groups. It includes functions to support the processing of peak sets,
+including overlapping and merging peak sets, counting sequencing reads overlapping intervals
+in peak sets, and identifying statistically significantly differentially bound sites based on
+evidence of binding affinity (measured by differences in read densities). To this end it uses
+statistical routines developed in an RNA-Seq context (primarily the Bioconductor packages
+edgeR and DESeq2). Additionally, the package builds on R graphics routines to provide a
+set of standardized plots to aid in binding analysis.
+The `DiffBind User Guide`_ includes a brief overview of the processing flow, followed by four sections of
+examples: the first focusing on the core task of obtaining differentially bound sites based on
+affinity data, the second working through the main plotting routines, the third discussing the
+use of a blocking factor, and the fourth revisiting occupancy data (peak calls) in more detail,
+as well as comparing the results of an occupancy-based analysis with an affinity-based one.
+Finally, certain technical aspects of how these analyses are accomplished are detailed.
+Note: DiffBind requires a minimum of four samples (two groups with two replicates each).
+.. _DiffBind: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
+.. _`Bioconductor package`: https://bioconductor.org/packages/release/bioc/html/DiffBind.html
+.. _`DiffBind User Guide`: https://bioconductor.org/packages/release/bioc/vignettes/DiffBind/inst/doc/DiffBind.pdf
+-----
+**Inputs**
+DiffBind works primarily with peaksets, which are sets of genomic intervals representing
+candidate protein binding sites. Each interval consists of a chromosome, a start and end
+position, and usually a score of some type indicating confidence in, or strength of, the peak.
+Associated with each peakset are metadata relating to the experiment from which the peakset
+was derived. Additionally, files containing mapped sequencing reads (generally .bam files) can
+be associated with each peakset (one for the ChIP data, and optionally another representing
+a control sample).
+**Sample Information**
+You have to specify your sample information in the tool form above, where Condition contains the groups you want to compare.
+Example:
+============= ========== ========== ============= =============
+**SampleID** **Tissue** **Factor** **Condition** **Replicate**
+------------- ---------- ---------- ------------- -------------
+BT4741        BT474      ER         Resistant     1
+BT4742        BT474      ER         Resistant     2
+MCF71         MCF7       ER         Responsive    1
+MCF72         MCF7       ER         Responsive    2
+MCF73         MCF7       ER         Responsive    3
+T47D1         T47D       ER         Responsive    1
+T47D2         T47D       ER         Responsive    2
+MCF7r1        MCF7       ER         Resistant     1
+MCF7r2        MCF7       ER         Resistant     2
+ZR751         ZR75       ER         Responsive    1
+ZR752         ZR75       ER         Responsive    2
+============= ========== ========== ============= =============
+**Peak files**
+Result of your Peak calling experiment in bed format, one file for each sample is required.
+Example:
+======= ======= ======= =============== =======
+1          2      3          4           **5**
+======= ======= ======= =============== =======
+chr18   215562  216063  MACS_peak_16037 56.11
+chr18   311530  312105  MACS_peak_16038 222.49
+chr18   356656  357315  MACS_peak_16039 92.06
+chr18   371110  372092  MACS_peak_16040 123.86
+chr18   395116  396464  MACS_peak_16041 1545.39
+chr18   399014  400382  MACS_peak_16042 1835.19
+chr18   499134  500200  MACS_peak_16043 748.32
+chr18   503518  504552  MACS_peak_16044 818.30
+chr18   531672  532274  MACS_peak_16045 159.30
+chr18   568326  569282  MACS_peak_16046 601.11
+======= ======= ======= =============== =======
+* A BAM file containing the mapped sequencing reads can be associated with each peakset.
+* Control BAM files, each representing a control dataset, are optional, but have to be specified for all samples when used.
+-----
+**Outputs**
+This tool outputs
+* differentially bound sites in BED, WIG or GFF format
+Optionally, under **Output Options** you can choose to output
+* a correlation heatmap plot
+* a binding affinity matrix
+* an RData file
+**Differentially Bound Sites**
+As output format you can choose BED, GFF, WIG.
+Example - BED format:
+=====  ======  ======  ===== ====  ====    ====    ====    =====   ========    ========
+1      2       3       4     5     6       7       8       9       10          **11**
+=====  ======  ======  ===== ====  ====    ====    ====    =====   ========    ========
+chr18  394600  396513  1914    *   7.15    7.89    5.55    2.35    7.06e-24    9.84e-21
+chr18  111567  112005  439     *   5.71    3.63    6.53    -2.89   1.27e-08    8.88e-06
+chr18  346464  347342  879     *   5       3.24    5.77    -2.52   6.51e-06    0.00303
+chr18  399014  400382  1369    *   7.62    8.05    7       1.04    1.04e-05    0.00364
+chr18  371110  372102  993     *   4.63    5.36    3.07    2.3     8.1e-05     0.0226
+=====  ======  ======  ===== ====  ====    ====    ====    =====   ========    ========
+Columns contain the following data:
+* **1st**: Chromosome name
+* **2nd**: Start position of site
+* **3rd**: End position of site
+* **4th**: Length of site
+* **5th**: Strand
+* **6th**: Mean read concentration over all the samples (the default calculation uses log2 normalized ChIP read counts with control read counts subtracted)
+* **7th**: Mean concentration over the first (e.g. Resistant) group
+* **8th**: Mean concentration over second (e.g. Responsive) group
+* **9th**: Fold shows the difference in mean concentrations between the two groups (e.g. Resistant - Responsive), with a positive value indicating increased binding affinity in the first group and a negative value indicating increased binding affinity in the second group.
+* **10th**: P-value confidence measure for identifying these sites as differentially bound
+* **11th**: a multiple testing corrected FDR p-value
+**Binding Affinity Matrix**
+The final result of counting is a binding affinity matrix containing a (normalized) read count for each sample at every potential binding site. With this matrix, the samples can be re-clustered using affinity, rather than occupancy, data. The binding affinity matrix can be used for QC plotting as well as for subsequent
+differential analysis.
+Example:
+====== ====== ====== ========== ========== ========= ====== ========= ====
+ID     Tissue Factor Condition  Treatment  Replicate Caller Intervals FRiP
+====== ====== ====== ========== ========== ========= ====== ========= ====
+BT4741 BT474  ER     Resistant  Full-Media 1         counts 2845      0.16
+BT4742 BT474  ER     Resistant  Full-Media 2         counts 2845      0.15
+MCF71  MCF7   ER     Responsive Full-Media 1         counts 2845      0.27
+MCF72  MCF7   ER     Responsive Full-Media 2         counts 2845      0.17
+MCF73  MCF7   ER     Responsive Full-Media 3         counts 2845      0.23
+T47D1  T47D   ER     Responsive Full-Media 1         counts 2845      0.10
+T47D2  T47D   ER     Responsive Full-Media 2         counts 2845      0.06
+MCF7r1 MCF7   ER     Resistant  Full-Media 1         counts 2845      0.20
+MCF7r2 MCF7   ER     Resistant  Full-Media 2         counts 2845      0.13
+ZR751  ZR75   ER     Responsive Full-Media 1         counts 2845      0.32
+ZR752  ZR75   ER     Responsive Full-Media 2         counts 2845      0.22
+====== ====== ====== ========== ========== ========= ====== ========= ====
+-----
+**More Information**
+Generally, processing data with DiffBind involves five phases:
+#. Reading in peaksets
+#. Occupancy analysis
+#. Counting reads
+#. Differential binding affinity analysis
+#. Plotting and reporting
+**Reading in peaksets**:
+The first step is to read in a set of peaksets and associated
+metadata. Peaksets are derived either from ChIP-Seq peak callers, such as **MACS2**, or using some other criterion (e.g. genomic windows, or all the promoter regions
+in a genome).  A single experiment can have more than
+one associated peakset; e.g. if multiple peak callers are used for comparison purposes
+each sample would have more than one line in the sample sheet. Once the peaksets
+are read in, a merging function finds all overlapping peaks and derives a single set of
+unique genomic intervals covering all the supplied peaks (a consensus peakset for the
+experiment).
+**Occupancy analysis**:
+Peaksets, especially those generated by peak callers, provide
+an insight into the potential occupancy of the protein being ChIPed for at specific
+genomic loci. After the peaksets have been loaded, it can be useful to perform some
+exploratory plotting to determine how these occupancy maps agree with each other,
+e.g. between experimental replicates (re-doing the ChIP under the same conditions),
+between different peak callers on the same experiment, and within groups of samples
+representing a common experimental condition. DiffBind provides functions to enable
+overlaps to be examined, as well as functions to determine how well similar samples
+cluster together. Beyond quality control, the product of an occupancy analysis may be
+a consensus peakset, representing an overall set of candidate binding sites to be used
+in further analysis.
+**Counting reads**:
+Once a consensus peakset has been derived, DiffBind can use the
+supplied sequence read files to count how many reads overlap each interval for each
+unique sample. The peaks in the consensus peakset may be re-centered and trimmed
+based on calculating their summits (point of greatest read overlap) in order to provide
+more standardized peak intervals. The final result of counting is a binding affinity matrix
+containing a (normalized) read count for each sample at every potential binding site.
+With this matrix, the samples can be re-clustered using affinity, rather than occupancy,
+data. The binding affinity matrix is used for QC plotting as well as for subsequent
+differential analysis.
+**Differential binding affinity analysis**:
+The core functionality of DiffBind is the
+differential binding affinity analysis, which enables binding sites to be identified that
+are statistically significantly differentially bound between sample groups. To accomplish
+this, first a contrast (or contrasts) is established, dividing the samples into groups to
+be compared. Next the core analysis routines are executed, by default using DESeq2.
+This will assign a p-value and FDR to each candidate binding site indicating confidence
+that they are differentially bound.
+**Plotting and reporting**:
+Once one or more contrasts have been run, DiffBind provides
+a number of functions for reporting and plotting the results. MA plots give an
+overview of the results of the analysis, while correlation heatmaps and PCA plots show
+how the groups cluster based on differentially bound sites. Boxplots show the distribution
+of reads within differentially bound sites corresponding to whether they gain or
+lose affinity between the two sample groups. A reporting mechanism enables differentially
+bound sites to be extracted for further processing, such as annotation, motif, and
+pathway analyses. *Note that currently only the correlation plot is implemented in this Galaxy tool.*
+-----
+**References**
+DiffBind Authors:  Rory Stark, Gordon Brown (2011)
+Wrapper authors: Bjoern Gruening, Pavankumar Videm
+]]>
+</help>
+<!-- Publication cited for DiffBind; a citation of type "doi" holds the bare DOI
+     without a "doi:" prefix. -->
+<citations>
+<citation type="doi">10.1038/nature10730</citation>
+</citations>
+</tool>

Mercurial > repos > iuc > diffbind

comparison diffbind.xml @ 0:18090d836604 draft default tip