annotate genetrack.xml @ 21:c868ac2145c4 draft default tip

Uploaded
author greg
date Wed, 16 Dec 2015 20:11:17 -0500
parents 2f0dede41f69
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
1 <?xml version="1.0"?>
11
497e3274f70b Uploaded
greg
parents: 6
diff changeset
2 <tool id="genetrack" name="GeneTrack" version="@WRAPPER_VERSION@.0">
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
3 <description>peak predictor</description>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
4 <macros>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
5 <import>genetrack_macros.xml</import>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
6 </macros>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
7 <expand macro="requirements" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
8 <command>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
9 python $__tool_directory__/genetrack.py
0368815ae4d5 Uploaded
greg
parents:
diff changeset
10 --input_format $input_format_cond.input_format
6
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
11 #if str($input_format_cond.input_format) == "scidx":
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
12 #for $i in $input_format_cond.input_scidx:
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
13 --input "${i}" "${i.hid}"
0368815ae4d5 Uploaded
greg
parents:
diff changeset
14 #end for
0368815ae4d5 Uploaded
greg
parents:
diff changeset
15 #elif str($input_format_cond.input_format) == "gff":
0368815ae4d5 Uploaded
greg
parents:
diff changeset
16 #for $i in $input_format_cond.input_gff:
0368815ae4d5 Uploaded
greg
parents:
diff changeset
17 --input "${i}" "${i.hid}"
0368815ae4d5 Uploaded
greg
parents:
diff changeset
18 #end for
0368815ae4d5 Uploaded
greg
parents:
diff changeset
19 #end if
0368815ae4d5 Uploaded
greg
parents:
diff changeset
20 --sigma $sigma
0368815ae4d5 Uploaded
greg
parents:
diff changeset
21 --exclusion $exclusion
0368815ae4d5 Uploaded
greg
parents:
diff changeset
22 --up_width $up_width
0368815ae4d5 Uploaded
greg
parents:
diff changeset
23 --down_width $down_width
0368815ae4d5 Uploaded
greg
parents:
diff changeset
24 --filter $filter
0368815ae4d5 Uploaded
greg
parents:
diff changeset
25 </command>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
26 <inputs>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
27 <conditional name="input_format_cond">
0368815ae4d5 Uploaded
greg
parents:
diff changeset
28 <param name="input_format" type="select" label="Format of files for conversion">
6
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
29 <option value="scidx" selected="True">ScIdx</option>
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
30 <option value="gff">Gff</option>
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
31 </param>
6
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
32 <when value="scidx">
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
33 <param name="input_scidx" type="data" format="scidx" multiple="True" label="Predict peaks on" />
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
34 </when>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
35 <when value="gff">
0368815ae4d5 Uploaded
greg
parents:
diff changeset
36 <param name="input_gff" type="data" format="gff" multiple="True" label="Predict peaks on" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
37 </when>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
38 </conditional>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
39 <param name="sigma" type="integer" value="5" min="1" label="Sigma to use when smoothing reads" help="Higher values increase computation but produce more smoothing." />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
40 <param name="exclusion" type="integer" value="20" min="1" label="Peak exclusion zone" help="Exclusion zone around each peak that prevents others from being called." />
12
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
41 <param name="up_width" type="integer" value="10" min="0" label="Exclusion zone of upstream called peaks" />
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
42 <param name="down_width" type="integer" value="10" min="0" label="Exclusion zone of downstream called peaks" />
15
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
43 <param name="filter" type="integer" value="1" min="0" label="Absolute read filter" help="Removes peaks with lower peak height." />
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
44 </inputs>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
45 <outputs>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
46 <collection name="genetrack_output" type="list" label="Genetrack results on ${on_string}">
0368815ae4d5 Uploaded
greg
parents:
diff changeset
47 <discover_datasets pattern="(?P&lt;designation&gt;.*)" directory="output" ext="gff" visible="false" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
48 </collection>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
49 </outputs>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
50 <tests>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
51 <test>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
52 <param name="input_gff" value="genetrack_input2.gff" ftype="gff" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
53 <param name="input_format" value="gff" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
54 <param name="sigma" value="5" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
55 <param name="exclusion" value="20" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
56 <param name="up_width" value="10" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
57 <param name="down_width" value="10" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
58 <param name="filter" value="3" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
59 <output_collection name="genetrack_output" type="list">
0368815ae4d5 Uploaded
greg
parents:
diff changeset
60 <element name="s5e20u10d10F3_on_data_1" file="genetrack_output2.gff" ftype="gff" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
61 </output_collection>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
62 </test>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
63 <test>
6
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
64 <param name="input_scidx" value="genetrack_input3.scidx" ftype="scidx" />
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
65 <param name="input_format" value="scidx" />
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
66 <param name="sigma" value="5" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
67 <param name="exclusion" value="20" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
68 <param name="up_width" value="10" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
69 <param name="down_width" value="10" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
70 <param name="filter" value="3" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
71 <output_collection name="genetrack_output" type="list">
0368815ae4d5 Uploaded
greg
parents:
diff changeset
72 <element name="s5e20u10d10F3_on_data_1" file="genetrack_output3.gff" ftype="gff" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
73 </output_collection>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
74 </test>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
75 <test>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
76 <param name="input_gff" value="genetrack_input_unsorted4.gff" ftype="gff" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
77 <param name="input_format" value="gff" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
78 <param name="sigma" value="5" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
79 <param name="exclusion" value="20" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
80 <param name="up_width" value="10" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
81 <param name="down_width" value="10" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
82 <param name="filter" value="3" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
83 <output_collection name="genetrack_output" type="list">
0368815ae4d5 Uploaded
greg
parents:
diff changeset
84 <element name="s5e20u10d10F3_on_data_1" file="genetrack_output4.gff" ftype="gff" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
85 </output_collection>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
86 </test>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
87 </tests>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
88 <help>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
89 **What it does**
0368815ae4d5 Uploaded
greg
parents:
diff changeset
90
15
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
91 GeneTrack separately identifies peaks on the forward "+" (W) and reverse "-" (C) strands. The way that GeneTrack
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
92 works is to replace each tag with a probabilistic distribution of occurrences for that tag at and around its mapped
12
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
93 genomic coordinate. The distance decay of the probabilistic distribution is set by adjusting the value of the
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
94 tool's **Sigma to use when smoothing reads** parameter. GeneTrack then sums the distribution over all mapped
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
95 tags. This results in a smooth continuous trace that can be globally broadened or tightened by adjusting the
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
96 sigma value. GeneTrack starts with the highest smoothed peak first, treating each strand separately if indicated
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
97 by the data, then sets up an exclusion zone (centered over the peak) defined by the value of the **Peak exclusion
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
98 zone** parameter (see figure). The exclusion zone prevents any secondary peaks from being called on the same strand
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
99 within that exclusion zone. In rare cases, it may be desirable to set different exclusion zones upstream (more 5’)
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
100 versus downstream (more 3’) of the peak.
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
101
15
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
102 .. image:: $PATH_TO_IMAGES/genetrack.png
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
103
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
104 GeneTrack continues through the data in order of peak height, until no other peaks are found, and in principle will
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
105 call a peak at a single isolated tag, if no filter is set using the tool's **Absolute read filter** parameter. A
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
106 filter value of 1 means that it will stop calling peaks when the tag count in the peak hits 1 (so single tag peaks
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
107 will be excluded in this case). GeneTrack outputs **chrom** (chromosome number), **strand** (+/W or -/C strand),
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
108 **start** (lower coordinate of exclusion zone), **end** (higher coordinate of exclusion zone), and **value** (peak
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
109 height). GeneTrack's GFF output reports the start (lower coordinate) and end (higher coordinate) of the exclusion
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
110 zone.
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
111
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
112 In principle, the width of the exclusion zone may be as large as the DNA region occupied by the native protein plus
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
113 a steric exclusion zone between the protein and the exonuclease. On the other hand the site might be considerably
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
114 smaller if the protein is in a denatured state during exonuclease digestion (since it is pre-treated with SDS).
12
cd105fdfb0da Uploaded
greg
parents: 11
diff changeset
115
15
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
116 In general, higher resolution data or smaller binding site size data should use smaller sigma values. Large binding
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
117 site size data such as 147 bp nucleosomal DNA use a larger sigma value like 20 (-s 20). For transcription factors
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
118 mapped by ChIP-exo, sigma may initially be set at 5, and the exclusion zone set at 20 (-s 5 -e 20). Sigma is typically
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
119 varied between ~3 and ~20. Too high of a sigma value may merge two independent nearby binding events. This may be
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
120 desirable if closely bound factors are not distinguishable. Too low of a sigma value will cause some tags that
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
121 contribute to a binding event to be excluded, because they may not be located sufficiently close to the main peak.
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
122 If alternative (mutually exclusive) binding is expected for two overlapping sites, and these sites are to be
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
123 independently recorded, then an empirically determined smaller exclusion zone width is set. Thus the value of sigma
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
124 is set empirically for each mapped factor, depending upon the resolution and binding site size of the binding event.
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
125
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
126 It might make sense to exclude peaks that have only a single tag, where -F 1 is used, or have their tags located on
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
127 only a single coordinate (called Singletons, where stddev=0 in the output file). However, low coverage datasets might
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
128 be improved by including them, if additional analysis (e.g., motif discovery) validates them. In addition, idealized
ebafcd6c3e0e Uploaded
greg
parents: 12
diff changeset
129 action of the exonuclease in ChIP-exo might place all tags for a peak on a single coordinate.
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
130
6
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
131 -----
fa85ca6c9cf8 Uploaded
greg
parents: 3
diff changeset
132
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
133 **Options**
0368815ae4d5 Uploaded
greg
parents:
diff changeset
134
16
b40ad4bee6cb Uploaded
greg
parents: 15
diff changeset
135 * **Sigma to use when smoothing reads** - Smooths clusters of tags via a Gaussian distribution.
b40ad4bee6cb Uploaded
greg
parents: 15
diff changeset
136 * **Peak exclusion zone** - Exclusion zone around each peak, eliminating all other peaks on the same strand that are within a ± bp distance of the peak.
b40ad4bee6cb Uploaded
greg
parents: 15
diff changeset
137 * **Exclusion zone of upstream called peaks** - Defines the exclusion zone centered over peaks upstream of a peak.
b40ad4bee6cb Uploaded
greg
parents: 15
diff changeset
138 * **Exclusion zone of downstream called peaks** - Defines the exclusion zone centered over peaks downstream of a peak.
b40ad4bee6cb Uploaded
greg
parents: 15
diff changeset
139 * **Absolute read filter** - Restricts output to only peaks with larger peak height.
17
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
140
20
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
141 -----
17
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
142
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
143 **Output gff Columns**
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
144
21
c868ac2145c4 Uploaded
greg
parents: 20
diff changeset
145 1. Chromosome
c868ac2145c4 Uploaded
greg
parents: 20
diff changeset
146 2. Script
c868ac2145c4 Uploaded
greg
parents: 20
diff changeset
147 3. Placeholder (no meaning)
c868ac2145c4 Uploaded
greg
parents: 20
diff changeset
148 4. Start of peak exclusion zone (-e 20)
c868ac2145c4 Uploaded
greg
parents: 20
diff changeset
149 5. End of peak exclusion zone
c868ac2145c4 Uploaded
greg
parents: 20
diff changeset
150 6. Tag sum (not peak height or area under curve, which LionDB provides)
c868ac2145c4 Uploaded
greg
parents: 20
diff changeset
151 7. Strand
c868ac2145c4 Uploaded
greg
parents: 20
diff changeset
152 8. Placeholder (no meaning)
c868ac2145c4 Uploaded
greg
parents: 20
diff changeset
153 9. Attributes (standard deviation of reads located within exclusion zone) = fuzziness of peak
17
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
154
20
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
155 -----
17
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
156
20
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
157 **Considerations**
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
158
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
159 In principle, the width of the exclusion zone may be as large as the DNA region occupied by the native protein
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
160 plus a steric exclusion zone between the protein and the exonuclease. On the other hand the site might be considerably
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
161 smaller if the protein is in a denatured state during exonuclease digestion (since it is pre-treated with SDS).
17
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
162
20
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
163 In general, higher resolution data or smaller binding site size data should use smaller sigma values. Large binding site
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
164 size data such as 147 bp nucleosomal DNA use a larger sigma value like 20 (-s 20). For transcription factors mapped by
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
165 ChIP-exo, sigma may initially be set at 5, and the exclusion zone set at 20 (-s 5 -e 20). Sigma is typically varied
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
166 between ~3 and ~20. Too high of a sigma value may merge two independent nearby binding events. This may be desirable if
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
167 closely bound factors are not distinguishable. Too low of a sigma value will cause some tags that contribute to a binding
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
168 event to be excluded, because they may not be located sufficiently close to the main peak. If alternative (mutually
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
169 exclusive) binding is expected for two overlapping sites, and these sites are to be independently recorded, then an
2f0dede41f69 Uploaded
greg
parents: 19
diff changeset
170 empirically determined smaller exclusion zone width is set. Thus, the value of sigma is set empirically for each mapped factor, depending upon the resolution and binding site size of the binding event.
17
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
171
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
172 It might make sense to exclude peaks that have only a single tag, where -F 1 is used, or have their tags located on only
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
173 a single coordinate (called Singletons, where stddev=0 in the output file). However, low coverage datasets might be
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
174 improved by including them, if additional analysis (e.g., motif discovery) validates them. In addition, idealized action
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
175 of the exonuclease in ChIP-exo might place all tags for a peak on a single coordinate.
5a6ea187933b Uploaded
greg
parents: 16
diff changeset
176
0
0368815ae4d5 Uploaded
greg
parents:
diff changeset
177 </help>
0368815ae4d5 Uploaded
greg
parents:
diff changeset
178 <expand macro="citations" />
0368815ae4d5 Uploaded
greg
parents:
diff changeset
179 </tool>